Module eyecite.models
Expand source code
import re
from collections import UserString
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import (
Any,
Callable,
Dict,
Hashable,
List,
Optional,
Sequence,
Tuple,
Union,
cast,
)
from eyecite.utils import hash_sha256
# Alias for the base type of resources that citations can be resolved to.
# It is simply `typing.Hashable`; the `Resource` class in this module
# subclasses it.
ResourceType = Hashable
@dataclass(eq=True, frozen=True)
class Reporter:
    """A top-level reporter entry from `reporters_db`, such as "S.W.".

    `is_scotus` is switched to True in `__post_init__` when the reporter
    looks like a U.S. Supreme Court reporter; otherwise the value passed to
    the constructor is kept.
    """

    short_name: str
    name: str
    cite_type: str
    source: str  # one of "reporters", "laws", "journals"
    is_scotus: bool = False

    def __post_init__(self):
        """Derive `is_scotus` from `cite_type` and `name`."""
        federal_supreme = (
            self.cite_type == "federal" and "supreme" in self.name.lower()
        )
        if federal_supreme or "scotus" in self.cite_type.lower():
            # The dataclass is frozen, so plain assignment would raise;
            # go through object.__setattr__ instead.
            object.__setattr__(self, "is_scotus", True)
@dataclass(eq=True, frozen=True)
class Edition:
    """A single edition of a reporter in `reporters_db`,
    like "S.W." and "S.W.2d".

    `start` and `end` bound the edition's publication range; either may be
    None, in which case that side is treated as unbounded by
    `includes_year`.
    """

    reporter: Reporter
    short_name: str
    start: Optional[datetime]
    end: Optional[datetime]

    def includes_year(self, year: int) -> bool:
        """Return True if edition contains cases for the given year.

        A year later than the current calendar year is never included.
        """
        if year > datetime.now().year:
            return False
        if self.start is not None and self.start.year > year:
            return False
        return self.end is None or self.end.year >= year
@dataclass(eq=False, unsafe_hash=False)
class CitationBase:
    """Base class for objects returned by `eyecite.find.get_citations`. We
    define several subclasses of this class below, representing the various
    types of citations that might exist.

    Equality is derived from `__hash__()`: two citations are equal when their
    hashes are equal, so subclasses customize equivalence by overriding
    `__hash__()` only.
    """

    token: "Token"  # token this citation came from
    index: int  # index of _token in the token list
    # span() overrides
    span_start: Optional[int] = None
    span_end: Optional[int] = None
    full_span_start: Optional[int] = None
    full_span_end: Optional[int] = None
    groups: dict = field(default_factory=dict)
    metadata: Any = None

    def __post_init__(self):
        """Set up groups and metadata."""
        # Allow groups to be used in comparisons. This is the token's own
        # dict rather than a copy, so the page normalization below is also
        # visible through the token.
        self.groups = self.token.groups
        # Make metadata a self.Metadata object. Only a dict is used to
        # populate it; any other value (including None) yields an empty one.
        self.metadata = (
            self.Metadata(**self.metadata)
            if isinstance(self.metadata, dict)
            else self.Metadata()
        )
        # Set known missing page numbers to None
        if re.search("^_+$", self.groups.get("page", "") or ""):
            self.groups["page"] = None

    def __repr__(self):
        """Simplified repr() to be more readable than full dataclass repr().
        Just shows 'FullCaseCitation("matched text", groups=...)'."""
        return (
            f"{self.__class__.__name__}("
            + f"{repr(self.matched_text())}"
            + (f", groups={repr(self.groups)}" if self.groups else "")
            + f", metadata={repr(self.metadata)}"
            + ")"
        )

    def __hash__(self) -> int:
        """In general, citations are considered equivalent if they have the
        same group values (i.e., the same regex group content that is extracted
        from the matched text). Subclasses may override this method in order to
        specify equivalence behavior that is more appropriate for certain
        kinds of citations (e.g., see CaseCitation override).

        self.groups typically contains different keys for different objects:

        FullLawCitation (non-exhaustive and non-guaranteed):
        - chapter
        - reporter
        - law_section
        - issue
        - page
        - docket_number
        - pamphlet
        - title

        FullJournalCitation (non-exhaustive and non-guaranteed):
        - volume
        - reporter
        - page

        FullCaseCitation (see CaseCitation.__hash__() notes)
        """
        return hash(
            hash_sha256(
                {**dict(self.groups.items()), **{"class": type(self).__name__}}
            )
        )

    def __eq__(self, other):
        """This method is inherited by all subclasses and should not be
        overridden. It implements object equality in exactly the same way as
        defined in an object's __hash__() function, which should be overridden
        instead if desired.

        Objects that are not citations are never equal to a citation:
        NotImplemented is returned for them, so Python falls back to its
        default comparison instead of calling `other.__hash__()` (which is
        None on unhashable objects such as lists and dicts).
        """
        if not isinstance(other, CitationBase):
            return NotImplemented
        return self.__hash__() == other.__hash__()

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata:
        """Define fields on self.metadata."""

        parenthetical: Optional[str] = None

    def corrected_citation(self):
        """Return citation with any variations normalized."""
        return self.matched_text()

    def corrected_citation_full(self):
        """Return citation with any variations normalized, including extracted
        metadata if any."""
        return self.matched_text()

    def dump(self) -> dict:
        """Return citation data for printing by dump_citations.

        Metadata fields whose value is None are omitted.
        """
        return {
            "groups": self.groups,
            "metadata": {
                k: v
                for k, v in self.metadata.__dict__.items()
                if v is not None
            },
        }

    def matched_text(self):
        """Text that identified this citation, such as '1 U.S. 1' or 'Id.'"""
        return str(self.token)

    def span(self):
        """Start and stop offsets in source text for matched_text().

        `span_start` / `span_end` take precedence when set; otherwise the
        token's own offsets are used.
        """
        return (
            (
                self.span_start
                if self.span_start is not None
                else self.token.start
            ),
            self.span_end if self.span_end is not None else self.token.end,
        )

    def full_span(self) -> Tuple[int, int]:
        """Span indices that fully cover the citation

        Start and stop offsets in source text for full citation text (including
        plaintiff, defendant, post citation, ...)

        Relevant for FullCaseCitation, FullJournalCitation and FullLawCitation.

        :returns: Tuple of start and end indices
        """
        start = self.full_span_start
        if start is None:
            start = self.span()[0]

        end = self.full_span_end
        if end is None:
            end = self.span()[1]

        return start, end
@dataclass(eq=False, unsafe_hash=False, repr=False)
class ResourceCitation(CitationBase):
    """Common base for case, law, and journal citations, in either their
    short or long form."""

    # Editions that might match this reporter string
    exact_editions: Sequence[Edition] = field(default_factory=tuple)
    variation_editions: Sequence[Edition] = field(default_factory=tuple)
    all_editions: Sequence[Edition] = field(default_factory=tuple)
    edition_guess: Optional[Edition] = None

    # year extracted from metadata["year"] and converted to int,
    # if in a valid range
    year: Optional[int] = None

    def __post_init__(self):
        """Freeze the edition sequences into tuples so they stay hashable.

        `all_editions` is always rebuilt as exact editions followed by
        variation editions, regardless of what was passed in.
        """
        exact = tuple(self.exact_editions)
        variations = tuple(self.variation_editions)
        self.exact_editions = exact
        self.variation_editions = variations
        self.all_editions = exact + variations
        super().__post_init__()

    def __hash__(self) -> int:
        """Hash like the parent class (groups plus class name), additionally
        folding in `all_editions`, sorted by edition short name."""
        editions = [asdict(edition) for edition in self.all_editions]
        editions.sort(key=lambda d: d["short_name"])  # type: ignore
        payload = dict(self.groups.items())
        payload["all_editions"] = editions
        payload["class"] = type(self).__name__
        return hash(hash_sha256(payload))

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Define fields on self.metadata."""

        pin_cite: Optional[str] = None
        year: Optional[str] = None

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation.

        At this level `words` is not inspected; only the edition guess is
        made.
        """
        self.guess_edition()

    def dump(self) -> dict:
        """Return citation data for printing by dump_citations, with the
        citation's `year` added to the parent's output."""
        return dict(super().dump(), year=self.year)

    def corrected_reporter(self):
        """Return the short name of `edition_guess` when one is set, else the
        reporter string captured in the groups."""
        guess = self.edition_guess
        if guess:
            return guess.short_name
        return self.groups["reporter"]

    def corrected_citation(self):
        """Return the matched text with the captured reporter string replaced
        by the guessed edition's short name, when a guess exists."""
        text = self.matched_text()
        if not self.edition_guess:
            return text
        return text.replace(
            self.groups["reporter"], self.edition_guess.short_name
        )

    def guess_edition(self):
        """Set edition_guess when exactly one candidate edition remains."""
        # Exact matches win; variations are only a fallback.
        candidates = self.exact_editions or self.variation_editions
        if not candidates:
            return

        # With several candidates and a known year, keep only the editions
        # whose date range covers that year.
        if len(candidates) > 1 and self.year:
            candidates = [
                edition
                for edition in candidates
                if edition.includes_year(self.year)
            ]

        if len(candidates) == 1:
            self.edition_guess = candidates[0]
@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullCitation(ResourceCitation):
    """Abstract base class indicating that a citation fully identifies a
    resource.

    It declares no fields or methods of its own; everything is inherited
    from ResourceCitation.
    """
@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullLawCitation(FullCitation):
    """A citation to a source listed in `reporters_db/laws.json`."""

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(FullCitation.Metadata):
        """Define fields on self.metadata."""

        publisher: Optional[str] = None
        day: Optional[str] = None
        month: Optional[str] = None

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation.

        Delegates to `eyecite.helpers.add_law_metadata` and then to the
        parent implementation.
        """
        # Imported here rather than at module level.
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_law_metadata

        add_law_metadata(self, words)
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return the normalized citation followed by whatever metadata was
        extracted: pin cite, "(publisher month day year)", and the
        parenthetical."""
        pieces = [self.corrected_citation()]
        meta = self.metadata
        if meta.pin_cite:
            # The pin cite is appended with no separator.
            pieces.append(f"{meta.pin_cite}")
        date_bits = (meta.publisher, meta.month, meta.day, meta.year)
        publisher_date = " ".join(filter(None, date_bits))
        if publisher_date:
            pieces.append(f" ({publisher_date})")
        if meta.parenthetical:
            pieces.append(f" ({meta.parenthetical})")
        return "".join(pieces)
@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullJournalCitation(FullCitation):
    """A citation to a source listed in `reporters_db/journals.json`."""

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation.

        Delegates to `eyecite.helpers.add_journal_metadata` and then to the
        parent implementation.
        """
        # Imported here rather than at module level.
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_journal_metadata

        add_journal_metadata(self, words)
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return the normalized citation followed by ", pin cite",
        " (year)" and " (parenthetical)" for each piece of metadata that is
        set."""
        pieces = [self.corrected_citation()]
        meta = self.metadata
        # (value, template) pairs, in output order; empty values are skipped.
        suffixes = (
            (meta.pin_cite, ", {}"),
            (meta.year, " ({})"),
            (meta.parenthetical, " ({})"),
        )
        pieces.extend(
            template.format(value) for value, template in suffixes if value
        )
        return "".join(pieces)
@dataclass(eq=False, unsafe_hash=False, repr=False)
class CaseCitation(ResourceCitation):
    """Convenience class representing one case citation found in a
    document.
    """

    def __hash__(self) -> int:
        """Hash on volume, page, corrected reporter and class name.

        Two CaseCitation objects with the same volume, reporter, and page are
        therefore equivalent. A citation whose page is None gets a hash that
        is unique to the object (its id), for safety.

        self.groups for CaseCitation objects usually contains these keys:
        - page (guaranteed here: https://github.com/freelawproject/reporters-db/blob/main/tests.py#L129)  # noqa: E501
        - reporter (guaranteed here: https://github.com/freelawproject/reporters-db/blob/main/tests.py#L129)  # noqa: E501
        - volume (almost always present, but some tax court citations don't have volumes)  # noqa: E501
        - reporter_nominative (sometimes)
        - volumes_nominative (sometimes)
        """
        if self.groups["page"] is None:
            return id(self)

        # "volume" and "page" are only included when present in the groups.
        payload = {
            key: self.groups[key]
            for key in ("volume", "page")
            if key in self.groups
        }
        payload["reporter"] = self.corrected_reporter()
        payload["class"] = type(self).__name__
        return hash(hash_sha256(payload))

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(FullCitation.Metadata):
        """Define fields on self.metadata."""

        # court is included for ShortCaseCitation as well. It won't appear in
        # the cite itself but can also be guessed from the reporter
        court: Optional[str] = None

    def guess_court(self):
        """Set the court to "scotus" when no court is recorded yet and any
        candidate edition belongs to a reporter flagged `is_scotus`."""
        if self.metadata.court:
            return
        if any(edition.reporter.is_scotus for edition in self.all_editions):
            self.metadata.court = "scotus"
@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullCaseCitation(CaseCitation, FullCitation):
    """Convenience class for a standard, fully named citation, i.e., the
    kind of citation that marks the first time a document is cited.

    Example:
    ```
    Adarand Constructors, Inc. v. Peña, 515 U.S. 200, 240
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CaseCitation.Metadata):
        """Define fields on self.metadata."""

        plaintiff: Optional[str] = None
        defendant: Optional[str] = None
        extra: Optional[str] = None

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation.

        Runs, in order: post-citation extraction, defendant extraction, the
        court guess, and finally the parent implementation.
        """
        # Imported here rather than at module level.
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_defendant, add_post_citation

        add_post_citation(self, words)
        add_defendant(self, words)
        self.guess_court()
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return the extracted cite formatted as
        "plaintiff v. defendant, cite, pin cite" + extra + " (court year)"
        + " (parenthetical)", skipping any part that is not set."""
        meta = self.metadata

        # Case name prefix placed before the citation itself.
        case_name = ""
        if meta.plaintiff:
            case_name += f"{meta.plaintiff} v. "
        if meta.defendant:
            case_name += f"{meta.defendant}, "

        pieces = [case_name, self.corrected_citation()]
        if meta.pin_cite:
            pieces.append(f", {meta.pin_cite}")
        if meta.extra:
            # `extra` is appended verbatim, with no separator added.
            pieces.append(meta.extra)

        court_and_year = " ".join(filter(None, (meta.court, meta.year)))
        if court_and_year:
            pieces.append(f" ({court_and_year})")
        if meta.parenthetical:
            pieces.append(f" ({meta.parenthetical})")
        return "".join(pieces)
@dataclass(eq=False, unsafe_hash=False, repr=False)
class ShortCaseCitation(CaseCitation):
    """Convenience class for a short form citation, i.e., the kind of
    citation made after a full citation has already appeared. It lacks a full
    case name and usually has a different page number than the canonical
    citation.

    Examples:
    ```
    Adarand, 515 U.S., at 241
    Adarand, 515 U.S. at 241
    515 U.S., at 241
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CaseCitation.Metadata):
        """Define fields on self.metadata."""

        antecedent_guess: Optional[str] = None

    def corrected_citation_full(self):
        """Return the corrected citation, prefixed with "antecedent, " when
        an antecedent guess is available."""
        antecedent = self.metadata.antecedent_guess
        prefix = f"{antecedent}, " if antecedent else ""
        return "".join((prefix, self.corrected_citation()))
@dataclass(eq=False, unsafe_hash=False, repr=False)
class SupraCitation(CitationBase):
    """Convenience class for a 'supra' citation, i.e., a citation to
    something that is above in the document. Like a short form citation, it
    lacks a full case name and usually has a different page number than the
    canonical citation.

    Examples:
    ```
    Adarand, supra, at 240
    Adarand, 515 supra, at 240
    Adarand, supra, somethingelse
    Adarand, supra. somethingelse
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Define fields on self.metadata."""

        antecedent_guess: Optional[str] = None
        pin_cite: Optional[str] = None
        volume: Optional[str] = None

    def formatted(self):
        """Return the cite rendered as "antecedent, volume supra, pin cite",
        leaving out any part that is not set."""
        meta = self.metadata
        antecedent = f"{meta.antecedent_guess}, " if meta.antecedent_guess else ""
        volume = f"{meta.volume} " if meta.volume else ""
        pin_cite = f", {meta.pin_cite}" if meta.pin_cite else ""
        return f"{antecedent}{volume}supra{pin_cite}"
@dataclass(eq=False, unsafe_hash=False, repr=False)
class IdCitation(CitationBase):
    """Convenience class for an 'id' or 'ibid' citation, i.e., a citation to
    the document referenced immediately prior. Unlike a regular citation
    object it has no knowledge of its reporter, volume, or page; the only
    helpful information it carries is the pin cite found after the 'id'
    token.

    Example: "... foo bar," id., at 240
    """

    def __hash__(self) -> int:
        """Hash on object identity, so every IdCitation is unique."""
        return id(self)

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Define fields on self.metadata."""

        pin_cite: Optional[str] = None

    def formatted(self):
        """Return "id." followed by ", pin cite" when a pin cite is set."""
        pin_cite = self.metadata.pin_cite
        return f"id., {pin_cite}" if pin_cite else "id."
@dataclass(eq=False, unsafe_hash=False, repr=False)
class UnknownCitation(CitationBase):
    """Convenience class which represents an unknown citation. A recognized
    citation should theoretically be parsed as a CaseCitation, FullLawCitation,
    or a FullJournalCitation. If it's something else, this class serves as
    a naive catch-all.
    """

    def __hash__(self) -> int:
        """UnknownCitation objects are always considered unique for safety:
        the hash is the object's id()."""
        return id(self)
@dataclass(eq=True, unsafe_hash=True)
class Token(UserString):
    """Base class for special tokens. Generic words are not wrapped in this
    class, for performance.

    `groups` is excluded from the dataclass comparison (`compare=False`).
    """

    data: str
    start: int
    end: int
    groups: dict = field(default_factory=dict, compare=False)

    @classmethod
    def from_match(cls, m, extra, offset=0) -> "Token":
        """Build a token from a regular expression match.

        Called by TokenExtractor. The default implementation takes the text
        and span of group 1, shifts the span by `offset`, stores the match's
        named groups, and forwards `extra` as keyword arguments.
        """
        begin, stop = m.span(1)
        # ignore "too many arguments" type error -- this is called
        # by subclasses with additional attributes
        return cls(  # type: ignore[call-arg]
            m[1],
            begin + offset,
            stop + offset,
            groups=m.groupdict(),
            **extra,
        )

    def merge(self, other: "Token") -> Optional["Token"]:
        """Return self when `other` has the same span, exact type and
        groups as self; return None otherwise."""
        if self.start != other.start or self.end != other.end:
            return None
        if type(self) is not type(other):
            return None
        if self.groups != other.groups:
            return None
        return self
# For performance, lists of tokens can include either Token subclasses
# or bare strings (the typical case of words that aren't
# related to citations)
# TokenOrStr is one element of such a list; Tokens is the list itself.
TokenOrStr = Union[Token, str]
Tokens = List[TokenOrStr]
@dataclass(eq=True, unsafe_hash=True)
class CitationToken(Token):
    """String matching a citation regex from `reporters_db/reporters.json`."""

    exact_editions: Sequence[Edition] = field(default_factory=tuple)
    variation_editions: Sequence[Edition] = field(default_factory=tuple)
    short: bool = False

    def __post_init__(self):
        """Make iterables into tuples to make sure we're hashable."""
        self.exact_editions = tuple(self.exact_editions)
        self.variation_editions = tuple(self.variation_editions)

    def merge(self, other: "Token") -> Optional["Token"]:
        """To merge citation tokens, also make sure `short` matches,
        and combine their editions.

        Returns self, updated in place with the de-duplicated union of both
        tokens' editions, when the base-class merge succeeds and `short`
        matches; returns None otherwise.
        """
        # Compare against None explicitly: the parent merge returns a token,
        # and tokens are UserString objects, so one with empty text would be
        # falsy even though the merge succeeded.
        if super().merge(other) is None:
            return None

        other = cast(CitationToken, other)
        if self.short != other.short:
            return None

        # dict.fromkeys() drops duplicate editions while keeping first-seen
        # order, so the merged tuples have a deterministic order (a set would
        # leave them in arbitrary order).
        self.exact_editions = tuple(
            dict.fromkeys((*self.exact_editions, *other.exact_editions))
        )
        self.variation_editions = tuple(
            dict.fromkeys(
                (*self.variation_editions, *other.variation_editions)
            )
        )
        return self
@dataclass(eq=True, unsafe_hash=True)
class SectionToken(Token):
    """Word containing a section symbol.

    Adds no fields or methods to Token."""
@dataclass(eq=True, unsafe_hash=True)
class SupraToken(Token):
    """Word matching "supra" with or without punctuation.

    Adds no fields or methods to Token."""
@dataclass(eq=True, unsafe_hash=True)
class IdToken(Token):
    """Word matching "id" or "ibid".

    Adds no fields or methods to Token."""
@dataclass(eq=True, unsafe_hash=True)
class ParagraphToken(Token):
    """Word matching a break between paragraphs.

    Adds no fields or methods to Token."""
@dataclass(eq=True, unsafe_hash=True)
class StopWordToken(Token):
    """Word matching one of the STOP_TOKENS.

    Adds no fields or methods to Token."""
@dataclass
class TokenExtractor:
    """Finds every match of one regex in a string and turns each match into
    a Token via the configured constructor."""

    regex: str
    # constructor should be Callable[[re.Match, dict, int], Token]
    # but this issue makes it inconvenient to specify the input types:
    # https://github.com/python/mypy/issues/5485
    constructor: Callable[..., Token]
    extra: Dict = field(default_factory=dict)
    flags: int = 0
    strings: List = field(default_factory=list)

    def get_matches(self, text):
        """Return an iterator of match objects for all matches in text."""
        pattern = self.compiled_regex
        return pattern.finditer(text)

    def get_token(self, m, offset=0) -> Token:
        """Build a Token from match `m` by calling the constructor with the
        match, this extractor's `extra` dict, and `offset`."""
        build = self.constructor
        return build(m, self.extra, offset)

    def __hash__(self):
        """Hash on repr(self). Extractors must be hashable so redundant ones
        returned by the pyahocorasick filter can be removed."""
        return hash(repr(self))

    @property
    def compiled_regex(self):
        """Compile `regex` with `flags` on first access and reuse the
        compiled pattern afterwards."""
        try:
            return self._compiled_regex
        except AttributeError:
            # First access: nothing cached on the instance yet.
            self._compiled_regex = re.compile(self.regex, flags=self.flags)
            return self._compiled_regex
@dataclass(frozen=True)
class Resource(ResourceType):
    """Thin resource class representing an object to which a citation can be
    resolved. See `eyecite.resolve` for more details."""

    citation: FullCitation

    def __hash__(self):
        """Resources are the same if their citations are semantically
        equivalent, as defined by their hash function.

        Note: Resources composed of citations with missing page numbers are
        NOT considered the same, even if their other attributes are identical.
        This is to avoid potential false positives.
        """
        return hash(
            hash_sha256(
                {
                    "citation": hash(self.citation),
                    "class": type(self).__name__,
                }
            )
        )

    def __eq__(self, other):
        """Two Resource objects are equal when their hashes are equal.

        Anything that is not a Resource is never equal to one: NotImplemented
        is returned so Python falls back to its default comparison instead of
        calling `other.__hash__()` (which is None on unhashable objects such
        as lists and dicts).
        """
        if not isinstance(other, Resource):
            return NotImplemented
        return self.__hash__() == other.__hash__()
Classes
class CaseCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, all_editions: Sequence[Edition] = <factory>, edition_guess: Optional[Edition] = None, year: Optional[int] = None)
-
Convenience class which represents a single citation found in a document.
Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False) class CaseCitation(ResourceCitation): """Convenience class which represents a single citation found in a document. """ def __hash__(self) -> int: """CaseCitation objects that have the same volume, reporter, and page are considered equivalent, unless the citation is missing a page, in which case the object's hash will be unique for safety. self.groups for CaseCitation objects usually contains these keys: - page (guaranteed here: https://github.com/freelawproject/reporters-db/blob/main/tests.py#L129) # noqa: E501 - reporter (guaranteed here: https://github.com/freelawproject/reporters-db/blob/main/tests.py#L129) # noqa: E501 - volume (almost always present, but some tax court citations don't have volumes) # noqa: E501 - reporter_nominative (sometimes) - volumes_nominative (sometimes) """ if self.groups["page"] is None: return id(self) else: return hash( hash_sha256( { **{ k: self.groups[k] for k in ["volume", "page"] if k in self.groups }, **{ "reporter": self.corrected_reporter(), "class": type(self).__name__, }, } ) ) @dataclass(eq=True, unsafe_hash=True) class Metadata(FullCitation.Metadata): """Define fields on self.metadata.""" # court is included for ShortCaseCitation as well. It won't appear in # the cite itself but can also be guessed from the reporter court: Optional[str] = None def guess_court(self): """Set court based on reporter.""" if not self.metadata.court and any( e.reporter.is_scotus for e in self.all_editions ): self.metadata.court = "scotus"
Ancestors
Subclasses
Methods
def guess_court(self)
-
Set court based on reporter.
Expand source code
def guess_court(self): """Set court based on reporter.""" if not self.metadata.court and any( e.reporter.is_scotus for e in self.all_editions ): self.metadata.court = "scotus"
Inherited members
class CitationBase (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None)
-
Base class for objects returned by `get_citations()`. We define several subclasses of this class below, representing the various types of citations that might exist.
Expand source code
@dataclass(eq=False, unsafe_hash=False) class CitationBase: """Base class for objects returned by `eyecite.find.get_citations`. We define several subclasses of this class below, representing the various types of citations that might exist.""" token: "Token" # token this citation came from index: int # index of _token in the token list # span() overrides span_start: Optional[int] = None span_end: Optional[int] = None full_span_start: Optional[int] = None full_span_end: Optional[int] = None groups: dict = field(default_factory=dict) metadata: Any = None def __post_init__(self): """Set up groups and metadata.""" # Allow groups to be used in comparisons: self.groups = self.token.groups # Make metadata a self.Metadata object: self.metadata = ( self.Metadata(**self.metadata) if isinstance(self.metadata, dict) else self.Metadata() ) # Set known missing page numbers to None if re.search("^_+$", self.groups.get("page", "") or ""): self.groups["page"] = None def __repr__(self): """Simplified repr() to be more readable than full dataclass repr(). Just shows 'FullCaseCitation("matched text", groups=...)'.""" return ( f"{self.__class__.__name__}(" + f"{repr(self.matched_text())}" + (f", groups={repr(self.groups)}" if self.groups else "") + f", metadata={repr(self.metadata)}" + ")" ) def __hash__(self) -> int: """In general, citations are considered equivalent if they have the same group values (i.e., the same regex group content that is extracted from the matched text). Subclasses may override this method in order to specify equivalence behavior that is more appropriate for certain kinds of citations (e.g., see CaseCitation override). 
self.groups typically contains different keys for different objects: FullLawCitation (non-exhaustive and non-guaranteed): - chapter - reporter - law_section - issue - page - docket_number - pamphlet - title FullJournalCitation (non-exhaustive and non-guaranteed): - volume - reporter - page FullCaseCitation (see CaseCitation.__hash__() notes) """ return hash( hash_sha256( {**dict(self.groups.items()), **{"class": type(self).__name__}} ) ) def __eq__(self, other): """This method is inherited by all subclasses and should not be overridden. It implements object equality in exactly the same way as defined in an object's __hash__() function, which should be overridden instead if desired. """ return self.__hash__() == other.__hash__() @dataclass(eq=True, unsafe_hash=True) class Metadata: """Define fields on self.metadata.""" parenthetical: Optional[str] = None def corrected_citation(self): """Return citation with any variations normalized.""" return self.matched_text() def corrected_citation_full(self): """Return citation with any variations normalized, including extracted metadata if any.""" return self.matched_text() def dump(self) -> dict: """Return citation data for printing by dump_citations.""" return { "groups": self.groups, "metadata": { k: v for k, v in self.metadata.__dict__.items() if v is not None }, } def matched_text(self): """Text that identified this citation, such as '1 U.S. 1' or 'Id.'""" return str(self.token) def span(self): """Start and stop offsets in source text for matched_text().""" return ( ( self.span_start if self.span_start is not None else self.token.start ), self.span_end if self.span_end is not None else self.token.end, ) def full_span(self) -> Tuple[int, int]: """Span indices that fully cover the citation Start and stop offsets in source text for full citation text (including plaintiff, defendant, post citation, ...) Relevant for FullCaseCitation, FullJournalCitation and FullLawCitation. 
:returns: Tuple of start and end indicies """ start = self.full_span_start if start is None: start = self.span()[0] end = self.full_span_end if end is None: end = self.span()[1] return start, end
Subclasses
Class variables
var Metadata
-
Define fields on self.metadata.
var full_span_end : Optional[int]
var full_span_start : Optional[int]
var groups : dict
var index : int
var metadata : Any
var span_end : Optional[int]
var span_start : Optional[int]
var token : Token
Methods
def corrected_citation(self)
-
Return citation with any variations normalized.
Expand source code
def corrected_citation(self): """Return citation with any variations normalized.""" return self.matched_text()
def corrected_citation_full(self)
-
Return citation with any variations normalized, including extracted metadata if any.
Expand source code
def corrected_citation_full(self): """Return citation with any variations normalized, including extracted metadata if any.""" return self.matched_text()
def dump(self) ‑> dict
-
Return citation data for printing by dump_citations.
Expand source code
def dump(self) -> dict: """Return citation data for printing by dump_citations.""" return { "groups": self.groups, "metadata": { k: v for k, v in self.metadata.__dict__.items() if v is not None }, }
def full_span(self) ‑> Tuple[int, int]
-
Span indices that fully cover the citation
Start and stop offsets in source text for full citation text (including plaintiff, defendant, post citation, …)
Relevant for FullCaseCitation, FullJournalCitation and FullLawCitation.
:returns: Tuple of start and end indices
Expand source code
def full_span(self) -> Tuple[int, int]: """Span indices that fully cover the citation Start and stop offsets in source text for full citation text (including plaintiff, defendant, post citation, ...) Relevant for FullCaseCitation, FullJournalCitation and FullLawCitation. :returns: Tuple of start and end indicies """ start = self.full_span_start if start is None: start = self.span()[0] end = self.full_span_end if end is None: end = self.span()[1] return start, end
def matched_text(self)
-
Text that identified this citation, such as '1 U.S. 1' or 'Id.'
Expand source code
def matched_text(self): """Text that identified this citation, such as '1 U.S. 1' or 'Id.'""" return str(self.token)
def span(self)
-
Start and stop offsets in source text for matched_text().
Expand source code
def span(self): """Start and stop offsets in source text for matched_text().""" return ( ( self.span_start if self.span_start is not None else self.token.start ), self.span_end if self.span_end is not None else self.token.end, )
class CitationToken (data: str, start: int, end: int, groups: dict = <factory>, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, short: bool = False)
-
String matching a citation regex from `reporters_db/reporters.json`.
Expand source code
@dataclass(eq=True, unsafe_hash=True) class CitationToken(Token): """String matching a citation regex from `reporters_db/reporters.json`.""" exact_editions: Sequence[Edition] = field(default_factory=tuple) variation_editions: Sequence[Edition] = field(default_factory=tuple) short: bool = False def __post_init__(self): """Make iterables into tuples to make sure we're hashable.""" self.exact_editions = tuple(self.exact_editions) self.variation_editions = tuple(self.variation_editions) def merge(self, other: "Token") -> Optional["Token"]: """To merge citation tokens, also make sure `short` matches, and combine their editions.""" merged = super().merge(other) if merged: other = cast(CitationToken, other) if self.short == other.short: self.exact_editions = cast(tuple, self.exact_editions) + cast( tuple, other.exact_editions ) self.variation_editions = cast( tuple, self.variation_editions ) + cast(tuple, other.variation_editions) # Remove duplicate editions after merge self.exact_editions = tuple(set(self.exact_editions)) self.variation_editions = tuple(set(self.variation_editions)) return self return None
Ancestors
- Token
- collections.UserString
- collections.abc.Sequence
- collections.abc.Reversible
- collections.abc.Collection
- collections.abc.Sized
- collections.abc.Iterable
- collections.abc.Container
Class variables
var exact_editions : Sequence[Edition]
var short : bool
var variation_editions : Sequence[Edition]
Methods
def merge(self, other: Token) ‑> Optional[Token]
-
To merge citation tokens, also make sure `short` matches, and combine their editions.
Expand source code
def merge(self, other: "Token") -> Optional["Token"]: """To merge citation tokens, also make sure `short` matches, and combine their editions.""" merged = super().merge(other) if merged: other = cast(CitationToken, other) if self.short == other.short: self.exact_editions = cast(tuple, self.exact_editions) + cast( tuple, other.exact_editions ) self.variation_editions = cast( tuple, self.variation_editions ) + cast(tuple, other.variation_editions) # Remove duplicate editions after merge self.exact_editions = tuple(set(self.exact_editions)) self.variation_editions = tuple(set(self.variation_editions)) return self return None
Inherited members
class Edition (reporter: Reporter, short_name: str, start: Optional[datetime.datetime], end: Optional[datetime.datetime])
-
Class for individual editions in `reporters_db`, like "S.W." and "S.W.2d".
Expand source code
@dataclass(eq=True, frozen=True)
class Edition:
    """One edition of a reporter in `reporters_db`, e.g. "S.W." or "S.W.2d".

    `start` and `end` bound the years the edition covers; either may be
    None, in which case that side is treated as unbounded.
    """

    reporter: Reporter
    short_name: str
    start: Optional[datetime]
    end: Optional[datetime]

    def includes_year(
        self,
        year: int,
    ) -> bool:
        """Return True if edition contains cases for the given year."""
        # A year in the future can never be covered.
        if year > datetime.now().year:
            return False
        if self.start is not None and self.start.year > year:
            return False
        return self.end is None or self.end.year >= year
Class variables
var end : Optional[datetime.datetime]
var reporter : Reporter
var short_name : str
var start : Optional[datetime.datetime]
Methods
def includes_year(self, year: int) ‑> bool
-
Return True if edition contains cases for the given year.
Expand source code
def includes_year(
    self,
    year: int,
) -> bool:
    """Return True if edition contains cases for the given year."""
    # A year in the future can never be covered.
    if year > datetime.now().year:
        return False
    # A missing start/end leaves that side of the range open.
    if self.start is not None and self.start.year > year:
        return False
    return self.end is None or self.end.year >= year
class FullCaseCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, all_editions: Sequence[Edition] = <factory>, edition_guess: Optional[Edition] = None, year: Optional[int] = None)
-
Convenience class which represents a standard, fully named citation, i.e., the kind of citation that marks the first time a document is cited.
Example:
Adarand Constructors, Inc. v. Peña, 515 U.S. 200, 240
Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullCaseCitation(CaseCitation, FullCitation):
    """A standard, fully named case citation, i.e. the form used the first
    time a document is cited.

    Example:
    ```
    Adarand Constructors, Inc. v. Peña, 515 U.S. 200, 240
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CaseCitation.Metadata):
        """Define fields on self.metadata."""

        plaintiff: Optional[str] = None
        defendant: Optional[str] = None
        extra: Optional[str] = None

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation."""
        # Imported here rather than at module level.
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_defendant, add_post_citation

        add_post_citation(self, words)
        add_defendant(self, words)
        self.guess_court()
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return formatted version of extracted cite."""
        meta = self.metadata
        court_and_year = " ".join(filter(None, (meta.court, meta.year)))
        # Absent pieces contribute an empty string, which join() ignores.
        segments = [
            f"{meta.plaintiff} v. " if meta.plaintiff else "",
            f"{meta.defendant}, " if meta.defendant else "",
            self.corrected_citation(),
            f", {meta.pin_cite}" if meta.pin_cite else "",
            meta.extra or "",
            f" ({court_and_year})" if court_and_year else "",
            f" ({meta.parenthetical})" if meta.parenthetical else "",
        ]
        return "".join(segments)
Ancestors
Methods
def corrected_citation_full(self)
-
Return formatted version of extracted cite.
Expand source code
def corrected_citation_full(self):
    """Return formatted version of extracted cite."""
    meta = self.metadata
    court_and_year = " ".join(filter(None, (meta.court, meta.year)))
    # Absent pieces contribute an empty string, which join() ignores.
    segments = [
        f"{meta.plaintiff} v. " if meta.plaintiff else "",
        f"{meta.defendant}, " if meta.defendant else "",
        self.corrected_citation(),
        f", {meta.pin_cite}" if meta.pin_cite else "",
        meta.extra or "",
        f" ({court_and_year})" if court_and_year else "",
        f" ({meta.parenthetical})" if meta.parenthetical else "",
    ]
    return "".join(segments)
Inherited members
class FullCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, all_editions: Sequence[Edition] = <factory>, edition_guess: Optional[Edition] = None, year: Optional[int] = None)
-
Abstract base class indicating that a citation fully identifies a resource.
Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullCitation(ResourceCitation):
    """Abstract base class indicating that a citation fully identifies a
    resource.

    Declares no fields or methods beyond those of `ResourceCitation`.
    """
Ancestors
Subclasses
Inherited members
class FullJournalCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, all_editions: Sequence[Edition] = <factory>, edition_guess: Optional[Edition] = None, year: Optional[int] = None)
-
Citation to a source from `reporters_db/journals.json`.
Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullJournalCitation(FullCitation):
    """Citation to a source from `reporters_db/journals.json`."""

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation."""
        # Imported here rather than at module level.
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_journal_metadata

        add_journal_metadata(self, words)
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return citation with any variations normalized, including
        extracted metadata if any."""
        meta = self.metadata
        # Absent pieces contribute an empty string, which join() ignores.
        segments = [
            self.corrected_citation(),
            f", {meta.pin_cite}" if meta.pin_cite else "",
            f" ({meta.year})" if meta.year else "",
            f" ({meta.parenthetical})" if meta.parenthetical else "",
        ]
        return "".join(segments)
Ancestors
Inherited members
class FullLawCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, all_editions: Sequence[Edition] = <factory>, edition_guess: Optional[Edition] = None, year: Optional[int] = None)
-
Citation to a source from `reporters_db/laws.json`.
Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullLawCitation(FullCitation):
    """Citation to a source from `reporters_db/laws.json`."""

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(FullCitation.Metadata):
        """Define fields on self.metadata."""

        publisher: Optional[str] = None
        day: Optional[str] = None
        month: Optional[str] = None

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation."""
        # Imported here rather than at module level.
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_law_metadata

        add_law_metadata(self, words)
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return citation with any variations normalized, including
        extracted metadata if any."""
        meta = self.metadata
        publisher_date = " ".join(
            filter(None, (meta.publisher, meta.month, meta.day, meta.year))
        )
        # The pin cite is appended verbatim, with no separator of its own.
        segments = [
            self.corrected_citation(),
            f"{meta.pin_cite}" if meta.pin_cite else "",
            f" ({publisher_date})" if publisher_date else "",
            f" ({meta.parenthetical})" if meta.parenthetical else "",
        ]
        return "".join(segments)
Ancestors
Inherited members
class IdCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None)
-
Convenience class which represents an 'id' or 'ibid' citation, i.e., a citation to the document referenced immediately prior. An 'id' citation is unlike a regular citation object since it has no knowledge of its reporter, volume, or page. Instead, the only helpful information that this reference possesses is a record of the pin cite after the 'id' token.
Example: "… foo bar," id., at 240
Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class IdCitation(CitationBase):
    """An 'id' or 'ibid' citation, i.e. a citation to the document
    referenced immediately prior.

    Unlike a regular citation object, an 'id' citation has no knowledge of
    its reporter, volume, or page. The only helpful information it carries
    is the pin cite found after the 'id' token.

    Example: "... foo bar," id., at 240
    """

    def __hash__(self) -> int:
        """IdCitation objects are always considered unique for safety."""
        return id(self)

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Define fields on self.metadata."""

        pin_cite: Optional[str] = None

    def formatted(self):
        """Return formatted version of extracted cite."""
        pin = self.metadata.pin_cite
        return f"id., {pin}" if pin else "id."
Ancestors
Methods
def formatted(self)
-
Return formatted version of extracted cite.
Expand source code
def formatted(self):
    """Return formatted version of extracted cite."""
    pin = self.metadata.pin_cite
    # "id." alone when there is no pin cite, otherwise "id., <pin cite>".
    return f"id., {pin}" if pin else "id."
Inherited members
class IdToken (data: str, start: int, end: int, groups: dict = <factory>)
-
Word matching "id" or "ibid".
Expand source code
@dataclass(eq=True, unsafe_hash=True)
class IdToken(Token):
    """Word matching "id" or "ibid".

    Marker subclass: it declares no fields or methods of its own, so all
    behavior comes from `Token`.
    """
Ancestors
- Token
- collections.UserString
- collections.abc.Sequence
- collections.abc.Reversible
- collections.abc.Collection
- collections.abc.Sized
- collections.abc.Iterable
- collections.abc.Container
Inherited members
class ParagraphToken (data: str, start: int, end: int, groups: dict = <factory>)
-
Word matching a break between paragraphs.
Expand source code
@dataclass(eq=True, unsafe_hash=True)
class ParagraphToken(Token):
    """Word matching a break between paragraphs.

    Marker subclass: it declares no fields or methods of its own, so all
    behavior comes from `Token`.
    """
Ancestors
- Token
- collections.UserString
- collections.abc.Sequence
- collections.abc.Reversible
- collections.abc.Collection
- collections.abc.Sized
- collections.abc.Iterable
- collections.abc.Container
Inherited members
class Reporter (short_name: str, name: str, cite_type: str, source: str, is_scotus: bool = False)
-
Class for top-level reporters in `reporters_db`, like "S.W."
Expand source code
@dataclass(eq=True, frozen=True)
class Reporter:
    """A top-level reporter in `reporters_db`, like "S.W.".

    `is_scotus` is forced to True after construction when `cite_type`
    contains "scotus" (case-insensitive), or when `cite_type` is exactly
    "federal" and `name` contains "supreme" (case-insensitive).
    """

    short_name: str
    name: str
    cite_type: str
    source: str  # one of "reporters", "laws", "journals"
    is_scotus: bool = False

    def __post_init__(self):
        federal_supreme = (
            self.cite_type == "federal" and "supreme" in self.name.lower()
        )
        if not federal_supreme and "scotus" not in self.cite_type.lower():
            return
        # The class is frozen, so ordinary attribute assignment would raise;
        # go through object.__setattr__ instead.
        object.__setattr__(self, "is_scotus", True)
Class variables
var cite_type : str
var is_scotus : bool
var name : str
var short_name : str
var source : str
class Resource (citation: FullCitation)
-
Thin resource class representing an object to which a citation can be resolved. See `eyecite.resolve` for more details.
Expand source code
@dataclass(frozen=True)
class Resource(ResourceType):
    """Thin resource class representing an object to which a citation can be
    resolved. See `eyecite.resolve` for more details."""

    citation: FullCitation

    def __hash__(self):
        """Resources are the same if their citations are semantically
        equivalent, as defined by their hash function.

        Note: Resources composed of citations with missing page numbers are
        NOT considered the same, even if their other attributes are identical.
        This is to avoid potential false positives.
        """
        return hash(
            hash_sha256(
                {
                    "citation": hash(self.citation),
                    "class": type(self).__name__,
                }
            )
        )

    def __eq__(self, other):
        """Return True when `other` is a Resource with the same hash.

        Non-Resource operands yield NotImplemented, so Python falls back to
        its default comparison. This avoids a TypeError on unhashable
        operands (whose `__hash__` is None) and a spurious match with an
        unrelated object that happens to share the hash value.
        """
        if not isinstance(other, Resource):
            return NotImplemented
        return self.__hash__() == other.__hash__()
Ancestors
- collections.abc.Hashable
- typing.Generic
Class variables
var citation : FullCitation
class ResourceCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, all_editions: Sequence[Edition] = <factory>, edition_guess: Optional[Edition] = None, year: Optional[int] = None)
-
Base class for a case, law, or journal citation. Could be short or long.
Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class ResourceCitation(CitationBase):
    """Base class for a case, law, or journal citation. Could be short or
    long."""

    # Editions that might match this reporter string
    exact_editions: Sequence[Edition] = field(default_factory=tuple)
    variation_editions: Sequence[Edition] = field(default_factory=tuple)
    all_editions: Sequence[Edition] = field(default_factory=tuple)
    edition_guess: Optional[Edition] = None

    # year extracted from metadata["year"] and converted to int,
    # if in a valid range
    year: Optional[int] = None

    def __post_init__(self):
        """Make iterables into tuples to make sure we're hashable."""
        exact = tuple(self.exact_editions)
        variations = tuple(self.variation_editions)
        self.exact_editions = exact
        self.variation_editions = variations
        # all_editions is always derived: exact matches first, then
        # variations.
        self.all_editions = exact + variations
        super().__post_init__()

    def __hash__(self) -> int:
        """Hash the regex groups, the sorted editions and the class name.

        ResourceCitation objects are hashed in the same way as their parent
        class (CitationBase) objects, except that we also take into
        consideration the all_editions field.
        """
        payload = dict(self.groups)
        # Sort by short_name so the order of all_editions does not affect
        # the hash.
        payload["all_editions"] = sorted(
            [asdict(e) for e in self.all_editions],
            key=lambda d: d["short_name"],  # type: ignore
        )
        payload["class"] = type(self).__name__
        return hash(hash_sha256(payload))

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Define fields on self.metadata."""

        pin_cite: Optional[str] = None
        year: Optional[str] = None

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation."""
        self.guess_edition()

    def dump(self) -> dict:
        """Return citation data for printing by dump_citations."""
        dumped = dict(super().dump())
        dumped["year"] = self.year
        return dumped

    def corrected_reporter(self):
        """Get official reporter string from edition_guess, if possible."""
        guess = self.edition_guess
        if guess:
            return guess.short_name
        return self.groups["reporter"]

    def corrected_citation(self):
        """Return citation with corrected reporter."""
        guess = self.edition_guess
        text = self.matched_text()
        if not guess:
            return text
        return text.replace(self.groups["reporter"], guess.short_name)

    def guess_edition(self):
        """Set edition_guess."""
        # Use exact matches if possible, otherwise try variations
        candidates = self.exact_editions or self.variation_editions
        if not candidates:
            return
        # Attempt resolution by date
        if self.year and len(candidates) > 1:
            candidates = [
                ed for ed in candidates if ed.includes_year(self.year)
            ]
        # Only commit to a guess when exactly one candidate is left.
        if len(candidates) == 1:
            self.edition_guess = candidates[0]
Ancestors
Subclasses
Class variables
var all_editions : Sequence[Edition]
var edition_guess : Optional[Edition]
var exact_editions : Sequence[Edition]
var variation_editions : Sequence[Edition]
var year : Optional[int]
Methods
def add_metadata(self, words: Tokens)
-
Extract metadata from text before and after citation.
Expand source code
def add_metadata(self, words: "Tokens"):
    """Extract metadata from text before and after citation.

    At this level the only step is `self.guess_edition()`; `words` is not
    used here.
    """
    self.guess_edition()
def corrected_citation(self)
-
Return citation with corrected reporter.
Expand source code
def corrected_citation(self):
    """Return citation with corrected reporter."""
    guess = self.edition_guess
    text = self.matched_text()
    # Without an edition guess the matched text is returned untouched.
    if not guess:
        return text
    return text.replace(self.groups["reporter"], guess.short_name)
def corrected_reporter(self)
-
Get official reporter string from edition_guess, if possible.
Expand source code
def corrected_reporter(self):
    """Get official reporter string from edition_guess, if possible."""
    guess = self.edition_guess
    if guess:
        return guess.short_name
    # No guess available: fall back to the reporter text as matched.
    return self.groups["reporter"]
def guess_edition(self)
-
Set edition_guess.
Expand source code
def guess_edition(self):
    """Set edition_guess."""
    # Use exact matches if possible, otherwise try variations
    candidates = self.exact_editions or self.variation_editions
    if not candidates:
        return
    # Attempt resolution by date
    if self.year and len(candidates) > 1:
        candidates = [ed for ed in candidates if ed.includes_year(self.year)]
    # Only commit to a guess when exactly one candidate is left.
    if len(candidates) == 1:
        self.edition_guess = candidates[0]
Inherited members
class SectionToken (data: str, start: int, end: int, groups: dict = <factory>)
-
Word containing a section symbol.
Expand source code
@dataclass(eq=True, unsafe_hash=True)
class SectionToken(Token):
    """Word containing a section symbol.

    Marker subclass: it declares no fields or methods of its own, so all
    behavior comes from `Token`.
    """
Ancestors
- Token
- collections.UserString
- collections.abc.Sequence
- collections.abc.Reversible
- collections.abc.Collection
- collections.abc.Sized
- collections.abc.Iterable
- collections.abc.Container
Inherited members
class ShortCaseCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, all_editions: Sequence[Edition] = <factory>, edition_guess: Optional[Edition] = None, year: Optional[int] = None)
-
Convenience class which represents a short form citation, i.e., the kind of citation made after a full citation has already appeared. This kind of citation lacks a full case name and usually has a different page number than the canonical citation.
Examples:
Adarand, 515 U.S., at 241
Adarand, 515 U.S. at 241
515 U.S., at 241
Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class ShortCaseCitation(CaseCitation):
    """A short form citation, i.e. the kind of citation made after a full
    citation has already appeared.

    This kind of citation lacks a full case name and usually has a
    different page number than the canonical citation.

    Examples:
    ```
    Adarand, 515 U.S., at 241
    Adarand, 515 U.S. at 241
    515 U.S., at 241
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CaseCitation.Metadata):
        """Define fields on self.metadata."""

        antecedent_guess: Optional[str] = None

    def corrected_citation_full(self):
        """Return formatted version of extracted cite."""
        antecedent = self.metadata.antecedent_guess
        citation = self.corrected_citation()
        # Prefix the antecedent and a comma only when one was guessed.
        pieces = [f"{antecedent}, ", citation] if antecedent else [citation]
        return "".join(pieces)
Ancestors
Methods
def corrected_citation_full(self)
-
Return formatted version of extracted cite.
Expand source code
def corrected_citation_full(self):
    """Return formatted version of extracted cite."""
    antecedent = self.metadata.antecedent_guess
    citation = self.corrected_citation()
    # Prefix the antecedent and a comma only when one was guessed.
    pieces = [f"{antecedent}, ", citation] if antecedent else [citation]
    return "".join(pieces)
Inherited members
class StopWordToken (data: str, start: int, end: int, groups: dict = <factory>)
-
Word matching one of the STOP_TOKENS.
Expand source code
@dataclass(eq=True, unsafe_hash=True)
class StopWordToken(Token):
    """Word matching one of the STOP_TOKENS.

    Marker subclass: it declares no fields or methods of its own, so all
    behavior comes from `Token`.
    """
Ancestors
- Token
- collections.UserString
- collections.abc.Sequence
- collections.abc.Reversible
- collections.abc.Collection
- collections.abc.Sized
- collections.abc.Iterable
- collections.abc.Container
Inherited members
class SupraCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None)
-
Convenience class which represents a 'supra' citation, i.e., a citation to something that is above in the document. Like a short form citation, this kind of citation lacks a full case name and usually has a different page number than the canonical citation.
Examples:
Adarand, supra, at 240
Adarand, 515 supra, at 240
Adarand, supra, somethingelse
Adarand, supra. somethingelse
Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class SupraCitation(CitationBase):
    """A 'supra' citation, i.e. a citation to something that is above in
    the document.

    Like a short form citation, this kind of citation lacks a full case
    name and usually has a different page number than the canonical
    citation.

    Examples:
    ```
    Adarand, supra, at 240
    Adarand, 515 supra, at 240
    Adarand, supra, somethingelse
    Adarand, supra. somethingelse
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Define fields on self.metadata."""

        antecedent_guess: Optional[str] = None
        pin_cite: Optional[str] = None
        volume: Optional[str] = None

    def formatted(self):
        """Return formatted version of extracted cite."""
        meta = self.metadata
        # Each optional piece carries its own separator and is empty when
        # the corresponding metadata is missing.
        antecedent = (
            f"{meta.antecedent_guess}, " if meta.antecedent_guess else ""
        )
        volume = f"{meta.volume} " if meta.volume else ""
        pin = f", {meta.pin_cite}" if meta.pin_cite else ""
        return f"{antecedent}{volume}supra{pin}"
Ancestors
Methods
def formatted(self)
-
Return formatted version of extracted cite.
Expand source code
def formatted(self):
    """Return formatted version of extracted cite."""
    meta = self.metadata
    # Each optional piece carries its own separator and is empty when the
    # corresponding metadata is missing.
    antecedent = f"{meta.antecedent_guess}, " if meta.antecedent_guess else ""
    volume = f"{meta.volume} " if meta.volume else ""
    pin = f", {meta.pin_cite}" if meta.pin_cite else ""
    return f"{antecedent}{volume}supra{pin}"
Inherited members
class SupraToken (data: str, start: int, end: int, groups: dict = <factory>)
-
Word matching "supra" with or without punctuation.
Expand source code
@dataclass(eq=True, unsafe_hash=True)
class SupraToken(Token):
    """Word matching "supra" with or without punctuation.

    Marker subclass: it declares no fields or methods of its own, so all
    behavior comes from `Token`.
    """
Ancestors
- Token
- collections.UserString
- collections.abc.Sequence
- collections.abc.Reversible
- collections.abc.Collection
- collections.abc.Sized
- collections.abc.Iterable
- collections.abc.Container
Inherited members
class Token (data: str, start: int, end: int, groups: dict = <factory>)
-
Base class for special tokens. For performance, this isn't used for generic words.
Expand source code
@dataclass(eq=True, unsafe_hash=True)
class Token(UserString):
    """Base class for special tokens. For performance, this isn't used for
    generic words.

    `groups` is excluded from the generated comparison (`compare=False`).
    """

    data: str
    start: int
    end: int
    groups: dict = field(default_factory=dict, compare=False)

    @classmethod
    def from_match(cls, m, extra, offset=0) -> "Token":
        """Return a token object based on a regular expression match.

        This gets called by TokenExtractor. By default, just use the entire
        matched string. The text and span come from group 1 of `m`, the
        span is shifted by `offset`, and `extra` is forwarded as keyword
        arguments to the constructor.
        """
        span_start, span_end = m.span(1)
        # ignore "too many arguments" type error -- this is called
        # by subclasses with additional attributes
        return cls(  # type: ignore[call-arg]
            m[1],
            span_start + offset,
            span_end + offset,
            groups=m.groupdict(),
            **extra,
        )

    def merge(self, other: "Token") -> Optional["Token"]:
        """Merge two tokens, by returning self if other is identical to
        self, and None otherwise."""
        if self.start != other.start or self.end != other.end:
            return None
        if type(self) is not type(other):
            return None
        return self if self.groups == other.groups else None
Ancestors
- collections.UserString
- collections.abc.Sequence
- collections.abc.Reversible
- collections.abc.Collection
- collections.abc.Sized
- collections.abc.Iterable
- collections.abc.Container
Subclasses
Class variables
var data : str
var end : int
var groups : dict
var start : int
Static methods
def from_match(m, extra, offset=0) ‑> Token
-
Return a token object based on a regular expression match. This gets called by TokenExtractor. By default, just use the entire matched string.
Expand source code
@classmethod
def from_match(cls, m, extra, offset=0) -> "Token":
    """Return a token object based on a regular expression match.

    This gets called by TokenExtractor. By default, just use the entire
    matched string. The text and span come from group 1 of `m`, the span is
    shifted by `offset`, and `extra` is forwarded as keyword arguments to
    the constructor.
    """
    span_start, span_end = m.span(1)
    # ignore "too many arguments" type error -- this is called
    # by subclasses with additional attributes
    return cls(  # type: ignore[call-arg]
        m[1],
        span_start + offset,
        span_end + offset,
        groups=m.groupdict(),
        **extra,
    )
Methods
def merge(self, other: Token) ‑> Optional[Token]
-
Merge two tokens, by returning self if other is identical to self.
Expand source code
def merge(self, other: "Token") -> Optional["Token"]:
    """Merge two tokens, by returning self if other is identical to self,
    and None otherwise.

    "Identical" means the same start, end, exact type and groups.
    """
    if self.start != other.start or self.end != other.end:
        return None
    if type(self) is not type(other):
        return None
    return self if self.groups == other.groups else None
class TokenExtractor (regex: str, constructor: Callable[..., Token], extra: Dict = <factory>, flags: int = 0, strings: List = <factory>)
-
Class for extracting all matches from a given string for the given regex, and then for returning Token objects for all matches.
Expand source code
@dataclass
class TokenExtractor:
    """Extract all matches of a regex from a string and turn each match
    into a Token object."""

    regex: str
    # constructor should be Callable[[re.Match, dict, int], Token]
    # but this issue makes it inconvenient to specify the input types:
    # https://github.com/python/mypy/issues/5485
    constructor: Callable[..., Token]
    extra: Dict = field(default_factory=dict)
    flags: int = 0
    strings: List = field(default_factory=list)

    def get_matches(self, text):
        """Return match objects for all matches in text."""
        pattern = self.compiled_regex
        return pattern.finditer(text)

    def get_token(self, m, offset=0) -> Token:
        """For a given match object, return a Token."""
        build = self.constructor
        return build(m, self.extra, offset)

    def __hash__(self):
        """This needs to be hashable so we can remove redundant
        extractors returned by the pyahocorasick filter."""
        description = repr(self)
        return hash(description)

    @property
    def compiled_regex(self):
        """Cache compiled regex as a property."""
        try:
            return self._compiled_regex
        except AttributeError:
            # First access: compile once and keep the result on the instance.
            self._compiled_regex = re.compile(self.regex, flags=self.flags)
            return self._compiled_regex
Class variables
var constructor : Callable[..., Token]
var extra : Dict
var flags : int
var regex : str
var strings : List
Instance variables
var compiled_regex
-
Cache compiled regex as a property.
Expand source code
@property
def compiled_regex(self):
    """Cache compiled regex as a property."""
    try:
        return self._compiled_regex
    except AttributeError:
        # First access: compile once and keep the result on the instance.
        self._compiled_regex = re.compile(self.regex, flags=self.flags)
        return self._compiled_regex
Methods
def get_matches(self, text)
-
Return match objects for all matches in text.
Expand source code
def get_matches(self, text):
    """Return match objects for all matches in text."""
    pattern = self.compiled_regex
    return pattern.finditer(text)
def get_token(self, m, offset=0) ‑> Token
-
For a given match object, return a Token.
Expand source code
def get_token(self, m, offset=0) -> Token:
    """For a given match object, return a Token."""
    build = self.constructor
    return build(m, self.extra, offset)
class UnknownCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None)
-
Convenience class which represents an unknown citation. A recognized citation should theoretically be parsed as a CaseCitation, FullLawCitation, or a FullJournalCitation. If it's something else, this class serves as a naive catch-all.
Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class UnknownCitation(CitationBase):
    """Convenience class which represents an unknown citation.

    A recognized citation should theoretically be parsed as a CaseCitation,
    FullLawCitation, or a FullJournalCitation. If it's something else, this
    class serves as a naive catch-all.
    """

    def __hash__(self) -> int:
        """UnknownCitation objects are always considered unique for safety.

        The hash is `id(self)`, i.e. it is based on object identity rather
        than on the citation's contents.
        """
        return id(self)
Ancestors
Inherited members