Module eyecite.models

Expand source code
import re
from collections import UserString
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import (
    Any,
    Callable,
    Dict,
    Hashable,
    List,
    Optional,
    Sequence,
    Tuple,
    Union,
    cast,
)

from eyecite.utils import hash_sha256

# Alias for typing.Hashable; used further down as the base class of Resource.
ResourceType = Hashable


@dataclass(eq=True, frozen=True)
class Reporter:
    """A top-level reporter entry from `reporters_db`, e.g. "S.W.".

    `is_scotus` defaults to False and is switched on right after
    construction when the reporter's type or name marks it as a Supreme
    Court reporter (see `__post_init__`).
    """

    short_name: str
    name: str
    cite_type: str
    source: str  # one of "reporters", "laws", "journals"
    is_scotus: bool = False

    def __post_init__(self):
        """Set `is_scotus` when cite_type/name indicate a SCOTUS reporter."""
        federal_supreme = (
            self.cite_type == "federal" and "supreme" in self.name.lower()
        )
        if federal_supreme or "scotus" in self.cite_type.lower():
            # Instances are frozen, so plain attribute assignment is blocked;
            # go through object.__setattr__ instead.
            object.__setattr__(self, "is_scotus", True)


@dataclass(eq=True, frozen=True)
class Edition:
    """A single edition of a reporter in `reporters_db`, such as "S.W."
    or "S.W.2d".

    `start` and `end` bound the edition's publication period; either may be
    None, which is treated as "unbounded" on that side."""

    reporter: Reporter
    short_name: str
    start: Optional[datetime]
    end: Optional[datetime]

    def includes_year(
        self,
        year: int,
    ) -> bool:
        """Return True if edition contains cases for the given year.

        A year qualifies when it is not later than the current calendar
        year and falls within [start.year, end.year], skipping whichever
        bound is None.
        """
        # Years in the future can never be covered.
        if year > datetime.now().year:
            return False
        if self.start is not None and self.start.year > year:
            return False
        if self.end is not None and self.end.year < year:
            return False
        return True


@dataclass(eq=False, unsafe_hash=False)
class CitationBase:
    """Base class for objects returned by `eyecite.find.get_citations`. We
    define several subclasses of this class below, representing the various
    types of citations that might exist.

    Equality between citations is derived from `__hash__()`: two citations
    are equal when their hashes match. Subclasses tune equivalence by
    overriding `__hash__()` only."""

    token: "Token"  # token this citation came from
    index: int  # index of _token in the token list
    # span() overrides
    span_start: Optional[int] = None
    span_end: Optional[int] = None
    full_span_start: Optional[int] = None
    full_span_end: Optional[int] = None
    groups: dict = field(default_factory=dict)
    metadata: Any = None

    def __post_init__(self):
        """Set up groups and metadata.

        `groups` is taken from the token (the same dict object, not a copy).
        `metadata` becomes a `self.Metadata` instance: built from the given
        dict when one was passed, otherwise created with defaults.
        """
        # Allow groups to be used in comparisons:
        self.groups = self.token.groups
        # Make metadata a self.Metadata object:
        self.metadata = (
            self.Metadata(**self.metadata)
            if isinstance(self.metadata, dict)
            else self.Metadata()
        )
        # Set known missing page numbers to None (a page made up solely of
        # underscores is a placeholder, not a real page)
        if re.search(r"^_+$", self.groups.get("page", "") or ""):
            self.groups["page"] = None

    def __repr__(self):
        """Simplified repr() to be more readable than full dataclass repr().
        Just shows 'FullCaseCitation("matched text", groups=...)'."""
        return (
            f"{self.__class__.__name__}("
            + f"{repr(self.matched_text())}"
            + (f", groups={repr(self.groups)}" if self.groups else "")
            + f", metadata={repr(self.metadata)}"
            + ")"
        )

    def __hash__(self) -> int:
        """In general, citations are considered equivalent if they have the
        same group values (i.e., the same regex group content that is extracted
        from the matched text). Subclasses may override this method in order to
        specify equivalence behavior that is more appropriate for certain
        kinds of citations (e.g., see CaseCitation override).

        self.groups typically contains different keys for different objects:

        FullLawCitation (non-exhaustive and non-guaranteed):
        - chapter
        - reporter
        - law_section
        - issue
        - page
        - docket_number
        - pamphlet
        - title

        FullJournalCitation (non-exhaustive and non-guaranteed):
        - volume
        - reporter
        - page

        FullCaseCitation (see CaseCitation.__hash__() notes)
        """
        return hash(
            hash_sha256(
                {**dict(self.groups.items()), **{"class": type(self).__name__}}
            )
        )

    def __eq__(self, other):
        """This method is inherited by all subclasses and should not be
        overridden. It implements object equality in exactly the same way as
        defined in an object's __hash__() function, which should be overridden
        instead if desired.

        Objects that are not citations are never equal to a citation:
        NotImplemented is returned so Python falls back to its default
        comparison. This avoids a TypeError when `other` is unhashable
        (e.g. a list) and avoids accidental equality with an unrelated
        object whose hash happens to coincide.
        """
        if not isinstance(other, CitationBase):
            return NotImplemented
        return self.__hash__() == other.__hash__()

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata:
        """Define fields on self.metadata."""

        parenthetical: Optional[str] = None

    def corrected_citation(self):
        """Return citation with any variations normalized.

        The base implementation returns `matched_text()` unchanged."""
        return self.matched_text()

    def corrected_citation_full(self):
        """Return citation with any variations normalized, including extracted
        metadata if any.

        The base implementation returns `matched_text()` unchanged."""
        return self.matched_text()

    def dump(self) -> dict:
        """Return citation data for printing by dump_citations.

        The result holds `groups` as-is and the metadata attributes whose
        value is not None.
        """
        return {
            "groups": self.groups,
            "metadata": {
                k: v
                for k, v in self.metadata.__dict__.items()
                if v is not None
            },
        }

    def matched_text(self):
        """Text that identified this citation, such as '1 U.S. 1' or 'Id.'"""
        return str(self.token)

    def span(self):
        """Start and stop offsets in source text for matched_text().

        `span_start` / `span_end` take precedence when set (0 counts as
        set); otherwise the token's own offsets are used.
        """
        return (
            (
                self.span_start
                if self.span_start is not None
                else self.token.start
            ),
            self.span_end if self.span_end is not None else self.token.end,
        )

    def full_span(self) -> Tuple[int, int]:
        """Span indices that fully cover the citation

        Start and stop offsets in source text for full citation text (including
        plaintiff, defendant, post citation, ...)

        Relevant for FullCaseCitation, FullJournalCitation and FullLawCitation.

        :returns: Tuple of start and end indices; each side falls back to
            the matching side of span() when no full-span override is set
        """
        start = self.full_span_start
        if start is None:
            start = self.span()[0]

        end = self.full_span_end
        if end is None:
            end = self.span()[1]

        return start, end


@dataclass(eq=False, unsafe_hash=False, repr=False)
class ResourceCitation(CitationBase):
    """Shared base for citations to a case, law, or journal, in either
    short or long form. Tracks the candidate editions for the cited
    reporter string and a best guess among them."""

    # Editions that might match this reporter string
    exact_editions: Sequence[Edition] = field(default_factory=tuple)
    variation_editions: Sequence[Edition] = field(default_factory=tuple)
    all_editions: Sequence[Edition] = field(default_factory=tuple)
    edition_guess: Optional[Edition] = None

    # year extracted from metadata["year"] and converted to int,
    # if in a valid range
    year: Optional[int] = None

    def __post_init__(self):
        """Freeze the edition sequences into tuples (so they are hashable)
        and rebuild `all_editions` as exact + variation editions."""
        exact = tuple(self.exact_editions)
        variations = tuple(self.variation_editions)
        self.exact_editions = exact
        self.variation_editions = variations
        self.all_editions = exact + variations
        super().__post_init__()

    def __hash__(self) -> int:
        """Hash like the parent class (groups + class name) while also
        folding in `all_editions`, sorted by edition short name so that
        edition order does not affect the result.
        """
        edition_dicts = [asdict(edition) for edition in self.all_editions]
        edition_dicts.sort(key=lambda d: d["short_name"])  # type: ignore
        payload = dict(self.groups.items())
        payload["all_editions"] = edition_dicts
        payload["class"] = type(self).__name__
        return hash(hash_sha256(payload))

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Define fields on self.metadata."""

        pin_cite: Optional[str] = None
        year: Optional[str] = None

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation.

        At this level the only step is choosing `edition_guess`; `words`
        is accepted for subclasses that inspect the surrounding tokens.
        """
        self.guess_edition()

    def dump(self) -> dict:
        """Return citation data for printing by dump_citations, i.e. the
        parent's dump plus the `year` field."""
        dumped = dict(super().dump())
        dumped["year"] = self.year
        return dumped

    def corrected_reporter(self):
        """Get official reporter string from edition_guess, if possible;
        otherwise fall back to the reporter text captured in groups."""
        guess = self.edition_guess
        if guess:
            return guess.short_name
        return self.groups["reporter"]

    def corrected_citation(self):
        """Return the matched text with the captured reporter string swapped
        for the guessed edition's short name; unchanged without a guess."""
        guess = self.edition_guess
        if not guess:
            return self.matched_text()
        return self.matched_text().replace(
            self.groups["reporter"], guess.short_name
        )

    def guess_edition(self):
        """Set edition_guess when exactly one candidate edition remains."""
        # Exact matches win over variations when any exist.
        candidates = self.exact_editions or self.variation_editions
        if not candidates:
            return

        # With several candidates and a known year, keep only the editions
        # that were published in that year.
        if len(candidates) > 1 and self.year:
            candidates = [
                edition
                for edition in candidates
                if edition.includes_year(self.year)
            ]

        if len(candidates) == 1:
            self.edition_guess = candidates[0]


@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullCitation(ResourceCitation):
    """Abstract base class indicating that a citation fully identifies a
    resource.

    Adds no fields or methods of its own; it is the common parent of the
    full law, journal, and case citation classes in this module."""


@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullLawCitation(FullCitation):
    """Citation to a source from `reporters_db/laws.json`."""

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(FullCitation.Metadata):
        """Define fields on self.metadata."""

        publisher: Optional[str] = None
        day: Optional[str] = None
        month: Optional[str] = None

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation.

        Runs the law-specific helper first, then the inherited step.
        """
        # Imported here rather than at module top.
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_law_metadata

        add_law_metadata(self, words)
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return citation with any variations normalized, including extracted
        metadata if any.

        Layout: corrected citation, pin cite (appended with no separator),
        " (publisher month day year)" built from whichever of those are
        present, then " (parenthetical)".
        """
        meta = self.metadata
        published = " ".join(
            filter(None, (meta.publisher, meta.month, meta.day, meta.year))
        )
        # Absent pieces contribute an empty string, so they vanish on join.
        segments = [
            self.corrected_citation(),
            f"{meta.pin_cite}" if meta.pin_cite else "",
            f" ({published})" if published else "",
            f" ({meta.parenthetical})" if meta.parenthetical else "",
        ]
        return "".join(segments)


@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullJournalCitation(FullCitation):
    """Citation to a source from `reporters_db/journals.json`."""

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation.

        Runs the journal-specific helper first, then the inherited step.
        """
        # Imported here rather than at module top.
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_journal_metadata

        add_journal_metadata(self, words)
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return citation with any variations normalized, including extracted
        metadata if any.

        Layout: corrected citation, ", pin cite", " (year)",
        " (parenthetical)"; each optional piece is omitted when empty.
        """
        meta = self.metadata
        # Absent pieces contribute an empty string, so they vanish on join.
        segments = [
            self.corrected_citation(),
            f", {meta.pin_cite}" if meta.pin_cite else "",
            f" ({meta.year})" if meta.year else "",
            f" ({meta.parenthetical})" if meta.parenthetical else "",
        ]
        return "".join(segments)


@dataclass(eq=False, unsafe_hash=False, repr=False)
class CaseCitation(ResourceCitation):
    """Convenience class which represents a single citation found in a
    document.
    """

    def __hash__(self) -> int:
        """Hash on volume, corrected reporter, and page, so citations that
        agree on those are equivalent. A citation whose page is None hashes
        to its own id() instead, keeping it distinct from every other object.

        self.groups for CaseCitation objects usually contains these keys:
        - page (guaranteed here: https://github.com/freelawproject/reporters-db/blob/main/tests.py#L129)  # noqa: E501
        - reporter (guaranteed here: https://github.com/freelawproject/reporters-db/blob/main/tests.py#L129)  # noqa: E501
        - volume (almost always present, but some tax court citations don't have volumes)  # noqa: E501
        - reporter_nominative (sometimes)
        - volumes_nominative (sometimes)
        """
        if self.groups["page"] is None:
            return id(self)

        # Only the keys that are actually present are included.
        payload = {
            key: self.groups[key]
            for key in ("volume", "page")
            if key in self.groups
        }
        payload["reporter"] = self.corrected_reporter()
        payload["class"] = type(self).__name__
        return hash(hash_sha256(payload))

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(FullCitation.Metadata):
        """Define fields on self.metadata."""

        # court is included for ShortCaseCitation as well. It won't appear in
        # the cite itself but can also be guessed from the reporter
        court: Optional[str] = None

    def guess_court(self):
        """Set court to "scotus" when no court is recorded yet and any
        candidate edition belongs to a SCOTUS reporter."""
        if self.metadata.court:
            return
        for edition in self.all_editions:
            if edition.reporter.is_scotus:
                self.metadata.court = "scotus"
                return


@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullCaseCitation(CaseCitation, FullCitation):
    """Convenience class which represents a standard, fully named citation,
    i.e., the kind of citation that marks the first time a document is cited.

    Example:
    ```
    Adarand Constructors, Inc. v. Peña, 515 U.S. 200, 240
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CaseCitation.Metadata):
        """Define fields on self.metadata."""

        plaintiff: Optional[str] = None
        defendant: Optional[str] = None
        extra: Optional[str] = None

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation.

        Order: post-citation helper, defendant helper, court guess, then
        the inherited step.
        """
        # Imported here rather than at module top.
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_defendant, add_post_citation

        add_post_citation(self, words)
        add_defendant(self, words)
        self.guess_court()
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return formatted version of extracted cite.

        Layout: "plaintiff v. ", "defendant, ", corrected citation,
        ", pin cite", extra, " (court year)", " (parenthetical)"; each
        optional piece is omitted when empty.
        """
        meta = self.metadata
        court_year = " ".join(filter(None, (meta.court, meta.year)))
        # Absent pieces contribute an empty string, so they vanish on join.
        segments = [
            f"{meta.plaintiff} v. " if meta.plaintiff else "",
            f"{meta.defendant}, " if meta.defendant else "",
            self.corrected_citation(),
            f", {meta.pin_cite}" if meta.pin_cite else "",
            meta.extra if meta.extra else "",
            f" ({court_year})" if court_year else "",
            f" ({meta.parenthetical})" if meta.parenthetical else "",
        ]
        return "".join(segments)


@dataclass(eq=False, unsafe_hash=False, repr=False)
class ShortCaseCitation(CaseCitation):
    """Convenience class which represents a short form citation, i.e., the kind
    of citation made after a full citation has already appeared. This kind of
    citation lacks a full case name and usually has a different page number
    than the canonical citation.

    Examples:
    ```
    Adarand, 515 U.S., at 241
    Adarand, 515 U.S. at 241
    515 U.S., at 241
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CaseCitation.Metadata):
        """Define fields on self.metadata."""

        antecedent_guess: Optional[str] = None

    def corrected_citation_full(self):
        """Return formatted version of extracted cite: the corrected
        citation, prefixed by "antecedent, " when an antecedent guess
        is available."""
        antecedent = self.metadata.antecedent_guess
        prefix = f"{antecedent}, " if antecedent else ""
        return "".join((prefix, self.corrected_citation()))


@dataclass(eq=False, unsafe_hash=False, repr=False)
class SupraCitation(CitationBase):
    """Convenience class which represents a 'supra' citation, i.e., a citation
    to something that is above in the document. Like a short form citation,
    this kind of citation lacks a full case name and usually has a different
    page number than the canonical citation.


    Examples:
    ```
    Adarand, supra, at 240
    Adarand, 515 supra, at 240
    Adarand, supra, somethingelse
    Adarand, supra. somethingelse
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Define fields on self.metadata."""

        antecedent_guess: Optional[str] = None
        pin_cite: Optional[str] = None
        volume: Optional[str] = None

    def formatted(self):
        """Return formatted version of extracted cite.

        Layout: "antecedent, ", "volume ", the word "supra", ", pin cite";
        each optional piece is omitted when empty.
        """
        meta = self.metadata
        # Absent pieces contribute an empty string, so they vanish on join.
        segments = [
            f"{meta.antecedent_guess}, " if meta.antecedent_guess else "",
            f"{meta.volume} " if meta.volume else "",
            "supra",
            f", {meta.pin_cite}" if meta.pin_cite else "",
        ]
        return "".join(segments)


@dataclass(eq=False, unsafe_hash=False, repr=False)
class IdCitation(CitationBase):
    """Convenience class which represents an 'id' or 'ibid' citation, i.e., a
    citation to the document referenced immediately prior. An 'id' citation is
    unlike a regular citation object since it has no knowledge of its reporter,
    volume, or page. Instead, the only helpful information that this reference
    possesses is a record of the pin cite after the 'id' token.

    Example: "... foo bar," id., at 240
    """

    def __hash__(self) -> int:
        """Hash on object identity, so every IdCitation is treated as
        unique."""
        return id(self)

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Define fields on self.metadata."""

        pin_cite: Optional[str] = None

    def formatted(self):
        """Return formatted version of extracted cite: "id." followed by
        ", pin cite" when a pin cite was captured."""
        pin = self.metadata.pin_cite
        suffix = f", {pin}" if pin else ""
        return "".join(("id.", suffix))


@dataclass(eq=False, unsafe_hash=False, repr=False)
class UnknownCitation(CitationBase):
    """Convenience class which represents an unknown citation. A recognized
    citation should theoretically be parsed as a CaseCitation, FullLawCitation,
    or a FullJournalCitation. If it's something else, this class serves as
    a naive catch-all.
    """

    def __hash__(self) -> int:
        """UnknownCitation objects are always considered unique for safety.

        The hash is the object's id(), so two distinct instances never
        share a hash value.
        """
        return id(self)


@dataclass(eq=True, unsafe_hash=True)
class Token(UserString):
    """Base class for special tokens. Generic words are kept as plain
    strings for performance and never wrapped in this class.

    `groups` is excluded from comparison (and therefore from the generated
    hash); identity of a token is its text plus its start/end offsets."""

    data: str
    start: int
    end: int
    groups: dict = field(default_factory=dict, compare=False)

    @classmethod
    def from_match(cls, m, extra, offset=0) -> "Token":
        """Build a token from a regular expression match.

        Called by TokenExtractor. The token text and span come from capture
        group 1 of `m`; `offset` is added to both span ends; the match's
        named groups become `groups`; `extra` supplies additional keyword
        arguments for subclasses with more fields."""
        begin, finish = m.span(1)
        text = m[1]
        # ignore "too many arguments" type error -- this is called
        # by subclasses with additional attributes
        return cls(  # type: ignore[call-arg]
            text,
            begin + offset,
            finish + offset,
            groups=m.groupdict(),
            **extra,
        )

    def merge(self, other: "Token") -> Optional["Token"]:
        """Return self when `other` has the same span, exact type, and
        groups as self; otherwise return None."""
        if self.start != other.start or self.end != other.end:
            return None
        if type(self) is not type(other):
            return None
        if self.groups == other.groups:
            return self
        return None


# For performance, lists of tokens can include either Token subclasses
# or bare strings (the typical case of words that aren't
# related to citations)
TokenOrStr = Union[Token, str]
Tokens = List[TokenOrStr]


@dataclass(eq=True, unsafe_hash=True)
class CitationToken(Token):
    """String matching a citation regex from `reporters_db/reporters.json`."""

    exact_editions: Sequence[Edition] = field(default_factory=tuple)
    variation_editions: Sequence[Edition] = field(default_factory=tuple)
    short: bool = False

    def __post_init__(self):
        """Make iterables into tuples to make sure we're hashable."""
        self.exact_editions = tuple(self.exact_editions)
        self.variation_editions = tuple(self.variation_editions)

    def merge(self, other: "Token") -> Optional["Token"]:
        """To merge citation tokens, also make sure `short` matches,
        and combine their editions.

        Returns self (with both edition tuples extended by `other`'s
        editions, duplicates removed) when the tokens are mergeable, and
        None otherwise. Duplicates are dropped while keeping first-seen
        order, so the resulting tuples are the same on every run.
        """
        # Compare against None explicitly: Token derives from UserString, so
        # a token wrapping an empty string is falsy even when the base merge
        # succeeded.
        if super().merge(other) is None:
            return None

        other = cast(CitationToken, other)
        if self.short != other.short:
            return None

        # dict.fromkeys de-duplicates while preserving insertion order,
        # unlike set(), whose iteration order is arbitrary.
        self.exact_editions = tuple(
            dict.fromkeys((*self.exact_editions, *other.exact_editions))
        )
        self.variation_editions = tuple(
            dict.fromkeys(
                (*self.variation_editions, *other.variation_editions)
            )
        )
        return self


@dataclass(eq=True, unsafe_hash=True)
class SectionToken(Token):
    """Word containing a section symbol.

    Defines no fields or methods beyond those inherited from `Token`."""


@dataclass(eq=True, unsafe_hash=True)
class SupraToken(Token):
    """Word matching "supra" with or without punctuation.

    Defines no fields or methods beyond those inherited from `Token`."""


@dataclass(eq=True, unsafe_hash=True)
class IdToken(Token):
    """Word matching "id" or "ibid".

    Defines no fields or methods beyond those inherited from `Token`."""


@dataclass(eq=True, unsafe_hash=True)
class ParagraphToken(Token):
    """Word matching a break between paragraphs.

    Defines no fields or methods beyond those inherited from `Token`."""


@dataclass(eq=True, unsafe_hash=True)
class StopWordToken(Token):
    """Word matching one of the STOP_TOKENS.

    Defines no fields or methods beyond those inherited from `Token`."""


@dataclass
class TokenExtractor:
    """Pairs a regex with a token constructor: finds every match of the
    regex in a string and turns match objects into Token objects."""

    regex: str
    # constructor should be Callable[[re.Match, dict, int], Token]
    # but this issue makes it inconvenient to specify the input types:
    # https://github.com/python/mypy/issues/5485
    constructor: Callable[..., Token]
    extra: Dict = field(default_factory=dict)
    flags: int = 0
    strings: List = field(default_factory=list)

    def get_matches(self, text):
        """Return an iterator of match objects for all matches in text."""
        pattern = self.compiled_regex
        return pattern.finditer(text)

    def get_token(self, m, offset=0) -> Token:
        """Turn match object `m` into a Token by calling the constructor
        with the match, this extractor's `extra` dict, and `offset`."""
        build = self.constructor
        return build(m, self.extra, offset)

    def __hash__(self):
        """Hash on repr(self). Extractors must be hashable so redundant
        ones returned by the pyahocorasick filter can be removed."""
        return hash(repr(self))

    @property
    def compiled_regex(self):
        """Compiled form of `regex`, built on first access and then reused
        from the instance attribute `_compiled_regex`."""
        cached = getattr(self, "_compiled_regex", None)
        if cached is None:
            cached = re.compile(self.regex, flags=self.flags)
            self._compiled_regex = cached
        return cached


@dataclass(frozen=True)
class Resource(ResourceType):
    """Thin resource class representing an object to which a citation can be
    resolved. See `eyecite.resolve` for more details."""

    citation: FullCitation

    def __hash__(self):
        """Resources are the same if their citations are semantically
        equivalent, as defined by their hash function.

        Note: Resources composed of citations with missing page numbers are
        NOT considered the same, even if their other attributes are identical.
        This is to avoid potential false positives.
        """
        return hash(
            hash_sha256(
                {
                    "citation": hash(self.citation),
                    "class": type(self).__name__,
                }
            )
        )

    def __eq__(self, other):
        """Two resources are equal when their hashes are equal.

        Anything that is not a Resource is never equal: NotImplemented is
        returned so Python falls back to its default comparison. This
        avoids a TypeError when `other` is unhashable (e.g. a list) and
        avoids accidental equality with an unrelated object whose hash
        happens to coincide.
        """
        if not isinstance(other, Resource):
            return NotImplemented
        return self.__hash__() == other.__hash__()

Classes

class CaseCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, all_editions: Sequence[Edition] = <factory>, edition_guess: Optional[Edition] = None, year: Optional[int] = None)

Convenience class which represents a single citation found in a document.

Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class CaseCitation(ResourceCitation):
    """Convenience class which represents a single citation found in a
    document.
    """

    def __hash__(self) -> int:
        """Hash on volume, corrected reporter, and page, so citations that
        agree on those are equivalent. A citation whose page is None hashes
        to its own id() instead, keeping it distinct from every other object.

        self.groups for CaseCitation objects usually contains these keys:
        - page (guaranteed here: https://github.com/freelawproject/reporters-db/blob/main/tests.py#L129)  # noqa: E501
        - reporter (guaranteed here: https://github.com/freelawproject/reporters-db/blob/main/tests.py#L129)  # noqa: E501
        - volume (almost always present, but some tax court citations don't have volumes)  # noqa: E501
        - reporter_nominative (sometimes)
        - volumes_nominative (sometimes)
        """
        if self.groups["page"] is None:
            return id(self)

        # Only the keys that are actually present are included.
        payload = {
            key: self.groups[key]
            for key in ("volume", "page")
            if key in self.groups
        }
        payload["reporter"] = self.corrected_reporter()
        payload["class"] = type(self).__name__
        return hash(hash_sha256(payload))

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(FullCitation.Metadata):
        """Define fields on self.metadata."""

        # court is included for ShortCaseCitation as well. It won't appear in
        # the cite itself but can also be guessed from the reporter
        court: Optional[str] = None

    def guess_court(self):
        """Set court to "scotus" when no court is recorded yet and any
        candidate edition belongs to a SCOTUS reporter."""
        if self.metadata.court:
            return
        for edition in self.all_editions:
            if edition.reporter.is_scotus:
                self.metadata.court = "scotus"
                return

Ancestors

Subclasses

Methods

def guess_court(self)

Set court based on reporter.

Expand source code
def guess_court(self):
    """Set court to "scotus" when no court is recorded yet and any
    candidate edition belongs to a SCOTUS reporter."""
    if self.metadata.court:
        return
    for edition in self.all_editions:
        if edition.reporter.is_scotus:
            self.metadata.court = "scotus"
            return

Inherited members

class CitationBase (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None)

Base class for objects returned by get_citations(). We define several subclasses of this class below, representing the various types of citations that might exist.

Expand source code
@dataclass(eq=False, unsafe_hash=False)
class CitationBase:
    """Base class for objects returned by `eyecite.find.get_citations`. We
    define several subclasses of this class below, representing the various
    types of citations that might exist.

    Equality between citations is derived from `__hash__()`: two citations
    are equal when their hashes match. Subclasses tune equivalence by
    overriding `__hash__()` only."""

    token: "Token"  # token this citation came from
    index: int  # index of _token in the token list
    # span() overrides
    span_start: Optional[int] = None
    span_end: Optional[int] = None
    full_span_start: Optional[int] = None
    full_span_end: Optional[int] = None
    groups: dict = field(default_factory=dict)
    metadata: Any = None

    def __post_init__(self):
        """Set up groups and metadata.

        `groups` is taken from the token (the same dict object, not a copy).
        `metadata` becomes a `self.Metadata` instance: built from the given
        dict when one was passed, otherwise created with defaults.
        """
        # Allow groups to be used in comparisons:
        self.groups = self.token.groups
        # Make metadata a self.Metadata object:
        self.metadata = (
            self.Metadata(**self.metadata)
            if isinstance(self.metadata, dict)
            else self.Metadata()
        )
        # Set known missing page numbers to None (a page made up solely of
        # underscores is a placeholder, not a real page)
        if re.search(r"^_+$", self.groups.get("page", "") or ""):
            self.groups["page"] = None

    def __repr__(self):
        """Simplified repr() to be more readable than full dataclass repr().
        Just shows 'FullCaseCitation("matched text", groups=...)'."""
        return (
            f"{self.__class__.__name__}("
            + f"{repr(self.matched_text())}"
            + (f", groups={repr(self.groups)}" if self.groups else "")
            + f", metadata={repr(self.metadata)}"
            + ")"
        )

    def __hash__(self) -> int:
        """In general, citations are considered equivalent if they have the
        same group values (i.e., the same regex group content that is extracted
        from the matched text). Subclasses may override this method in order to
        specify equivalence behavior that is more appropriate for certain
        kinds of citations (e.g., see CaseCitation override).

        self.groups typically contains different keys for different objects:

        FullLawCitation (non-exhaustive and non-guaranteed):
        - chapter
        - reporter
        - law_section
        - issue
        - page
        - docket_number
        - pamphlet
        - title

        FullJournalCitation (non-exhaustive and non-guaranteed):
        - volume
        - reporter
        - page

        FullCaseCitation (see CaseCitation.__hash__() notes)
        """
        return hash(
            hash_sha256(
                {**dict(self.groups.items()), **{"class": type(self).__name__}}
            )
        )

    def __eq__(self, other):
        """This method is inherited by all subclasses and should not be
        overridden. It implements object equality in exactly the same way as
        defined in an object's __hash__() function, which should be overridden
        instead if desired.

        Objects that are not citations are never equal to a citation:
        NotImplemented is returned so Python falls back to its default
        comparison. This avoids a TypeError when `other` is unhashable
        (e.g. a list) and avoids accidental equality with an unrelated
        object whose hash happens to coincide.
        """
        if not isinstance(other, CitationBase):
            return NotImplemented
        return self.__hash__() == other.__hash__()

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata:
        """Define fields on self.metadata."""

        parenthetical: Optional[str] = None

    def corrected_citation(self):
        """Return citation with any variations normalized.

        The base implementation returns `matched_text()` unchanged."""
        return self.matched_text()

    def corrected_citation_full(self):
        """Return citation with any variations normalized, including extracted
        metadata if any.

        The base implementation returns `matched_text()` unchanged."""
        return self.matched_text()

    def dump(self) -> dict:
        """Return citation data for printing by dump_citations.

        The result holds `groups` as-is and the metadata attributes whose
        value is not None.
        """
        return {
            "groups": self.groups,
            "metadata": {
                k: v
                for k, v in self.metadata.__dict__.items()
                if v is not None
            },
        }

    def matched_text(self):
        """Text that identified this citation, such as '1 U.S. 1' or 'Id.'"""
        return str(self.token)

    def span(self):
        """Start and stop offsets in source text for matched_text().

        `span_start` / `span_end` take precedence when set (0 counts as
        set); otherwise the token's own offsets are used.
        """
        return (
            (
                self.span_start
                if self.span_start is not None
                else self.token.start
            ),
            self.span_end if self.span_end is not None else self.token.end,
        )

    def full_span(self) -> Tuple[int, int]:
        """Span indices that fully cover the citation

        Start and stop offsets in source text for full citation text (including
        plaintiff, defendant, post citation, ...)

        Relevant for FullCaseCitation, FullJournalCitation and FullLawCitation.

        :returns: Tuple of start and end indices; each side falls back to
            the matching side of span() when no full-span override is set
        """
        start = self.full_span_start
        if start is None:
            start = self.span()[0]

        end = self.full_span_end
        if end is None:
            end = self.span()[1]

        return start, end

Subclasses

Class variables

var Metadata

Define fields on self.metadata.

var full_span_end : Optional[int]
var full_span_start : Optional[int]
var groups : dict
var index : int
var metadata : Any
var span_end : Optional[int]
var span_start : Optional[int]
var token : Token

Methods

def corrected_citation(self)

Return citation with any variations normalized.

Expand source code
def corrected_citation(self):
    """Return the normalized form of this citation.

    At this level no normalization is applied: the result is exactly
    what `matched_text()` returns.
    """
    text = self.matched_text()
    return text
def corrected_citation_full(self)

Return citation with any variations normalized, including extracted metadata if any.

Expand source code
def corrected_citation_full(self):
    """Return the normalized citation together with any extracted
    metadata.

    At this level there is nothing to add, so the result is exactly
    what `matched_text()` returns.
    """
    text = self.matched_text()
    return text
def dump(self) ‑> dict

Return citation data for printing by dump_citations.

Expand source code
def dump(self) -> dict:
    """Return the citation data that dump_citations prints.

    The result holds the regex `groups` and every metadata attribute
    whose value is not None.
    """
    groups = self.groups
    populated = {}
    for key, value in self.metadata.__dict__.items():
        # Unset metadata fields are left out of the dump entirely.
        if value is not None:
            populated[key] = value
    return {"groups": groups, "metadata": populated}
def full_span(self) ‑> Tuple[int, int]

Span indices that fully cover the citation

Start and stop offsets in source text for full citation text (including plaintiff, defendant, post citation, …)

Relevant for FullCaseCitation, FullJournalCitation and FullLawCitation.

:returns: Tuple of start and end indices

Expand source code
def full_span(self) -> Tuple[int, int]:
    """Span indices that fully cover the citation

    Start and stop offsets in source text for the full citation text
    (including plaintiff, defendant, post citation, ...). Either end
    falls back to the matching end of `span()` when no override is set.

    Relevant for FullCaseCitation, FullJournalCitation and FullLawCitation.

    :returns: Tuple of start and end indices
    """
    begin = self.full_span_start
    if begin is None:
        begin = self.span()[0]

    finish = self.full_span_end
    if finish is None:
        finish = self.span()[1]

    return begin, finish
def matched_text(self)

Text that identified this citation, such as '1 U.S. 1' or 'Id.'

Expand source code
def matched_text(self):
    """Return the text that identified this citation, such as
    '1 U.S. 1' or 'Id.' -- i.e. the string form of the token."""
    source_token = self.token
    return str(source_token)
def span(self)

Start and stop offsets in source text for matched_text().

Expand source code
def span(self):
    """Return the (start, stop) offsets in source text for
    matched_text().

    `span_start` / `span_end` win when they are set; otherwise the
    token's own offsets are used.
    """
    begin = self.span_start
    if begin is None:
        begin = self.token.start

    finish = self.span_end
    if finish is None:
        finish = self.token.end

    return (begin, finish)
class CitationToken (data: str, start: int, end: int, groups: dict = <factory>, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, short: bool = False)

String matching a citation regex from reporters_db/reporters.json.

Expand source code
@dataclass(eq=True, unsafe_hash=True)
class CitationToken(Token):
    """String matching a citation regex from `reporters_db/reporters.json`.

    On top of the base token fields it carries the editions the matched
    reporter string may refer to, and whether the match is a short-form
    citation.
    """

    exact_editions: Sequence[Edition] = field(default_factory=tuple)
    variation_editions: Sequence[Edition] = field(default_factory=tuple)
    short: bool = False

    def __post_init__(self):
        """Make iterables into tuples to make sure we're hashable."""
        self.exact_editions = tuple(self.exact_editions)
        self.variation_editions = tuple(self.variation_editions)

    def merge(self, other: "Token") -> Optional["Token"]:
        """To merge citation tokens, also make sure `short` matches,
        and combine their editions.

        :param other: token to merge into this one
        :returns: `self`, with the editions of both tokens combined, when
            the base-class merge succeeds and `short` agrees; otherwise
            None (and `self` is left untouched).
        """
        merged = super().merge(other)
        # Compare against None explicitly: tokens are UserString instances,
        # so a plain truth test would depend on the length of the text.
        if merged is not None:
            other = cast(CitationToken, other)
            if self.short == other.short:
                # dict.fromkeys() drops duplicate editions while keeping
                # first-seen order, so the merged tuples are the same on
                # every run (set() ordering varies with hash seeds).
                self.exact_editions = tuple(
                    dict.fromkeys(
                        (*self.exact_editions, *other.exact_editions)
                    )
                )
                self.variation_editions = tuple(
                    dict.fromkeys(
                        (*self.variation_editions, *other.variation_editions)
                    )
                )
                return self
        return None

Ancestors

  • Token
  • collections.UserString
  • collections.abc.Sequence
  • collections.abc.Reversible
  • collections.abc.Collection
  • collections.abc.Sized
  • collections.abc.Iterable
  • collections.abc.Container

Class variables

var exact_editions : Sequence[Edition]
var short : bool
var variation_editions : Sequence[Edition]

Methods

def merge(self, other: Token) ‑> Optional[Token]

To merge citation tokens, also make sure short matches, and combine their editions.

Expand source code
def merge(self, other: "Token") -> Optional["Token"]:
    """To merge citation tokens, also make sure `short` matches,
    and combine their editions.

    :param other: token to merge into this one
    :returns: `self`, with the editions of both tokens combined, when
        the base-class merge succeeds and `short` agrees; otherwise
        None (and `self` is left untouched).
    """
    merged = super().merge(other)
    # Compare against None explicitly: tokens are UserString instances,
    # so a plain truth test would depend on the length of the text.
    if merged is not None:
        other = cast(CitationToken, other)
        if self.short == other.short:
            # dict.fromkeys() drops duplicate editions while keeping
            # first-seen order, so the merged tuples are the same on
            # every run (set() ordering varies with hash seeds).
            self.exact_editions = tuple(
                dict.fromkeys((*self.exact_editions, *other.exact_editions))
            )
            self.variation_editions = tuple(
                dict.fromkeys(
                    (*self.variation_editions, *other.variation_editions)
                )
            )
            return self
    return None

Inherited members

class Edition (reporter: Reporter, short_name: str, start: Optional[datetime.datetime], end: Optional[datetime.datetime])

Class for individual editions in reporters_db, like "S.W." and "S.W.2d".

Expand source code
@dataclass(eq=True, frozen=True)
class Edition:
    """A single edition of a reporter in `reporters_db`,
    like "S.W." and "S.W.2d"."""

    reporter: Reporter
    short_name: str
    start: Optional[datetime]
    end: Optional[datetime]

    def includes_year(
        self,
        year: int,
    ) -> bool:
        """Return True if edition contains cases for the given year.

        A missing `start` or `end` leaves that side of the range open;
        years later than the current year are never included.
        """
        if not year <= datetime.now().year:
            return False
        if self.start is not None and not self.start.year <= year:
            return False
        if self.end is not None and not self.end.year >= year:
            return False
        return True

Class variables

var end : Optional[datetime.datetime]
var reporter : Reporter
var short_name : str
var start : Optional[datetime.datetime]

Methods

def includes_year(self, year: int) ‑> bool

Return True if edition contains cases for the given year.

Expand source code
def includes_year(
    self,
    year: int,
) -> bool:
    """Return True if edition contains cases for the given year.

    A missing `start` or `end` leaves that side of the range open;
    years later than the current year are never included.
    """
    if not year <= datetime.now().year:
        return False
    if self.start is not None and not self.start.year <= year:
        return False
    if self.end is not None and not self.end.year >= year:
        return False
    return True
class FullCaseCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, all_editions: Sequence[Edition] = <factory>, edition_guess: Optional[Edition] = None, year: Optional[int] = None)

Convenience class which represents a standard, fully named citation, i.e., the kind of citation that marks the first time a document is cited.

Example:

Adarand Constructors, Inc. v. Peña, 515 U.S. 200, 240
Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullCaseCitation(CaseCitation, FullCitation):
    """A standard, fully named case citation, i.e., the kind of citation
    that marks the first time a document is cited.

    Example:
    ```
    Adarand Constructors, Inc. v. Peña, 515 U.S. 200, 240
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CaseCitation.Metadata):
        """Fields available on self.metadata for a full case citation."""

        plaintiff: Optional[str] = None
        defendant: Optional[str] = None
        extra: Optional[str] = None

    def add_metadata(self, words: "Tokens"):
        """Fill in metadata from the text before and after the citation."""
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_defendant, add_post_citation

        add_post_citation(self, words)
        add_defendant(self, words)
        self.guess_court()
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return the citation rebuilt from its extracted parts.

        Each metadata field contributes a fragment only when it is set;
        the corrected citation itself is always present.
        """
        meta = self.metadata
        leading = [
            f"{meta.plaintiff} v. " if meta.plaintiff else "",
            f"{meta.defendant}, " if meta.defendant else "",
            self.corrected_citation(),
            f", {meta.pin_cite}" if meta.pin_cite else "",
            meta.extra if meta.extra else "",
        ]
        # Court and year share one parenthetical, separated by a space.
        court_and_year = " ".join(filter(None, (meta.court, meta.year)))
        trailing = [
            f" ({court_and_year})" if court_and_year else "",
            f" ({meta.parenthetical})" if meta.parenthetical else "",
        ]
        return "".join(leading + trailing)

Ancestors

Methods

def corrected_citation_full(self)

Return formatted version of extracted cite.

Expand source code
def corrected_citation_full(self):
    """Return the citation rebuilt from its extracted parts.

    Each metadata field contributes a fragment only when it is set;
    the corrected citation itself is always present.
    """
    meta = self.metadata
    leading = [
        f"{meta.plaintiff} v. " if meta.plaintiff else "",
        f"{meta.defendant}, " if meta.defendant else "",
        self.corrected_citation(),
        f", {meta.pin_cite}" if meta.pin_cite else "",
        meta.extra if meta.extra else "",
    ]
    # Court and year share one parenthetical, separated by a space.
    court_and_year = " ".join(filter(None, (meta.court, meta.year)))
    trailing = [
        f" ({court_and_year})" if court_and_year else "",
        f" ({meta.parenthetical})" if meta.parenthetical else "",
    ]
    return "".join(leading + trailing)

Inherited members

class FullCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, all_editions: Sequence[Edition] = <factory>, edition_guess: Optional[Edition] = None, year: Optional[int] = None)

Abstract base class indicating that a citation fully identifies a resource.

Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullCitation(ResourceCitation):
    """Abstract base class indicating that a citation fully identifies a
    resource.

    Declares no fields or methods of its own; FullCaseCitation,
    FullJournalCitation and FullLawCitation all derive from it.
    """

Ancestors

Subclasses

Inherited members

class FullJournalCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, all_editions: Sequence[Edition] = <factory>, edition_guess: Optional[Edition] = None, year: Optional[int] = None)

Citation to a source from reporters_db/journals.json.

Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullJournalCitation(FullCitation):
    """Citation to a source from `reporters_db/journals.json`."""

    def add_metadata(self, words: "Tokens"):
        """Fill in metadata from the text before and after the citation."""
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_journal_metadata

        add_journal_metadata(self, words)
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return the corrected citation followed by whichever of pin
        cite, year and parenthetical were extracted."""
        citation = self.corrected_citation()
        meta = self.metadata
        suffixes = (
            f", {meta.pin_cite}" if meta.pin_cite else "",
            f" ({meta.year})" if meta.year else "",
            f" ({meta.parenthetical})" if meta.parenthetical else "",
        )
        return "".join((citation, *suffixes))

Ancestors

Inherited members

class FullLawCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, all_editions: Sequence[Edition] = <factory>, edition_guess: Optional[Edition] = None, year: Optional[int] = None)

Citation to a source from reporters_db/laws.json.

Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullLawCitation(FullCitation):
    """Citation to a source from `reporters_db/laws.json`."""

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(FullCitation.Metadata):
        """Fields available on self.metadata for a law citation."""

        publisher: Optional[str] = None
        day: Optional[str] = None
        month: Optional[str] = None

    def add_metadata(self, words: "Tokens"):
        """Fill in metadata from the text before and after the citation."""
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_law_metadata

        add_law_metadata(self, words)
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return the corrected citation followed by whichever of pin
        cite, publisher/date and parenthetical were extracted."""
        citation = self.corrected_citation()
        meta = self.metadata
        # The pin cite is appended as-is, with no separator of its own.
        pin = f"{meta.pin_cite}" if meta.pin_cite else ""
        publisher_and_date = " ".join(
            filter(None, (meta.publisher, meta.month, meta.day, meta.year))
        )
        dated = f" ({publisher_and_date})" if publisher_and_date else ""
        note = f" ({meta.parenthetical})" if meta.parenthetical else ""
        return "".join((citation, pin, dated, note))

Ancestors

Inherited members

class IdCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None)

Convenience class which represents an 'id' or 'ibid' citation, i.e., a citation to the document referenced immediately prior. An 'id' citation is unlike a regular citation object since it has no knowledge of its reporter, volume, or page. Instead, the only helpful information that this reference possesses is a record of the pin cite after the 'id' token.

Example: "… foo bar," id., at 240

Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class IdCitation(CitationBase):
    """An 'id' or 'ibid' citation, i.e., a citation to the document
    referenced immediately prior. Unlike a regular citation object it has
    no knowledge of its reporter, volume, or page; the only helpful
    information it carries is a record of the pin cite after the 'id'
    token.

    Example: "... foo bar," id., at 240
    """

    def __hash__(self) -> int:
        """Hash by object identity: IdCitation objects are always
        considered unique for safety."""
        return id(self)

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Fields available on self.metadata for an id citation."""

        pin_cite: Optional[str] = None

    def formatted(self):
        """Return "id." followed by the pin cite, when there is one."""
        pin = self.metadata.pin_cite
        return f"id., {pin}" if pin else "id."

Ancestors

Methods

def formatted(self)

Return formatted version of extracted cite.

Expand source code
def formatted(self):
    """Return "id." followed by the pin cite, when there is one."""
    pin = self.metadata.pin_cite
    return f"id., {pin}" if pin else "id."

Inherited members

class IdToken (data: str, start: int, end: int, groups: dict = <factory>)

Word matching "id" or "ibid".

Expand source code
@dataclass(eq=True, unsafe_hash=True)
class IdToken(Token):
    """Word matching "id" or "ibid".

    Adds no fields or methods of its own; everything comes from `Token`.
    """

Ancestors

  • Token
  • collections.UserString
  • collections.abc.Sequence
  • collections.abc.Reversible
  • collections.abc.Collection
  • collections.abc.Sized
  • collections.abc.Iterable
  • collections.abc.Container

Inherited members

class ParagraphToken (data: str, start: int, end: int, groups: dict = <factory>)

Word matching a break between paragraphs.

Expand source code
@dataclass(eq=True, unsafe_hash=True)
class ParagraphToken(Token):
    """Word matching a break between paragraphs.

    Adds no fields or methods of its own; everything comes from `Token`.
    """

Ancestors

  • Token
  • collections.UserString
  • collections.abc.Sequence
  • collections.abc.Reversible
  • collections.abc.Collection
  • collections.abc.Sized
  • collections.abc.Iterable
  • collections.abc.Container

Inherited members

class Reporter (short_name: str, name: str, cite_type: str, source: str, is_scotus: bool = False)

Class for top-level reporters in reporters_db, like "S.W."

Expand source code
@dataclass(eq=True, frozen=True)
class Reporter:
    """A top-level reporter in `reporters_db`, like "S.W." """

    short_name: str
    name: str
    cite_type: str
    source: str  # one of "reporters", "laws", "journals"
    is_scotus: bool = False

    def __post_init__(self):
        """Switch `is_scotus` on for federal reporters whose name contains
        "supreme" and for any cite_type containing "scotus"."""
        looks_like_scotus = (
            self.cite_type == "federal" and "supreme" in self.name.lower()
        ) or "scotus" in self.cite_type.lower()
        if looks_like_scotus:
            # The dataclass is frozen, so normal attribute assignment is
            # blocked; go through object.__setattr__ instead.
            object.__setattr__(self, "is_scotus", True)

Class variables

var cite_type : str
var is_scotus : bool
var name : str
var short_name : str
var source : str
class Resource (citation: FullCitation)

Thin resource class representing an object to which a citation can be resolved. See eyecite.resolve for more details.

Expand source code
@dataclass(frozen=True)
class Resource(ResourceType):
    """Thin wrapper representing an object to which a citation can be
    resolved. See `eyecite.resolve` for more details."""

    citation: FullCitation

    def __hash__(self):
        """Hash the resource from the hash of its citation plus its own
        class name, so resources are the same if their citations are
        semantically equivalent, as defined by their hash function.

        Note: Resources composed of citations with missing page numbers are
        NOT considered the same, even if their other attributes are identical.
        This is to avoid potential false positives.
        """
        fingerprint = {
            "citation": hash(self.citation),
            "class": type(self).__name__,
        }
        return hash(hash_sha256(fingerprint))

    def __eq__(self, other):
        """Two resources are equal exactly when their hashes are equal."""
        mine = self.__hash__()
        theirs = other.__hash__()
        return mine == theirs

Ancestors

  • collections.abc.Hashable
  • typing.Generic

Class variables

var citation : FullCitation
class ResourceCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, all_editions: Sequence[Edition] = <factory>, edition_guess: Optional[Edition] = None, year: Optional[int] = None)

Base class for a case, law, or journal citation. Could be short or long.

Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class ResourceCitation(CitationBase):
    """Common parent for case, law, and journal citations. Could be short
    or long."""

    # Candidate editions for the reporter string of this citation
    exact_editions: Sequence[Edition] = field(default_factory=tuple)
    variation_editions: Sequence[Edition] = field(default_factory=tuple)
    all_editions: Sequence[Edition] = field(default_factory=tuple)
    edition_guess: Optional[Edition] = None

    # year extracted from metadata["year"] and converted to int,
    # if in a valid range
    year: Optional[int] = None

    def __post_init__(self):
        """Turn the edition sequences into tuples so they stay hashable,
        and rebuild all_editions as exact editions followed by variation
        editions."""
        self.exact_editions = exact = tuple(self.exact_editions)
        self.variation_editions = variations = tuple(self.variation_editions)
        self.all_editions = exact + variations
        super().__post_init__()

    def __hash__(self) -> int:
        """Hash the citation from its regex groups, its class name and its
        all_editions field (ordered by short name so that edition order
        does not affect the result).

        ResourceCitation objects are hashed in the same way as their
        parent class (CitationBase) objects, except that we also take into
        consideration the all_editions field.
        """
        payload = dict(self.groups.items())
        payload["all_editions"] = sorted(
            (asdict(edition) for edition in self.all_editions),
            key=lambda d: d["short_name"],  # type: ignore
        )
        payload["class"] = type(self).__name__
        return hash(hash_sha256(payload))

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Fields available on self.metadata for a resource citation."""

        pin_cite: Optional[str] = None
        year: Optional[str] = None

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation.

        At this level that only means guessing the edition; `words` is
        not consulted here.
        """
        self.guess_edition()

    def dump(self) -> dict:
        """Return the parent class's dump data with `year` added."""
        data = dict(super().dump())
        data["year"] = self.year
        return data

    def corrected_reporter(self):
        """Return the short name of edition_guess when there is one,
        otherwise the reporter string as matched."""
        if self.edition_guess:
            return self.edition_guess.short_name
        return self.groups["reporter"]

    def corrected_citation(self):
        """Return the matched text with the reporter string swapped for
        the guessed edition's short name, when a guess exists."""
        guess = self.edition_guess
        if not guess:
            return self.matched_text()
        return self.matched_text().replace(
            self.groups["reporter"], guess.short_name
        )

    def guess_edition(self):
        """Set edition_guess when exactly one edition is plausible."""
        # Exact reporter matches take priority over variation matches.
        candidates = self.exact_editions or self.variation_editions
        if not candidates:
            return

        # Several candidates: narrow them down by citation year, if known.
        if len(candidates) > 1 and self.year:
            candidates = list(
                filter(lambda ed: ed.includes_year(self.year), candidates)
            )

        # Only commit to a guess when it is unambiguous.
        if len(candidates) == 1:
            self.edition_guess = candidates[0]

Ancestors

Subclasses

Class variables

var all_editions : Sequence[Edition]
var edition_guess : Optional[Edition]
var exact_editions : Sequence[Edition]
var variation_editions : Sequence[Edition]
var year : Optional[int]

Methods

def add_metadata(self, words: Tokens)

Extract metadata from text before and after citation.

Expand source code
def add_metadata(self, words: "Tokens"):
    """Extract metadata from text before and after citation.

    At this level that only means guessing the edition; `words` is
    not consulted here.
    """
    self.guess_edition()
    return None
def corrected_citation(self)

Return citation with corrected reporter.

Expand source code
def corrected_citation(self):
    """Return the matched text with the reporter string swapped for
    the guessed edition's short name, when a guess exists."""
    guess = self.edition_guess
    if not guess:
        return self.matched_text()
    return self.matched_text().replace(
        self.groups["reporter"], guess.short_name
    )
def corrected_reporter(self)

Get official reporter string from edition_guess, if possible.

Expand source code
def corrected_reporter(self):
    """Return the short name of edition_guess when there is one,
    otherwise the reporter string as matched."""
    if self.edition_guess:
        return self.edition_guess.short_name
    return self.groups["reporter"]
def guess_edition(self)

Set edition_guess.

Expand source code
def guess_edition(self):
    """Set edition_guess when exactly one edition is plausible."""
    # Exact reporter matches take priority over variation matches.
    candidates = self.exact_editions or self.variation_editions
    if not candidates:
        return

    # Several candidates: narrow them down by citation year, if known.
    if len(candidates) > 1 and self.year:
        candidates = list(
            filter(lambda ed: ed.includes_year(self.year), candidates)
        )

    # Only commit to a guess when it is unambiguous.
    if len(candidates) == 1:
        self.edition_guess = candidates[0]

Inherited members

class SectionToken (data: str, start: int, end: int, groups: dict = <factory>)

Word containing a section symbol.

Expand source code
@dataclass(eq=True, unsafe_hash=True)
class SectionToken(Token):
    """Word containing a section symbol.

    Adds no fields or methods of its own; everything comes from `Token`.
    """

Ancestors

  • Token
  • collections.UserString
  • collections.abc.Sequence
  • collections.abc.Reversible
  • collections.abc.Collection
  • collections.abc.Sized
  • collections.abc.Iterable
  • collections.abc.Container

Inherited members

class ShortCaseCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None, exact_editions: Sequence[Edition] = <factory>, variation_editions: Sequence[Edition] = <factory>, all_editions: Sequence[Edition] = <factory>, edition_guess: Optional[Edition] = None, year: Optional[int] = None)

Convenience class which represents a short form citation, i.e., the kind of citation made after a full citation has already appeared. This kind of citation lacks a full case name and usually has a different page number than the canonical citation.

Examples:

Adarand, 515 U.S., at 241
Adarand, 515 U.S. at 241
515 U.S., at 241
Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class ShortCaseCitation(CaseCitation):
    """A short form citation, i.e., the kind of citation made after a full
    citation has already appeared. This kind of citation lacks a full case
    name and usually has a different page number than the canonical
    citation.

    Examples:
    ```
    Adarand, 515 U.S., at 241
    Adarand, 515 U.S. at 241
    515 U.S., at 241
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CaseCitation.Metadata):
        """Fields available on self.metadata for a short case citation."""

        antecedent_guess: Optional[str] = None

    def corrected_citation_full(self):
        """Return the corrected citation, prefixed with the antecedent
        guess and a comma when one was extracted."""
        antecedent = self.metadata.antecedent_guess
        prefix = f"{antecedent}, " if antecedent else ""
        return "".join((prefix, self.corrected_citation()))

Ancestors

Methods

def corrected_citation_full(self)

Return formatted version of extracted cite.

Expand source code
def corrected_citation_full(self):
    """Return the corrected citation, prefixed with the antecedent
    guess and a comma when one was extracted."""
    antecedent = self.metadata.antecedent_guess
    prefix = f"{antecedent}, " if antecedent else ""
    return "".join((prefix, self.corrected_citation()))

Inherited members

class StopWordToken (data: str, start: int, end: int, groups: dict = <factory>)

Word matching one of the STOP_TOKENS.

Expand source code
@dataclass(eq=True, unsafe_hash=True)
class StopWordToken(Token):
    """Word matching one of the STOP_TOKENS.

    Adds no fields or methods of its own; everything comes from `Token`.
    """

Ancestors

  • Token
  • collections.UserString
  • collections.abc.Sequence
  • collections.abc.Reversible
  • collections.abc.Collection
  • collections.abc.Sized
  • collections.abc.Iterable
  • collections.abc.Container

Inherited members

class SupraCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None)

Convenience class which represents a 'supra' citation, i.e., a citation to something that is above in the document. Like a short form citation, this kind of citation lacks a full case name and usually has a different page number than the canonical citation.

Examples:

Adarand, supra, at 240
Adarand, 515 supra, at 240
Adarand, supra, somethingelse
Adarand, supra. somethingelse
Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class SupraCitation(CitationBase):
    """A 'supra' citation, i.e., a citation to something that is above in
    the document. Like a short form citation, this kind of citation lacks
    a full case name and usually has a different page number than the
    canonical citation.


    Examples:
    ```
    Adarand, supra, at 240
    Adarand, 515 supra, at 240
    Adarand, supra, somethingelse
    Adarand, supra. somethingelse
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Fields available on self.metadata for a supra citation."""

        antecedent_guess: Optional[str] = None
        pin_cite: Optional[str] = None
        volume: Optional[str] = None

    def formatted(self):
        """Return "supra" surrounded by whichever of antecedent guess,
        volume and pin cite were extracted."""
        meta = self.metadata
        antecedent = (
            f"{meta.antecedent_guess}, " if meta.antecedent_guess else ""
        )
        volume = f"{meta.volume} " if meta.volume else ""
        pin = f", {meta.pin_cite}" if meta.pin_cite else ""
        return "".join((antecedent, volume, "supra", pin))

Ancestors

Methods

def formatted(self)

Return formatted version of extracted cite.

Expand source code
def formatted(self):
    """Return "supra" surrounded by whichever of antecedent guess,
    volume and pin cite were extracted."""
    meta = self.metadata
    antecedent = f"{meta.antecedent_guess}, " if meta.antecedent_guess else ""
    volume = f"{meta.volume} " if meta.volume else ""
    pin = f", {meta.pin_cite}" if meta.pin_cite else ""
    return "".join((antecedent, volume, "supra", pin))

Inherited members

class SupraToken (data: str, start: int, end: int, groups: dict = <factory>)

Word matching "supra" with or without punctuation.

Expand source code
@dataclass(eq=True, unsafe_hash=True)
class SupraToken(Token):
    """Word matching "supra" with or without punctuation.

    Adds no fields or methods of its own; everything comes from `Token`.
    """

Ancestors

  • Token
  • collections.UserString
  • collections.abc.Sequence
  • collections.abc.Reversible
  • collections.abc.Collection
  • collections.abc.Sized
  • collections.abc.Iterable
  • collections.abc.Container

Inherited members

class Token (data: str, start: int, end: int, groups: dict = <factory>)

Base class for special tokens. For performance, this isn't used for generic words.

Expand source code
@dataclass(eq=True, unsafe_hash=True)
class Token(UserString):
    """Base class for special tokens. For performance, this isn't used
    for generic words."""

    data: str
    start: int
    end: int
    # Excluded from dataclass comparison (compare=False).
    groups: dict = field(default_factory=dict, compare=False)

    @classmethod
    def from_match(cls, m, extra, offset=0) -> "Token":
        """Build a token from a regular expression match.

        This gets called by TokenExtractor. By default the token text is
        capture group 1 of the match, and its start/end are that group's
        span shifted by `offset`. The match's named groups become `groups`
        and `extra` is passed on as keyword arguments.
        """
        begin, finish = (position + offset for position in m.span(1))
        # ignore "too many arguments" type error -- this is called
        # by subclasses with additional attributes
        return cls(  # type: ignore[call-arg]
            m[1], begin, finish, groups=m.groupdict(), **extra
        )

    def merge(self, other: "Token") -> Optional["Token"]:
        """Return self when `other` has the same offsets, type and groups
        as self; return None when the two tokens cannot be merged."""
        identical = (
            self.start == other.start
            and self.end == other.end
            and type(self) is type(other)
            and self.groups == other.groups
        )
        return self if identical else None

Ancestors

  • collections.UserString
  • collections.abc.Sequence
  • collections.abc.Reversible
  • collections.abc.Collection
  • collections.abc.Sized
  • collections.abc.Iterable
  • collections.abc.Container

Subclasses

Class variables

var data : str
var end : int
var groups : dict
var start : int

Static methods

def from_match(m, extra, offset=0) ‑> Token

Return a token object based on a regular expression match. This gets called by TokenExtractor. By default, just use the entire matched string.

Expand source code
@classmethod
def from_match(cls, m, extra, offset=0) -> "Token":
    """Build a token from the regular expression match `m`.
    TokenExtractor calls this. The token text and span are taken from
    capture group 1 of the match, the span is shifted by `offset`, and
    `extra` is forwarded to the constructor as keyword arguments."""
    group_start, group_end = m.span(1)
    matched_text = m[1]
    # ignore "too many arguments" type error -- subclasses that declare
    # additional attributes receive them through **extra
    return cls(  # type: ignore[call-arg]
        matched_text,
        group_start + offset,
        group_end + offset,
        groups=m.groupdict(),
        **extra,
    )

Methods

def merge(self, other: Token) ‑> Optional[Token]

Merge two tokens by returning self if other is identical to self (same start, end, type and groups); otherwise return None.

Expand source code
def merge(self, other: "Token") -> Optional["Token"]:
    """Return self when `other` has the same start, end, exact type and
    groups as self; return None when any of those differ."""
    if self.start != other.start or self.end != other.end:
        return None
    if type(self) is not type(other):
        return None
    if self.groups != other.groups:
        return None
    return self
class TokenExtractor (regex: str, constructor: Callable[..., Token], extra: Dict = <factory>, flags: int = 0, strings: List = <factory>)

Class for extracting all matches from a given string for the given regex, and then for returning Token objects for all matches.

Expand source code
@dataclass
class TokenExtractor:
    """Pairs a regex with a token constructor: finds every match of the
    regex in a string and turns each match into a Token object."""

    regex: str
    # constructor should be Callable[[re.Match, dict, int], Token]
    # but this issue makes it inconvenient to specify the input types:
    # https://github.com/python/mypy/issues/5485
    constructor: Callable[..., Token]
    extra: Dict = field(default_factory=dict)
    flags: int = 0
    strings: List = field(default_factory=list)

    def get_matches(self, text):
        """Return an iterator of match objects for every match in text."""
        pattern = self.compiled_regex
        return pattern.finditer(text)

    def get_token(self, m, offset=0) -> Token:
        """Build a Token from match object `m` by calling the constructor
        with the match, this extractor's `extra` dict, and `offset`."""
        build = self.constructor
        return build(m, self.extra, offset)

    def __hash__(self):
        """Hash on the dataclass repr. Extractors need to be hashable so
        redundant extractors returned by the pyahocorasick filter can be
        removed."""
        return hash(repr(self))

    @property
    def compiled_regex(self):
        """Compiled form of `regex`, built on first access and reused
        afterwards."""
        try:
            return self._compiled_regex
        except AttributeError:
            # First access: compile once and keep the result on the
            # instance. Later changes to `regex` or `flags` are not
            # picked up.
            self._compiled_regex = re.compile(self.regex, flags=self.flags)
            return self._compiled_regex

Class variables

var constructor : Callable[..., Token]
var extra : Dict
var flags : int
var regex : str
var strings : List

Instance variables

var compiled_regex

Cache compiled regex as a property.

Expand source code
@property
def compiled_regex(self):
    """Compiled form of `regex`, built on first access and reused
    afterwards."""
    try:
        return self._compiled_regex
    except AttributeError:
        # First access: compile once and keep the result on the instance.
        # Later changes to `regex` or `flags` are not picked up.
        self._compiled_regex = re.compile(self.regex, flags=self.flags)
        return self._compiled_regex

Methods

def get_matches(self, text)

Return match objects for all matches in text.

Expand source code
def get_matches(self, text):
    """Return an iterator of match objects for every match in text."""
    pattern = self.compiled_regex
    return pattern.finditer(text)
def get_token(self, m, offset=0) ‑> Token

For a given match object, return a Token.

Expand source code
def get_token(self, m, offset=0) -> Token:
    """Build a Token from match object `m` by calling the constructor
    with the match, this extractor's `extra` dict, and `offset`."""
    build = self.constructor
    return build(m, self.extra, offset)
class UnknownCitation (token: Token, index: int, span_start: Optional[int] = None, span_end: Optional[int] = None, full_span_start: Optional[int] = None, full_span_end: Optional[int] = None, groups: dict = <factory>, metadata: Any = None)

Convenience class which represents an unknown citation. A recognized citation should theoretically be parsed as a CaseCitation, FullLawCitation, or a FullJournalCitation. If it's something else, this class serves as a naive catch-all.

Expand source code
@dataclass(eq=False, unsafe_hash=False, repr=False)
class UnknownCitation(CitationBase):
    """Naive catch-all for citations of an unknown kind. In theory every
    recognized citation should parse as a CaseCitation, a FullLawCitation,
    or a FullJournalCitation; anything else is represented by this
    convenience class.
    """

    def __hash__(self) -> int:
        """Hash on object identity: for safety, UnknownCitation objects are
        always treated as unique, so the hash is `id(self)` rather than
        anything derived from the citation's contents."""
        return id(self)

Ancestors

Inherited members