Package eyecite

Expand source code
from .annotate import annotate_citations
from .clean import clean_text
from .find import get_citations
from .resolve import resolve_citations

# Names exported at the package root; each one is imported above from its
# own sub-module (annotate, clean, find, resolve).
__all__ = [
    "annotate_citations",
    "get_citations",
    "clean_text",
    "resolve_citations",
]

# No need to create API documentation for these internal helper functions.
# Keys are names relative to this package; mapping a name to False is the
# pdoc convention for leaving it out of the generated documentation.
__pdoc__ = {
    "annotate.SpanUpdater": False,
    "helpers": False,
    "regexes": False,
    "test_factories": False,
    "utils": False,
}

Sub-modules

eyecite.annotate
eyecite.clean
eyecite.find
eyecite.models
eyecite.resolve
eyecite.tokenizers

Functions

def annotate_citations(plain_text: str, annotations: Iterable[Tuple[Tuple[int, int], Any, Any]], source_text: Optional[str] = None, unbalanced_tags: str = 'unchecked', use_dmp: bool = True, annotator: Optional[Callable[[Any, str, Any], str]] = None) ‑> str

Given a list of citations and the text from which they were parsed, insert annotations into the text surrounding each citation. This could be useful for linking the citations to a URL, or otherwise indicating that they were successfully parsed or resolved.

If you pre-processed your text before extracting the citations, this function will intelligently reconcile the differences between the original source text and the cleaned text using a diffing algorithm, ensuring that each annotation is inserted in the correct location.

Example:

>>> plain_text = "foo 1 U.S. 1 bar"
>>> citations = get_citations(plain_text)
>>> annotate_citations("foo 1 U.S. 1 bar",
...     [(citations[0].span(), "<a>", "</a>")])
>>>
>>> returns: "foo <a>1 U.S. 1</a> bar"

Args

plain_text
The text containing the citations. If this text was cleaned, you should also pass the source_text below.
annotations
An iterable of tuples, each consisting of (1) the start and end positions of the citation in the text, (2) the text to insert before the citation, and (3) the text to insert after the citation.
source_text
If provided, apply annotations to this text instead using a diffing algorithm.
unbalanced_tags
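Controls how annotations whose span contains unbalanced HTML tags are handled. The default, unbalanced_tags="unchecked", inserts every annotation without checking. unbalanced_tags="skip" will skip inserting annotations that result in invalid HTML. unbalanced_tags="wrap" will ensure valid HTML by wrapping annotations around any unbalanced tags. Any other value raises a ValueError.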
use_dmp
If True (default), use the fast_diff_match_patch_python library for diffing. If False, use the slower built-in difflib, which may be useful for debugging.
annotator
If provided, should be a function that takes three arguments (the text to insert before, the text of the citation, and the text to insert after) and returns the annotation. This is useful for customizing the annotation action: If you don't pass this function, eyecite will simply concatenate the before_text, citation_text, and after_text together for each annotation.

Returns

The annotated text.

Expand source code
def annotate_citations(
    plain_text: str,
    annotations: Iterable[Tuple[Tuple[int, int], Any, Any]],
    source_text: Optional[str] = None,
    unbalanced_tags: str = "unchecked",
    use_dmp: bool = True,
    annotator: Optional[Callable[[Any, str, Any], str]] = None,
) -> str:
    """Given a list of citations and the text from which they were parsed,
    insert annotations into the text surrounding each citation. This could be
    useful for linking the citations to a URL, or otherwise indicating that
    they were successfully parsed or resolved.

    If you pre-processed your text before extracting the citations, pass the
    original text as `source_text`: the annotation offsets are then mapped
    from `plain_text` onto `source_text` with a diffing algorithm, and the
    annotations are inserted into `source_text` instead.

    Annotations are processed in sorted order. When an annotation overlaps
    the previous one, only its non-overlapping tail is annotated; if nothing
    is left, the annotation is dropped.

    Example:
    >>> plain_text = "foo 1 U.S. 1 bar"
    >>> citations = get_citations(plain_text)
    >>> annotate_citations(plain_text,
    ...     [(citations[0].span(), "<a>", "</a>")])
    'foo <a>1 U.S. 1</a> bar'

    Args:
        plain_text: The text containing the citations. If this text was
            cleaned, you should also pass the `source_text` below.
        annotations: An iterable of tuples, each consisting of (1) the start
            and end positions of the citation in the text, (2) the text to
            insert before the citation, and (3) the text to insert after the
            citation.
        source_text: If provided (and different from `plain_text`), apply
            annotations to this text instead, using a diffing algorithm to
            translate the offsets.
        unbalanced_tags: How to treat a span whose HTML tags are unbalanced.
            "unchecked" (default) inserts the annotation without checking,
            "skip" drops annotations that would result in invalid HTML, and
            "wrap" keeps the HTML valid by wrapping annotations around any
            unbalanced tags.
        use_dmp: If `True` (default), use the fast_diff_match_patch_python
            library for diffing. If `False`, use the slower built-in difflib,
            which may be useful for debugging.
        annotator: If provided, should be a function that takes three
            arguments (the text to insert before, the text of the citation,
            and the text to insert after) and returns the annotation. If
            omitted, the three pieces are simply concatenated.

    Returns:
        The annotated text.

    Raises:
        ValueError: If `unbalanced_tags` is not one of "unchecked", "skip" or
            "wrap". The option is checked while an annotation is being
            processed, so nothing is raised when no annotation reaches that
            step.
    """
    # set up offset_updater if we have to move annotations to source_text
    offset_updater = None
    if source_text and source_text != plain_text:
        offset_updater = SpanUpdater(plain_text, source_text, use_dmp=use_dmp)
        plain_text = source_text

    # collect the pieces of the output text; joined once at the end
    out = []
    last_end = 0
    for (start, end), before, after in sorted(annotations):
        # if we're applying to source_text, update offsets
        if offset_updater:
            start = offset_updater.update(start, bisect_right)
            end = offset_updater.update(end, bisect_left)

        # handle overlaps
        if start < last_end:
            # include partial annotation if possible
            start = last_end
            if start >= end:
                # if annotation is entirely covered, skip
                continue

        span_text = plain_text[start:end]

        # handle HTML tags
        if unbalanced_tags == "unchecked":
            pass
        elif unbalanced_tags in ("skip", "wrap"):
            if not is_balanced_html(span_text):
                if unbalanced_tags == "skip":
                    continue
                # NOTE(review): `after` is passed ahead of `before` here,
                # presumably so the annotation is closed in front of each tag
                # and reopened behind it -- confirm against wrap_html_tags.
                span_text = wrap_html_tags(span_text, after, before)
        else:
            # Fixed: the message used to lack the closing quote after the
            # option value.
            raise ValueError(f"Unknown option '{unbalanced_tags}'")

        if annotator is not None:
            annotated_span = annotator(before, span_text, after)
        else:
            annotated_span = before + span_text + after

        # append the untouched text since the previous span, then the span
        out.append(plain_text[last_end:start])
        out.append(annotated_span)
        last_end = end

    # append text after final citation
    if last_end < len(plain_text):
        out.append(plain_text[last_end:])

    return "".join(out)
def clean_text(text, steps: Iterable[Union[str, Callable[[str], str]]]) ‑> str

Given a list of "cleaning" functions, apply each in sequence to a given text string and return the result. Steps may be the names of functions in eyecite.clean, or other custom callables. You may wish to use this tool to pre-process your text before feeding it into get_citations(), especially if the text was OCR'd from a PDF.

Args

text
The text to clean.
steps
Any Iterable (e.g., a list) of cleaning functions to apply.

Returns

The cleaned text.

Expand source code
def clean_text(text, steps: Iterable[Union[str, Callable[[str], str]]]) -> str:
    """Run a text string through a sequence of "cleaning" steps and return
    the result.

    Each step is applied to the output of the previous one. A step may be
    the name of a cleaner registered in `cleaners_lookup` (the functions of
    `eyecite.clean`) or any custom callable taking and returning a string.
    This is handy for pre-processing text before passing it to
    `eyecite.find.get_citations`, especially text that was OCR'd from a PDF.

    Args:
        text: The text to clean.
        steps: Any `Iterable` (e.g., a list) of cleaning functions to apply.

    Returns:
        The cleaned text.

    Raises:
        ValueError: If a step is neither a registered cleaner name nor a
            callable.
    """
    cleaned = text
    for step in steps:
        # Registered names take precedence over the callable check.
        if step in cleaners_lookup:
            cleaned = cleaners_lookup[step](cleaned)  # type: ignore
            continue

        if not callable(step):
            raise ValueError(
                "clean_text steps must be callable "
                f"or one of {list(cleaners_lookup.keys())}"
            )

        cleaned = step(cleaned)

    return cleaned  # type: ignore
def get_citations(plain_text: str, remove_ambiguous: bool = False, tokenizer: Tokenizer = AhocorasickTokenizer()) ‑> List[CitationBase]

This is eyecite's main workhorse function. Given a string of text (e.g., a judicial opinion or other legal document), return a list of CitationBase objects representing the citations found in the document.

Args

plain_text
The text to parse. You may wish to use the clean_text() function to pre-process your text before passing it here.
remove_ambiguous
Whether to remove citations that might refer to more than one reporter and can't be narrowed down by date.
tokenizer
An instance of a Tokenizer object. See eyecite.tokenizers for information about available tokenizers. Uses the AhocorasickTokenizer by default.

Returns

A list of CitationBase objects

Expand source code
def get_citations(
    plain_text: str,
    remove_ambiguous: bool = False,
    tokenizer: Tokenizer = default_tokenizer,
) -> List[CitationBase]:
    """Extract the citations contained in a string of text (e.g., a judicial
    opinion or other legal document). This is eyecite's main workhorse
    function.

    Args:
        plain_text: The text to parse. You may wish to use the
            `eyecite.clean.clean_text` function to pre-process your text
            before passing it here.
        remove_ambiguous: Whether to remove citations that might refer to more
            than one reporter and can't be narrowed down by date.
        tokenizer: An instance of a Tokenizer object. See `eyecite.tokenizers`
            for information about available tokenizers. Uses the
            `eyecite.tokenizers.AhocorasickTokenizer` by default.

    Returns:
        A list of `eyecite.models.CitationBase` objects, in the order in
        which they appear in the document.
    """
    # The literal input "eyecite" short-circuits to a canned result.
    if plain_text == "eyecite":
        return joke_cite

    words, citation_tokens = tokenizer.tokenize(plain_text)
    found = []

    for index, token in citation_tokens:
        citation: CitationBase
        kind = type(token)

        if kind is CitationToken:
            # A reporter, law journal, or law token. The token's `short`
            # flag decides whether it is extracted as a short form citation
            # or as a standard, full citation.
            extract = (
                _extract_shortform_citation
                if cast(CitationToken, token).short
                else _extract_full_citation
            )
            citation = extract(words, index)

        elif kind is IdToken:
            # An "Id." or "Ibid." reference. It should point at the item
            # cited immediately before, but that resolution is left to the
            # caller.
            citation = _extract_id_citation(words, index)

        elif kind is SupraToken:
            # A "supra" reference. Its antecedent could be any earlier
            # citation, so, as with Id., it is not resolved here.
            citation = _extract_supra_citation(words, index)

        elif kind is SectionToken:
            # A section marker that matched none of the above. It is kept as
            # an unknown citation so the list of possible antecedents for id
            # citations stays accurate.
            citation = UnknownCitation(cast(SectionToken, token), index)

        else:
            # Any other token type is not a citation.
            continue

        found.append(citation)

    # Optionally drop citations with several reporter candidates where the
    # correct reporter could not be guessed.
    if remove_ambiguous:
        found = disambiguate_reporters(found)

    # Document order matters: it is needed later to reconstruct the
    # references of ShortCaseCitation, SupraCitation, and IdCitation objects.
    return found
def resolve_citations(citations: List[CitationBase], resolve_full_citation: Callable[[FullCitation], Hashable] = <function resolve_full_citation>, resolve_shortcase_citation: Callable[[ShortCaseCitation, List[Tuple[FullCitation, Hashable]]], Optional[Hashable]] = <function _resolve_shortcase_citation>, resolve_supra_citation: Callable[[SupraCitation, List[Tuple[FullCitation, Hashable]]], Optional[Hashable]] = <function _resolve_supra_citation>, resolve_id_citation: Callable[[IdCitation, Hashable, Dict[Hashable, List[CitationBase]]], Optional[Hashable]] = <function _resolve_id_citation>) ‑> Dict[Hashable, List[CitationBase]]

Resolve a list of citations to their associated resources by matching each type of Citation object (FullCaseCitation, ShortCaseCitation, SupraCitation, and IdCitation) to a "resource" object. A "resource" could be a document, a URL, a database entry, etc. – anything that conforms to the (non-prescriptive) requirements of the eyecite.models.ResourceType type. By default, eyecite uses an extremely thin "resource" object that simply serves as a conceptual way to group citations with the same references together.

This function assumes that the given list of citations is ordered in the order that they were extracted from the text (i.e., assumes that supra citations and id citations can only refer to previous references).

It returns a dict in the following format:

    keys = resources
    values = lists of citations

The individual resolution steps can be supplanted with more complex logic by passing custom functions (e.g., if you have a thicker resource abstraction that you want to use); the default approach is to use simple heuristics to narrow down the set of possible resolutions. If a citation cannot be definitively resolved to a resource, it is dropped and not resolved.

Args

citations
A list of CitationBase objects, returned from calling get_citations().
resolve_full_citation
A function that resolves FullCitation objects to resources.
resolve_shortcase_citation
A function that resolves ShortCaseCitation objects to resources.
resolve_supra_citation
A function that resolves SupraCitation objects to resources.
resolve_id_citation
A function that resolves IdCitation objects to resources.

Returns

A dictionary mapping eyecite.models.ResourceType objects (the keys) to lists of CitationBase objects (the values).

Expand source code
def resolve_citations(
    citations: List[CitationBase],
    resolve_full_citation: Callable[
        [FullCitation], ResourceType
    ] = resolve_full_citation,
    resolve_shortcase_citation: Callable[
        [ShortCaseCitation, ResolvedFullCites],
        Optional[ResourceType],
    ] = _resolve_shortcase_citation,
    resolve_supra_citation: Callable[
        [SupraCitation, ResolvedFullCites],
        Optional[ResourceType],
    ] = _resolve_supra_citation,
    resolve_id_citation: Callable[
        [IdCitation, ResourceType, Resolutions], Optional[ResourceType]
    ] = _resolve_id_citation,
) -> Resolutions:
    """Group a list of citations by the "resource" each one refers to.

    Every kind of Citation object (FullCaseCitation, ShortCaseCitation,
    SupraCitation, and IdCitation) is matched to a "resource" object. A
    "resource" could be a document, a URL, a database entry, etc. -- anything
    that conforms to the (non-prescriptive) requirements of the
    `eyecite.models.ResourceType` type. By default, eyecite uses an extremely
    thin "resource" object that simply serves as a conceptual way to group
    citations with the same references together.

    The citations must be given in the order in which they were extracted
    from the text (i.e., supra citations and id citations are assumed to
    refer only to previous references).

    The result is a dict in the following format:
    ```
        keys = resources
        values = lists of citations
    ```

    Each resolution step can be replaced with more complex logic by passing
    a custom function (e.g., if you have a thicker resource abstraction that
    you want to use); the defaults use simple heuristics to narrow down the
    set of possible resolutions. A citation that cannot be definitively
    resolved to a resource is dropped from the result.

    Args:
        citations: A list of `eyecite.models.CitationBase` objects, returned
            from calling `eyecite.find.get_citations`.
        resolve_full_citation: A function that resolves
            `eyecite.models.FullCitation` objects to resources.
        resolve_shortcase_citation: A function that resolves
            `eyecite.models.ShortCaseCitation` objects to resources.
        resolve_supra_citation: A function that resolves
            `eyecite.models.SupraCitation` objects to resources.
        resolve_id_citation: A function that resolves
            `eyecite.models.IdCitation` objects to resources.

    Returns:
        A dictionary mapping `eyecite.models.ResourceType` objects (the keys)
            to lists of `eyecite.models.CitationBase` objects (the values).
    """
    # Resource -> every citation resolved to it
    grouped: Resolutions = defaultdict(list)

    # (full citation, resource) pairs seen so far, in document order
    full_cite_pairs: ResolvedFullCites = []

    # Outcome of the immediately preceding citation (None if it failed)
    previous: Optional[ResourceType] = None

    for cite in citations:
        # Unknown citation kinds fall through with no resource.
        resource: Optional[ResourceType] = None

        if isinstance(cite, FullCitation):
            resource = resolve_full_citation(cite)
            # Remember it so later short-form and supra cites can match it.
            full_cite_pairs.append((cite, resource))
        elif isinstance(cite, ShortCaseCitation):
            resource = resolve_shortcase_citation(cite, full_cite_pairs)
        elif isinstance(cite, SupraCitation):
            resource = resolve_supra_citation(cite, full_cite_pairs)
        elif isinstance(cite, IdCitation):
            # Id. cites depend on whatever the preceding citation resolved to.
            resource = resolve_id_citation(cite, previous, grouped)

        # Only resolved citations are recorded.
        if resource:
            grouped[resource].append(cite)

        # Updated for every citation, including unresolved ones.
        previous = resource

    return grouped