Package eyecite
Expand source code
from .annotate import annotate_citations
from .clean import clean_text
from .find import get_citations
from .resolve import resolve_citations
# Public API of the `eyecite` package: the names exported by
# `from eyecite import *`. Each one is imported from its sub-module above.
__all__ = [
"annotate_citations",
"get_citations",
"clean_text",
"resolve_citations",
]
# No need to create API documentation for these internal helper functions
# (each name mapped to False below is flagged for exclusion from the
# generated docs).
__pdoc__ = {
"annotate.SpanUpdater": False,
"helpers": False,
"regexes": False,
"test_factories": False,
"utils": False,
}
Sub-modules
eyecite.annotate
eyecite.clean
eyecite.find
eyecite.models
eyecite.resolve
eyecite.tokenizers
Functions
def annotate_citations(plain_text: str, annotations: Iterable[Tuple[Tuple[int, int], Any, Any]], source_text: Optional[str] = None, unbalanced_tags: str = 'unchecked', use_dmp: bool = True, annotator: Optional[Callable[[Any, str, Any], str]] = None) ‑> str
-
Given a list of citations and the text from which they were parsed, insert annotations into the text surrounding each citation. This could be useful for linking the citations to a URL, or otherwise indicating that they were successfully parsed or resolved.
If you pre-processed your text before extracting the citations, this function will intelligently reconcile the differences between the original source text and the cleaned text using a diffing algorithm, ensuring that each annotation is inserted in the correct location.
Example:
>>> plain_text = "foo 1 U.S. 1 bar"
>>> citations = get_citations(plain_text)
>>> annotate_citations("foo 1 U.S. 1 bar",
...                    [(citations[0].span(), "<a>", "</a>")])
>>>
>>> returns: "foo <a>1 U.S. 1</a> bar"
Args
plain_text
- The text containing the citations. If this text was cleaned, you should also pass the source_text below.
annotations
- A Tuple of (1) the start and end positions of the citation in the text, (2) the text to insert before the citation, and (3) the text to insert after the citation.
source_text
- If provided, apply annotations to this text instead using a diffing algorithm.
unbalanced_tags
- If provided, unbalanced_tags="skip" will skip inserting annotations that result in invalid HTML. unbalanced_tags="wrap" will ensure valid HTML by wrapping annotations around any unbalanced tags.
use_dmp
- If True (default), use the fast_diff_match_patch_python library for diffing. If False, use the slower built-in difflib, which may be useful for debugging.
annotator
- If provided, should be a function that takes three arguments (the text to insert before, the text of the citation, and the text to insert after) and returns the annotation. This is useful for customizing the annotation action: If you don't pass this function, eyecite will simply concatenate the before_text, citation_text, and after_text together for each annotation.
Returns
The annotated text.
Expand source code
def annotate_citations(
    plain_text: str,
    annotations: Iterable[Tuple[Tuple[int, int], Any, Any]],
    source_text: Optional[str] = None,
    unbalanced_tags: str = "unchecked",
    use_dmp: bool = True,
    annotator: Optional[Callable[[Any, str, Any], str]] = None,
) -> str:
    """Given a list of citations and the text from which they were parsed,
    insert annotations into the text surrounding each citation. This could be
    useful for linking the citations to a URL, or otherwise indicating that
    they were successfully parsed or resolved.

    If you pre-processed your text before extracting the citations, this
    function will intelligently reconcile the differences between the original
    source text and the cleaned text using a diffing algorithm, ensuring that
    each annotation is inserted in the correct location.

    Example:
    >>> plain_text = "foo 1 U.S. 1 bar"
    >>> citations = get_citations(plain_text)
    >>> annotate_citations("foo 1 U.S. 1 bar",
    ...                    [(citations[0].span(), "<a>", "</a>")])
    >>>
    >>> returns: "foo <a>1 U.S. 1</a> bar"

    Args:
        plain_text: The text containing the citations. If this text was
            cleaned, you should also pass the `source_text` below.
        annotations: A `Tuple` of (1) the start and end positions of the
            citation in the text, (2) the text to insert before the citation,
            and (3) the text to insert after the citation.
        source_text: If provided, apply annotations to this text instead using
            a diffing algorithm.
        unbalanced_tags: If provided, unbalanced_tags="skip" will skip
            inserting annotations that result in invalid HTML.
            unbalanced_tags="wrap" will ensure valid HTML by wrapping
            annotations around any unbalanced tags.
        use_dmp: If `True` (default), use the fast_diff_match_patch_python
            library for diffing. If `False`, use the slower built-in difflib,
            which may be useful for debugging.
        annotator: If provided, should be a function that takes three
            arguments (the text to insert before, the text of the citation,
            and the text to insert after) and returns the annotation. This is
            useful for customizing the annotation action: If you don't pass
            this function, eyecite will simply concatenate the before_text,
            citation_text, and after_text together for each annotation.

    Returns:
        The annotated text.

    Raises:
        ValueError: If `unbalanced_tags` is not "unchecked", "skip" or "wrap".
            The option is only checked while an annotation is being
            processed, so nothing is raised when no annotation is applied.
    """
    # set up offset_updater if we have to move annotations to source_text
    offset_updater = None
    if source_text and source_text != plain_text:
        offset_updater = SpanUpdater(plain_text, source_text, use_dmp=use_dmp)
        # from here on, all slicing is done against the source text
        plain_text = source_text

    # append text for each annotation to out
    annotations = sorted(annotations)
    out = []
    last_end = 0
    for (start, end), before, after in annotations:
        # if we're applying to source_text, update offsets
        if offset_updater:
            start = offset_updater.update(start, bisect_right)
            end = offset_updater.update(end, bisect_left)

        # handle overlaps
        if start < last_end:
            # include partial annotation if possible
            start = last_end
            if start >= end:
                # if annotation is entirely covered, skip
                continue

        span_text = plain_text[start:end]

        # handle HTML tags
        if unbalanced_tags == "unchecked":
            pass
        elif unbalanced_tags in ("skip", "wrap"):
            if not is_balanced_html(span_text):
                if unbalanced_tags == "skip":
                    continue
                # NOTE(review): `after` is passed ahead of `before` exactly as
                # in the original call -- confirm against wrap_html_tags'
                # parameter order before "fixing" it.
                span_text = wrap_html_tags(span_text, after, before)
        else:
            raise ValueError(f"Unknown option '{unbalanced_tags}'")

        if annotator is not None:
            annotated_span = annotator(before, span_text, after)
        else:
            annotated_span = before + span_text + after

        # append each span, preceded by the untouched text since the last one
        out.extend(
            [
                plain_text[last_end:start],
                annotated_span,
            ]
        )
        last_end = end

    # append text after final citation
    if last_end < len(plain_text):
        out.append(plain_text[last_end:])

    return "".join(out)
def clean_text(text, steps: Iterable[Union[str, Callable[[str], str]]]) ‑> str
-
Given a list of "cleaning" functions, apply each in sequence to a given text string and return the result. Steps may be the names of functions in eyecite.clean, or other custom callables. You may wish to use this tool to pre-process your text before feeding it into get_citations(), especially if the text was OCR'd from a PDF.
Args
text
- The text to clean.
steps
- Any Iterable (e.g., a list) of cleaning functions to apply.
Returns
The cleaned text.
Expand source code
def clean_text(text, steps: Iterable[Union[str, Callable[[str], str]]]) -> str:
    """Given a list of "cleaning" functions, apply each in sequence to a
    given text string and return the result. Steps may be the names of
    functions in `eyecite.clean`, or other custom callables. You may wish to
    use this tool to pre-process your text before feeding it into
    `eyecite.find.get_citations`, especially if the text was OCR'd from a PDF.

    Args:
        text: The text to clean.
        steps: Any `Iterable` (e.g., a list) of cleaning functions to apply.

    Returns:
        The cleaned text.

    Raises:
        ValueError: If a step is neither a callable nor a name found in
            `cleaners_lookup`.
    """
    for step in steps:
        # Only strings are looked up by name. Testing membership with an
        # arbitrary object would hash it, and an unhashable callable would
        # then fail with TypeError instead of being applied.
        if isinstance(step, str) and step in cleaners_lookup:
            step_func = cleaners_lookup[step]  # type: ignore
        elif callable(step):
            step_func = step
        else:
            raise ValueError(
                "clean_text steps must be callable "
                f"or one of {list(cleaners_lookup.keys())}"
            )
        text = step_func(text)

    return text  # type: ignore
def get_citations(plain_text: str, remove_ambiguous: bool = False, tokenizer: Tokenizer = AhocorasickTokenizer()) ‑> List[CitationBase]
-
This is eyecite's main workhorse function. Given a string of text (e.g., a judicial opinion or other legal document), return a list of CitationBase objects representing the citations found in the document.
Args
plain_text
- The text to parse. You may wish to use the clean_text() function to pre-process your text before passing it here.
remove_ambiguous
- Whether to remove citations that might refer to more than one reporter and can't be narrowed down by date.
tokenizer
- An instance of a Tokenizer object. See eyecite.tokenizers for information about available tokenizers. Uses the AhocorasickTokenizer by default.
Returns
A list of CitationBase objects
Expand source code
def get_citations(
    plain_text: str,
    remove_ambiguous: bool = False,
    tokenizer: Tokenizer = default_tokenizer,
) -> List[CitationBase]:
    """This is eyecite's main workhorse function. Given a string of text
    (e.g., a judicial opinion or other legal document), return a list of
    `eyecite.models.CitationBase` objects representing the citations found
    in the document.

    Args:
        plain_text: The text to parse. You may wish to use the
            `eyecite.clean.clean_text` function to pre-process your text
            before passing it here.
        remove_ambiguous: Whether to remove citations that might refer to more
            than one reporter and can't be narrowed down by date.
        tokenizer: An instance of a Tokenizer object. See `eyecite.tokenizers`
            for information about available tokenizers. Uses the
            `eyecite.tokenizers.AhocorasickTokenizer` by default.

    Returns:
        A list of `eyecite.models.CitationBase` objects
    """
    # Special case: the literal input "eyecite" short-circuits to the
    # predefined `joke_cite` value without any tokenizing.
    if plain_text == "eyecite":
        return joke_cite

    words, citation_tokens = tokenizer.tokenize(plain_text)
    found = []

    for index, token in citation_tokens:
        extracted: CitationBase
        kind = type(token)

        if kind is CitationToken:
            # A reporter, law journal, or law token. It is extracted either
            # as a short form citation or as a standard, full citation,
            # depending on the token's `short` flag.
            if cast(CitationToken, token).short:
                extracted = _extract_shortform_citation(words, index)
            else:
                extracted = _extract_full_citation(words, index)
        elif kind is IdToken:
            # An "Id." or "Ibid." reference. It should simply point to the
            # item cited immediately prior, but for safety that resolution is
            # left up to the user.
            extracted = _extract_id_citation(words, index)
        elif kind is SupraToken:
            # A "supra" reference. Its antecedent could be any of the
            # previous citations, so, like an Id. citation, it is not
            # resolved here.
            extracted = _extract_supra_citation(words, index)
        elif kind is SectionToken:
            # A section marker. It is likely a reference to a citation that
            # matched none of the above, so the marker is recorded in order
            # to keep an accurate list of the possible antecedents for id
            # citations.
            extracted = UnknownCitation(cast(SectionToken, token), index)
        else:
            # Anything else is not a citation.
            continue

        found.append(extracted)

    # Drop citations with multiple reporter candidates where the correct
    # reporter could not be guessed.
    if remove_ambiguous:
        found = disambiguate_reporters(found)

    # The list is ordered in the sequence that the citations appear in the
    # document. That ordering is important for reconstructing the references
    # of the ShortCaseCitation, SupraCitation, and IdCitation objects.
    return found
def resolve_citations(citations: List[CitationBase], resolve_full_citation: Callable[[FullCitation], Hashable] = <function resolve_full_citation>, resolve_shortcase_citation: Callable[[ShortCaseCitation, List[Tuple[FullCitation, Hashable]]], Optional[Hashable]] = <function _resolve_shortcase_citation>, resolve_supra_citation: Callable[[SupraCitation, List[Tuple[FullCitation, Hashable]]], Optional[Hashable]] = <function _resolve_supra_citation>, resolve_id_citation: Callable[[IdCitation, Hashable, Dict[Hashable, List[CitationBase]]], Optional[Hashable]] = <function _resolve_id_citation>) ‑> Dict[Hashable, List[CitationBase]]
-
Resolve a list of citations to their associated resources by matching each type of Citation object (FullCaseCitation, ShortCaseCitation, SupraCitation, and IdCitation) to a "resource" object. A "resource" could be a document, a URL, a database entry, etc. – anything that conforms to the (non-prescriptive) requirements of the eyecite.models.ResourceType type. By default, eyecite uses an extremely thin "resource" object that simply serves as a conceptual way to group citations with the same references together.
This function assumes that the given list of citations is ordered in the order that they were extracted from the text (i.e., assumes that supra citations and id citations can only refer to previous references).
It returns a dict in the following format:
keys = resources
values = lists of citations
The individual resolution steps can be supplanted with more complex logic by passing custom functions (e.g., if you have a thicker resource abstraction that you want to use); the default approach is to use simple heuristics to narrow down the set of possible resolutions. If a citation cannot be definitively resolved to a resource, it is dropped and not resolved.
Args
citations
- A list of CitationBase objects, returned from calling get_citations().
resolve_full_citation
- A function that resolves FullCitation objects to resources.
resolve_shortcase_citation
- A function that resolves ShortCaseCitation objects to resources.
resolve_supra_citation
- A function that resolves SupraCitation objects to resources.
resolve_id_citation
- A function that resolves IdCitation objects to resources.
Returns
A dictionary mapping eyecite.models.ResourceType objects (the keys) to lists of CitationBase objects (the values).
Expand source code
def resolve_citations(
    citations: List[CitationBase],
    resolve_full_citation: Callable[
        [FullCitation], ResourceType
    ] = resolve_full_citation,
    resolve_shortcase_citation: Callable[
        [ShortCaseCitation, ResolvedFullCites],
        Optional[ResourceType],
    ] = _resolve_shortcase_citation,
    resolve_supra_citation: Callable[
        [SupraCitation, ResolvedFullCites],
        Optional[ResourceType],
    ] = _resolve_supra_citation,
    resolve_id_citation: Callable[
        [IdCitation, ResourceType, Resolutions], Optional[ResourceType]
    ] = _resolve_id_citation,
) -> Resolutions:
    """Resolve a list of citations to their associated resources by matching
    each type of Citation object (FullCaseCitation, ShortCaseCitation,
    SupraCitation, and IdCitation) to a "resource" object. A "resource" could
    be a document, a URL, a database entry, etc. -- anything that conforms to
    the (non-prescriptive) requirements of the `eyecite.models.ResourceType`
    type. By default, eyecite uses an extremely thin "resource" object that
    simply serves as a conceptual way to group citations with the same
    references together.

    This function assumes that the given list of citations is ordered in the
    order that they were extracted from the text (i.e., assumes that supra
    citations and id citations can only refer to previous references).

    It returns a dict in the following format:
    ```
    keys = resources
    values = lists of citations
    ```

    The individual resolution steps can be supplanted with more complex logic
    by passing custom functions (e.g., if you have a thicker resource
    abstraction that you want to use); the default approach is to use simple
    heuristics to narrow down the set of possible resolutions. If a citation
    cannot be definitively resolved to a resource, it is dropped and not
    resolved.

    Args:
        citations: A list of `eyecite.models.CitationBase` objects, returned
            from calling `eyecite.find.get_citations`.
        resolve_full_citation: A function that resolves
            `eyecite.models.FullCitation` objects to resources.
        resolve_shortcase_citation: A function that resolves
            `eyecite.models.ShortCaseCitation` objects to resources.
        resolve_supra_citation: A function that resolves
            `eyecite.models.SupraCitation` objects to resources.
        resolve_id_citation: A function that resolves
            `eyecite.models.IdCitation` objects to resources.

    Returns:
        A dictionary mapping `eyecite.models.ResourceType` objects (the keys)
        to lists of `eyecite.models.CitationBase` objects (the values).
    """
    # Resource -> every citation that resolved to it
    grouped: Resolutions = defaultdict(list)

    # List of (full citation, resolved resource) pairs seen so far; handed to
    # the short case and supra resolvers as their candidate antecedents
    full_cite_pairs: ResolvedFullCites = []

    # Outcome of the previous iteration (None when that citation did not
    # resolve); handed to the id resolver
    previous: Optional[ResourceType] = None

    for cite in citations:
        if isinstance(cite, FullCitation):
            # Full citations are resolved directly and remembered
            resource = resolve_full_citation(cite)
            full_cite_pairs.append((cite, resource))
        elif isinstance(cite, ShortCaseCitation):
            resource = resolve_shortcase_citation(cite, full_cite_pairs)
        elif isinstance(cite, SupraCitation):
            resource = resolve_supra_citation(cite, full_cite_pairs)
        elif isinstance(cite, IdCitation):
            resource = resolve_id_citation(cite, previous, grouped)
        else:
            # Citation to an unknown document: ignored for now
            resource = None

        # File the citation under its resource only when it resolved
        if resource:
            grouped[resource].append(cite)

        # Remember this outcome for the next citation, even when it is None
        previous = resource

    return grouped