Module eyecite.find
Expand source code
from typing import List, Type, cast
from eyecite.helpers import (
disambiguate_reporters,
extract_pin_cite,
joke_cite,
match_on_tokens,
)
from eyecite.models import (
CitationBase,
CitationToken,
FullCaseCitation,
FullCitation,
FullJournalCitation,
FullLawCitation,
IdCitation,
IdToken,
ResourceCitation,
SectionToken,
ShortCaseCitation,
SupraCitation,
SupraToken,
Tokens,
UnknownCitation,
)
from eyecite.regexes import SHORT_CITE_ANTECEDENT_REGEX, SUPRA_ANTECEDENT_REGEX
from eyecite.tokenizers import Tokenizer, default_tokenizer
def get_citations(
    plain_text: str,
    remove_ambiguous: bool = False,
    tokenizer: Tokenizer = default_tokenizer,
) -> List[CitationBase]:
    """Find the citations in a piece of text.

    This is eyecite's main entry point. The text is tokenized and every
    citation-bearing token is converted into an
    `eyecite.models.CitationBase` object.

    Args:
        plain_text: The text to parse (e.g., a judicial opinion or other
            legal document). You may wish to pre-process it with
            `eyecite.clean.clean_text` before passing it here.
        remove_ambiguous: Whether to remove citations that might refer to
            more than one reporter and can't be narrowed down by date.
        tokenizer: An instance of a Tokenizer object. See
            `eyecite.tokenizers` for information about available
            tokenizers. Uses the `eyecite.tokenizers.AhocorasickTokenizer`
            by default.

    Returns:
        A list of `eyecite.models.CitationBase` objects, in the order in
        which the tokenizer yielded their tokens.
    """
    # Easter egg: the literal input "eyecite" short-circuits everything.
    if plain_text == "eyecite":
        return joke_cite

    words, citation_tokens = tokenizer.tokenize(plain_text)

    found: List[CitationBase] = []
    for position, token in citation_tokens:
        # Exact type comparison (not isinstance) is deliberate here: only
        # the four token classes below are turned into citations.
        kind = type(token)

        if kind is CitationToken:
            # A reporter, law journal, or law. The token itself says
            # whether it was matched as a short form or a full citation.
            extract = (
                _extract_shortform_citation
                if cast(CitationToken, token).short
                else _extract_full_citation
            )
            found.append(extract(words, position))
        elif kind is IdToken:
            # "Id." / "Ibid.": refers to the item cited immediately prior,
            # but resolving that reference is left to the caller.
            found.append(_extract_id_citation(words, position))
        elif kind is SupraToken:
            # "supra": the antecedent could be any earlier citation, so it
            # is likewise left unresolved here.
            found.append(_extract_supra_citation(words, position))
        elif kind is SectionToken:
            # A section marker that matched nothing above. It is recorded
            # anyway so the list of possible antecedents for id citations
            # stays accurate.
            found.append(
                UnknownCitation(cast(SectionToken, token), position)
            )
        # Any other token type is not a citation and is skipped.

    # Optionally drop citations with several reporter candidates where the
    # correct reporter could not be guessed. Either way the result keeps
    # document order, which matters when the references of
    # ShortCaseCitation, SupraCitation, and IdCitation objects are later
    # reconstructed.
    return disambiguate_reporters(found) if remove_ambiguous else found
def _extract_full_citation(
    words: Tokens,
    index: int,
) -> FullCitation:
    """Build a full citation object from the citation token at `index`.

    Args:
        words: The token list produced by the tokenizer.
        index: Position in `words` of the citation token.

    Returns:
        A FullCaseCitation, FullLawCitation, or FullJournalCitation,
        depending on which sources the token's editions come from.

    Raises:
        ValueError: If none of the matched editions come from a
            "reporters", "laws", or "journals" source.
    """
    token = cast(CitationToken, words[index])

    # The cite was matched by one or more regexes, which may belong to
    # different sources (reporters, laws, journals). Collect every source
    # that matched, preferring exact matches to variations.
    editions = token.exact_editions or token.variation_editions
    cite_sources = {edition.reporter.source for edition in editions}

    # Pick the citation class. Order encodes precedence: reporters win over
    # laws, which win over journals.
    citation_class: Type[ResourceCitation]
    precedence = (
        ("reporters", FullCaseCitation),
        ("laws", FullLawCitation),
        ("journals", FullJournalCitation),
    )
    for source_name, candidate_class in precedence:
        if source_name in cite_sources:
            citation_class = candidate_class
            break
    else:
        raise ValueError(f"Unknown cite_sources value {cite_sources}")

    citation = citation_class(
        token,
        index,
        exact_editions=token.exact_editions,
        variation_editions=token.variation_editions,
    )
    # Let the citation populate its own metadata from the surrounding
    # tokens.
    citation.add_metadata(words)
    return citation
def _extract_shortform_citation(
    words: Tokens,
    index: int,
) -> ShortCaseCitation:
    """Build a ShortCaseCitation from the citation token at `index`.

    Examples of the short forms handled:
        Shortform 1: Adarand, 515 U.S., at 241
        Shortform 2: 515 U.S., at 241

    Args:
        words: The token list produced by the tokenizer.
        index: Position in `words` of the short-form citation token.

    Returns:
        A ShortCaseCitation whose metadata carries the guessed antecedent
        (or None), the pin cite, and the parenthetical.
    """
    # Look backwards from the token just before the citation for an
    # antecedent word (e.g. "Adarand,").
    antecedent_match = match_on_tokens(
        words,
        index - 1,
        SHORT_CITE_ANTECEDENT_REGEX,
        strings_only=True,
        forward=False,
    )
    antecedent_guess = (
        antecedent_match["antecedent"].strip() if antecedent_match else None
    )

    # The token's captured "page" group is handed to extract_pin_cite as
    # the prefix of the pin cite.
    cite_token = cast(CitationToken, words[index])
    pin_cite, span_end, parenthetical = extract_pin_cite(
        words, index, prefix=cite_token.groups["page"]
    )

    metadata = {
        "antecedent_guess": antecedent_guess,
        "pin_cite": pin_cite,
        "parenthetical": parenthetical,
    }
    citation = ShortCaseCitation(
        cite_token,
        index,
        exact_editions=cite_token.exact_editions,
        variation_editions=cite_token.variation_editions,
        span_end=span_end,
        metadata=metadata,
    )

    # Fill in the remaining metadata the citation can work out for itself.
    citation.guess_edition()
    citation.guess_court()
    return citation
def _extract_supra_citation(
    words: Tokens,
    index: int,
) -> SupraCitation:
    """Build a SupraCitation from the supra token at `index`.

    The tokens after the supra token are scanned for a pin cite and the
    tokens before it for an antecedent (and optional volume).

    Examples of the forms handled:
        Supra 1: Adarand, supra, at 240
        Supra 2: Adarand, 515 supra, at 240
        Supra 3: Adarand, supra, somethingelse
        Supra 4: Adrand, supra. somethingelse

    Args:
        words: The token list produced by the tokenizer.
        index: Position in `words` of the supra token.

    Returns:
        A SupraCitation whose metadata carries the guessed antecedent and
        volume (both None when nothing matched), the pin cite, and the
        parenthetical.
    """
    pin_cite, span_end, parenthetical = extract_pin_cite(words, index)

    # Look backwards from the token just before "supra" for the antecedent.
    antecedent_match = match_on_tokens(
        words,
        index - 1,
        SUPRA_ANTECEDENT_REGEX,
        strings_only=True,
        forward=False,
    )
    if antecedent_match:
        antecedent_guess = antecedent_match["antecedent"]
        volume = antecedent_match["volume"]
    else:
        antecedent_guess = volume = None

    supra_token = cast(SupraToken, words[index])
    metadata = {
        "antecedent_guess": antecedent_guess,
        "pin_cite": pin_cite,
        "parenthetical": parenthetical,
        "volume": volume,
    }
    return SupraCitation(
        supra_token,
        index,
        span_end=span_end,
        metadata=metadata,
    )
def _extract_id_citation(
    words: Tokens,
    index: int,
) -> IdCitation:
    """Build an IdCitation from the id token at `index`.

    The pin cite, the end of the citation's span, and any parenthetical
    are all obtained from `extract_pin_cite`.

    Args:
        words: The token list produced by the tokenizer.
        index: Position in `words` of the "Id." / "Ibid." token.

    Returns:
        An IdCitation whose metadata carries the pin cite and the
        parenthetical.
    """
    pin_cite, span_end, parenthetical = extract_pin_cite(words, index)

    id_token = cast(IdToken, words[index])
    metadata = {
        "pin_cite": pin_cite,
        "parenthetical": parenthetical,
    }
    return IdCitation(
        id_token,
        index,
        span_end=span_end,
        metadata=metadata,
    )
Functions
def get_citations(plain_text: str, remove_ambiguous: bool = False, tokenizer: Tokenizer = AhocorasickTokenizer()) -> List[CitationBase]
-
This is eyecite's main workhorse function. Given a string of text (e.g., a judicial opinion or other legal document), return a list of
CitationBase
objects representing the citations found in the document.
Args
plain_text
- The text to parse. You may wish to use the
clean_text()
function to pre-process your text before passing it here.
remove_ambiguous
- Whether to remove citations that might refer to more than one reporter and can't be narrowed down by date.
tokenizer
- An instance of a Tokenizer object. See
eyecite.tokenizers
for information about available tokenizers. Uses the
AhocorasickTokenizer
by default.
Returns
A list of
CitationBase
objects
Expand source code
def get_citations(
    plain_text: str,
    remove_ambiguous: bool = False,
    tokenizer: Tokenizer = default_tokenizer,
) -> List[CitationBase]:
    """Find the citations in a piece of text.

    This is eyecite's main entry point. The text is tokenized and every
    citation-bearing token is converted into an
    `eyecite.models.CitationBase` object.

    Args:
        plain_text: The text to parse (e.g., a judicial opinion or other
            legal document). You may wish to pre-process it with
            `eyecite.clean.clean_text` before passing it here.
        remove_ambiguous: Whether to remove citations that might refer to
            more than one reporter and can't be narrowed down by date.
        tokenizer: An instance of a Tokenizer object. See
            `eyecite.tokenizers` for information about available
            tokenizers. Uses the `eyecite.tokenizers.AhocorasickTokenizer`
            by default.

    Returns:
        A list of `eyecite.models.CitationBase` objects, in the order in
        which the tokenizer yielded their tokens.
    """
    # Easter egg: the literal input "eyecite" short-circuits everything.
    if plain_text == "eyecite":
        return joke_cite

    words, citation_tokens = tokenizer.tokenize(plain_text)

    found: List[CitationBase] = []
    for position, token in citation_tokens:
        # Exact type comparison (not isinstance) is deliberate here: only
        # the four token classes below are turned into citations.
        kind = type(token)

        if kind is CitationToken:
            # A reporter, law journal, or law. The token itself says
            # whether it was matched as a short form or a full citation.
            extract = (
                _extract_shortform_citation
                if cast(CitationToken, token).short
                else _extract_full_citation
            )
            found.append(extract(words, position))
        elif kind is IdToken:
            # "Id." / "Ibid.": refers to the item cited immediately prior,
            # but resolving that reference is left to the caller.
            found.append(_extract_id_citation(words, position))
        elif kind is SupraToken:
            # "supra": the antecedent could be any earlier citation, so it
            # is likewise left unresolved here.
            found.append(_extract_supra_citation(words, position))
        elif kind is SectionToken:
            # A section marker that matched nothing above. It is recorded
            # anyway so the list of possible antecedents for id citations
            # stays accurate.
            found.append(
                UnknownCitation(cast(SectionToken, token), position)
            )
        # Any other token type is not a citation and is skipped.

    # Optionally drop citations with several reporter candidates where the
    # correct reporter could not be guessed. Either way the result keeps
    # document order, which matters when the references of
    # ShortCaseCitation, SupraCitation, and IdCitation objects are later
    # reconstructed.
    return disambiguate_reporters(found) if remove_ambiguous else found