Module eyecite.clean
Expand source code
import re
from typing import Callable, Dict, Iterable, Union
import lxml.html
def clean_text(
    text: str, steps: Iterable[Union[str, Callable[[str], str]]]
) -> str:
    """Given a list of "cleaning" functions, apply each in sequence to a
    given text string and return the result. Steps may be the names of
    functions in `eyecite.clean`, or other custom callables. You may wish to
    use this tool to pre-process your text before feeding it into
    `eyecite.find.get_citations`, especially if the text was
    OCR'd from a PDF.

    Args:
        text: The text to clean.
        steps: Any `Iterable` (e.g., a list) of cleaning functions to apply.
            Each item is either a key of `cleaners_lookup` or a callable
            taking the text and returning the cleaned text.

    Returns:
        The cleaned text.

    Raises:
        ValueError: If a step is neither callable nor a key of
            `cleaners_lookup`.
    """
    for step in steps:
        # Only strings can be registry keys. Checking the type first keeps
        # unhashable steps (e.g. a callable object that defines __eq__) away
        # from the dict membership test, which would raise TypeError.
        if isinstance(step, str) and step in cleaners_lookup:
            step_func = cleaners_lookup[step]
        elif callable(step):
            step_func = step
        else:
            raise ValueError(
                "clean_text steps must be callable "
                f"or one of {list(cleaners_lookup.keys())}"
            )
        text = step_func(text)
    return text
def html(html_content: str) -> str:
    """Given HTML markup, return only text that would be rendered visibly.

    Adopted from freelawproject/juriscraper/lib/html_utils.py#L163.

    Args:
        html_content: The HTML string.

    Returns:
        Text that is visible: the selected text nodes joined by single
        spaces. Empty or whitespace-only input yields an empty string.
    """
    # lxml refuses to parse an empty document (it raises ParserError), so
    # answer directly: markup without content has no visible text.
    if not html_content.strip():
        return ""
    html_tree = lxml.html.fromstring(html_content)
    # Select every non-blank text node whose direct parent is not one of
    # the listed non-rendered elements.
    text = html_tree.xpath(
        """//text()[normalize-space() and not(
parent::style |
parent::link |
parent::head |
parent::script)]"""
    )
    return " ".join(text)
def inline_whitespace(text: str) -> str:
    """Squeeze each run of spaces and/or tabs down to a single space.

    Only spaces and tabs are matched, so newlines are left untouched.

    Args:
        text: The input string.

    Returns:
        The string with every space/tab run replaced by one space.
    """
    space_or_tab_run = re.compile(r"[ \t]+")
    return space_or_tab_run.sub(" ", text)
def all_whitespace(text: str) -> str:
    """Squeeze each run of whitespace of any kind down to a single space.

    Unlike `str.split`/`join`, leading and trailing runs are kept (as one
    space each) rather than stripped.

    Args:
        text: The input string.

    Returns:
        The string with every whitespace run replaced by one space.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)
def underscores(text: str) -> str:
    """Delete every run of two or more consecutive underscores.

    Such runs are common in text extracted from PDFs. A lone underscore is
    left in place.

    Args:
        text: The input string.

    Returns:
        The string with all multi-underscore runs removed.
    """
    underscore_run = re.compile(r"__+")
    return underscore_run.sub("", text)
# Registry of the built-in cleaners, keyed by the step name that
# `clean_text` accepts in place of a callable.
cleaners_lookup: Dict[str, Callable[[str], str]] = dict(
    (
        ("html", html),
        ("inline_whitespace", inline_whitespace),
        ("all_whitespace", all_whitespace),
        ("underscores", underscores),
    )
)
Functions
def all_whitespace(text: str) ‑> str
-
Collapse multiple whitespace characters within a string into one space character.
Args
text
- The input string.
Returns
Text with collapsed whitespace characters.
Expand source code
def all_whitespace(text: str) -> str:
    """Squeeze each run of whitespace of any kind down to a single space.

    Unlike `str.split`/`join`, leading and trailing runs are kept (as one
    space each) rather than stripped.

    Args:
        text: The input string.

    Returns:
        The string with every whitespace run replaced by one space.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)
def clean_text(text, steps: Iterable[Union[str, Callable[[str], str]]]) ‑> str
-
Given a list of "cleaning" functions, apply each in sequence to a given text string and return the result. Steps may be the names of functions in `eyecite.clean`, or other custom callables. You may wish to use this tool to pre-process your text before feeding it into `get_citations()`, especially if the text was OCR'd from a PDF.
Args
text
- The text to clean.
steps
- Any `Iterable` (e.g., a list) of cleaning functions to apply.
Returns
The cleaned text.
Expand source code
def clean_text(
    text: str, steps: Iterable[Union[str, Callable[[str], str]]]
) -> str:
    """Given a list of "cleaning" functions, apply each in sequence to a
    given text string and return the result. Steps may be the names of
    functions in `eyecite.clean`, or other custom callables. You may wish to
    use this tool to pre-process your text before feeding it into
    `eyecite.find.get_citations`, especially if the text was
    OCR'd from a PDF.

    Args:
        text: The text to clean.
        steps: Any `Iterable` (e.g., a list) of cleaning functions to apply.
            Each item is either a key of `cleaners_lookup` or a callable
            taking the text and returning the cleaned text.

    Returns:
        The cleaned text.

    Raises:
        ValueError: If a step is neither callable nor a key of
            `cleaners_lookup`.
    """
    for step in steps:
        # Only strings can be registry keys. Checking the type first keeps
        # unhashable steps (e.g. a callable object that defines __eq__) away
        # from the dict membership test, which would raise TypeError.
        if isinstance(step, str) and step in cleaners_lookup:
            step_func = cleaners_lookup[step]
        elif callable(step):
            step_func = step
        else:
            raise ValueError(
                "clean_text steps must be callable "
                f"or one of {list(cleaners_lookup.keys())}"
            )
        text = step_func(text)
    return text
def html(html_content: str) ‑> str
-
Given HTML markup, return only text that would be rendered visibly. Adopted from freelawproject/juriscraper/lib/html_utils.py#L163.
Args
html_content
- The HTML string.
Returns
Text that is visible.
Expand source code
def html(html_content: str) -> str:
    """Given HTML markup, return only text that would be rendered visibly.

    Adopted from freelawproject/juriscraper/lib/html_utils.py#L163.

    Args:
        html_content: The HTML string.

    Returns:
        Text that is visible: the selected text nodes joined by single
        spaces. Empty or whitespace-only input yields an empty string.
    """
    # lxml refuses to parse an empty document (it raises ParserError), so
    # answer directly: markup without content has no visible text.
    if not html_content.strip():
        return ""
    html_tree = lxml.html.fromstring(html_content)
    # Select every non-blank text node whose direct parent is not one of
    # the listed non-rendered elements.
    text = html_tree.xpath(
        """//text()[normalize-space() and not( parent::style | parent::link | parent::head | parent::script)]"""
    )
    return " ".join(text)
def inline_whitespace(text: str) ‑> str
-
Collapse multiple spaces or tabs within a string into one space character.
Args
text
- The input string.
Returns
Text with collapsed spaces and tabs.
Expand source code
def inline_whitespace(text: str) -> str:
    """Squeeze each run of spaces and/or tabs down to a single space.

    Only spaces and tabs are matched, so newlines are left untouched.

    Args:
        text: The input string.

    Returns:
        The string with every space/tab run replaced by one space.
    """
    space_or_tab_run = re.compile(r"[ \t]+")
    return space_or_tab_run.sub(" ", text)
def underscores(text: str) ‑> str
-
Remove strings of two or more underscores that are common in text extracted from PDFs.
Args
text
- The input string.
Returns
Text without underscores.
Expand source code
def underscores(text: str) -> str:
    """Delete every run of two or more consecutive underscores.

    Such runs are common in text extracted from PDFs. A lone underscore is
    left in place.

    Args:
        text: The input string.

    Returns:
        The string with all multi-underscore runs removed.
    """
    underscore_run = re.compile(r"__+")
    return underscore_run.sub("", text)