Module eyecite.clean

Expand source code
import re
from typing import Callable, Dict, Iterable, Union

import lxml.html


def clean_text(text, steps: Iterable[Union[str, Callable[[str], str]]]) -> str:
    """Apply a sequence of "cleaning" steps to a text string, in order, and
    return the result. Each step is either the name of a cleaner registered
    in `cleaners_lookup` (the functions in `eyecite.clean`) or a custom
    callable that takes a string and returns a string. You may wish to use
    this tool to pre-process your text before feeding it into
    `eyecite.find.get_citations`, especially if the text was
    OCR'd from a PDF.

    Args:
        text: The text to clean.
        steps: Any `Iterable` (e.g., a list) of cleaning functions to apply.

    Returns:
        The cleaned text.

    Raises:
        ValueError: If a step is neither callable nor a registered name.
    """
    for step in steps:
        # Only strings are looked up by name. A dict membership test hashes
        # its operand, so testing an arbitrary object would raise TypeError
        # for unhashable callables (e.g. a class that sets __hash__ = None).
        if isinstance(step, str) and step in cleaners_lookup:
            step_func = cleaners_lookup[step]
        elif callable(step):
            step_func = step
        else:
            raise ValueError(
                "clean_text steps must be callable "
                f"or one of {list(cleaners_lookup.keys())}"
            )
        text = step_func(text)

    return text  # type: ignore


def html(html_content: str) -> str:
    """Given HTML markup, return only text that would be rendered visibly.
    Adapted from freelawproject/juriscraper/lib/html_utils.py#L163.

    Args:
        html_content: The HTML string.

    Returns:
        The visible text nodes joined by single spaces, or an empty string
        if the input is empty or contains only whitespace.
    """
    # lxml refuses to parse an empty document (it raises ParserError), so
    # short-circuit: blank markup has no visible text.
    if not html_content.strip():
        return ""

    html_tree = lxml.html.fromstring(html_content)
    # Select non-blank text nodes, skipping any whose direct parent is a
    # <style>, <link>, <head> or <script> element.
    text = html_tree.xpath(
        """//text()[normalize-space() and not(
            parent::style |
            parent::link |
            parent::head |
            parent::script)]"""
    )
    return " ".join(text)


def inline_whitespace(text: str) -> str:
    """Squeeze every run of spaces and/or tabs down to a single space.

    Only spaces and tabs are matched; newlines and other whitespace
    characters are left untouched.

    Args:
        text: The input string.

    Returns:
        Text with collapsed spaces and tabs.
    """
    space_or_tab_run = re.compile(r"[ \t]+")
    return space_or_tab_run.sub(" ", text)


def all_whitespace(text: str) -> str:
    """Squeeze every run of whitespace (spaces, tabs, newlines, etc.) down
    to a single space.

    Leading and trailing runs are collapsed as well, not stripped.

    Args:
        text: The input string.

    Returns:
        Text with collapsed whitespace characters.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)


def underscores(text: str) -> str:
    """Delete every run of two or more consecutive underscores, which are
    common in text extracted from PDFs.

    Single underscores are kept.

    Args:
        text: The input string.

    Returns:
        Text with runs of two or more underscores removed.
    """
    underscore_run = re.compile(r"__+")
    return underscore_run.sub("", text)


# Registry of the built-in cleaning functions, keyed by the step name that
# callers may pass to `clean_text` in place of a callable.
cleaners_lookup: Dict[str, Callable[[str], str]] = dict(
    html=html,
    inline_whitespace=inline_whitespace,
    all_whitespace=all_whitespace,
    underscores=underscores,
)

Functions

def all_whitespace(text: str) ‑> str

Collapse multiple whitespace characters within a string into one space character.

Args

text
The input string.

Returns

Text with collapsed whitespace characters.

Expand source code
def all_whitespace(text: str) -> str:
    """Squeeze every run of whitespace (spaces, tabs, newlines, etc.) down
    to a single space.

    Leading and trailing runs are collapsed as well, not stripped.

    Args:
        text: The input string.

    Returns:
        Text with collapsed whitespace characters.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)
def clean_text(text, steps: Iterable[Union[str, Callable[[str], str]]]) ‑> str

Given a list of "cleaning" functions, apply each in sequence to a given text string and return the result. Steps may be the names of functions in eyecite.clean, or other custom callables. You may wish to use this tool to pre-process your text before feeding it into get_citations(), especially if the text was OCR'd from a PDF.

Args

text
The text to clean.
steps
Any Iterable (e.g., a list) of cleaning functions to apply.

Returns

The cleaned text.

Expand source code
def clean_text(text, steps: Iterable[Union[str, Callable[[str], str]]]) -> str:
    """Apply a sequence of "cleaning" steps to a text string, in order, and
    return the result. Each step is either the name of a cleaner registered
    in `cleaners_lookup` (the functions in `eyecite.clean`) or a custom
    callable that takes a string and returns a string. You may wish to use
    this tool to pre-process your text before feeding it into
    `eyecite.find.get_citations`, especially if the text was
    OCR'd from a PDF.

    Args:
        text: The text to clean.
        steps: Any `Iterable` (e.g., a list) of cleaning functions to apply.

    Returns:
        The cleaned text.

    Raises:
        ValueError: If a step is neither callable nor a registered name.
    """
    for step in steps:
        # Only strings are looked up by name. A dict membership test hashes
        # its operand, so testing an arbitrary object would raise TypeError
        # for unhashable callables (e.g. a class that sets __hash__ = None).
        if isinstance(step, str) and step in cleaners_lookup:
            step_func = cleaners_lookup[step]
        elif callable(step):
            step_func = step
        else:
            raise ValueError(
                "clean_text steps must be callable "
                f"or one of {list(cleaners_lookup.keys())}"
            )
        text = step_func(text)

    return text  # type: ignore
def html(html_content: str) ‑> str

Given HTML markup, return only text that would be rendered visibly. Adapted from freelawproject/juriscraper/lib/html_utils.py#L163.

Args

html_content
The HTML string.

Returns

Text that is visible.

Expand source code
def html(html_content: str) -> str:
    """Given HTML markup, return only text that would be rendered visibly.
    Adapted from freelawproject/juriscraper/lib/html_utils.py#L163.

    Args:
        html_content: The HTML string.

    Returns:
        The visible text nodes joined by single spaces, or an empty string
        if the input is empty or contains only whitespace.
    """
    # lxml refuses to parse an empty document (it raises ParserError), so
    # short-circuit: blank markup has no visible text.
    if not html_content.strip():
        return ""

    html_tree = lxml.html.fromstring(html_content)
    # Select non-blank text nodes, skipping any whose direct parent is a
    # <style>, <link>, <head> or <script> element.
    text = html_tree.xpath(
        """//text()[normalize-space() and not(
            parent::style |
            parent::link |
            parent::head |
            parent::script)]"""
    )
    return " ".join(text)
def inline_whitespace(text: str) ‑> str

Collapse multiple spaces or tabs within a string into one space character.

Args

text
The input string.

Returns

Text with collapsed spaces and tabs.

Expand source code
def inline_whitespace(text: str) -> str:
    """Squeeze every run of spaces and/or tabs down to a single space.

    Only spaces and tabs are matched; newlines and other whitespace
    characters are left untouched.

    Args:
        text: The input string.

    Returns:
        Text with collapsed spaces and tabs.
    """
    space_or_tab_run = re.compile(r"[ \t]+")
    return space_or_tab_run.sub(" ", text)
def underscores(text: str) ‑> str

Remove strings of two or more underscores that are common in text extracted from PDFs.

Args

text
The input string.

Returns

Text with runs of two or more underscores removed.

Expand source code
def underscores(text: str) -> str:
    """Delete every run of two or more consecutive underscores, which are
    common in text extracted from PDFs.

    Single underscores are kept.

    Args:
        text: The input string.

    Returns:
        Text with runs of two or more underscores removed.
    """
    underscore_run = re.compile(r"__+")
    return underscore_run.sub("", text)