Module `eyecite.annotate`

Expand source code

from bisect import bisect_left, bisect_right
from collections.abc import Iterable
from difflib import SequenceMatcher
from functools import partial
from logging import getLogger
from typing import Any, Callable, Optional

import fast_diff_match_patch

from eyecite.utils import (
    is_balanced_html,
    maybe_balance_style_tags,
    placeholder_markup,
    wrap_html_tags,
)

logger = getLogger(__name__)


class SpanUpdater:
    """Helper object to shift offsets from text_before to text_after.

    For example:
    >>> text_before = "foo bar"
    >>> text_after = "foo <i>bar</i>"
    >>> updater = SpanUpdater(text_before, text_after)

    Offset 1 lies in the unchanged prefix, so it is still at offset 1:
    >>> updater.update(1, bisect_right)
    1

    Offset 4 sits exactly where "<i>" was inserted. Treated as the start
    of a span (bisect_right) it moves past the insertion; treated as the
    end of a span (bisect_left) it stays in front of it:
    >>> updater.update(4, bisect_right)
    7
    >>> updater.update(4, bisect_left)
    4

    Offset 7 (the end of "bar") has moved to offset 10:
    >>> updater.update(7, bisect_left)
    10
    """

    def __init__(self, text_before, text_after, use_dmp=True):
        """To set up, we need to populate self.offsets and self.updaters:
            >>> SpanUpdater(text_before, text_after).offsets
            [0, 4]
            >>> SpanUpdater(text_before, text_after).updaters
            [partial(shift_offset, delta=0), partial(shift_offset, delta=3)]
        This indicates that offsets 0 to 4 need to be shifted by 0,
        and offsets 4 and up need to be shifted by 3.

        Args:
            text_before: The text the incoming offsets refer to.
            text_after: The text the returned offsets should refer to.
            use_dmp: If True (default), diff with fast_diff_match_patch
                via `get_diff_steps`; otherwise use the difflib-based
                `get_diff_steps_builtin`.
        """

        # helpers for the two kinds of updates we need to apply to offsets:
        def shift_offset(offset, delta):
            return offset + delta

        def replace_offset(offset, new_offset):
            return new_offset

        # diff the two strings and set self.offsets and self.updaters:
        offset = 0
        delta = 0
        # self.offsets[i] is the first text_before offset handled by
        # self.updaters[i]; both lists grow in lockstep.
        self.offsets = offsets = []
        self.updaters = updaters = []
        get_diff_steps = (
            self.get_diff_steps if use_dmp else self.get_diff_steps_builtin
        )
        for operation, amount in get_diff_steps(text_before, text_after):
            if operation == "=":
                # start a new range with a relative delta,
                # and push the offset forward
                offsets.append(offset)
                updaters.append(partial(shift_offset, delta=delta))
                offset += amount
            elif operation == "+":
                # push the delta forward
                delta += amount
            else:  # operation == '-'
                # Start a new range with an absolute delta: every offset
                # inside the deleted run collapses onto the same position.
                # Push the offset forward and delta backward.
                offsets.append(offset)
                updaters.append(
                    partial(replace_offset, new_offset=offset + delta)
                )
                offset += amount
                delta -= amount

    @staticmethod
    def get_diff_steps(a: str, b: str):
        """Yield steps to turn a into b. Example:
            >>> list(SpanUpdater.get_diff_steps("12 34 56", "12 78 34"))
            [('=', 3), ('+', 3), ('=', 2), ('-', 3)]
        Meaning: to turn a into b, keep the first 3 characters the same,
        insert three new characters (we don't care what), keep the next
        two characters, delete three characters.

        Raises:
            AttributeError: re-raised with an explanatory message when the
                fast_diff_match_patch call itself raises AttributeError.
        """
        try:
            return fast_diff_match_patch.diff(
                a, b, timelimit=0, checklines=False, cleanup="No"
            )
        except AttributeError as e:
            raise AttributeError(
                "This may be caused by having the diff_match_patch package "
                "installed, which is incompatible with "
                "fast_diff_match_patch_python."
            ) from e

    @staticmethod
    def get_diff_steps_builtin(a: str, b: str):
        """Same as get_diff_steps but using the builtin difflib.
        Much slower but potentially useful for debugging."""
        diffs = SequenceMatcher(a=a, b=b, autojunk=False)
        for operation, a1, a2, b1, b2 in diffs.get_opcodes():
            if operation == "insert":
                yield "+", b2 - b1
            elif operation == "replace":
                # a replacement is reported as a deletion followed by
                # an insertion
                yield "-", a2 - a1
                yield "+", b2 - b1
            elif operation == "delete":
                yield "-", a2 - a1
            elif operation == "equal":
                yield "=", a2 - a1

    def update(self, offset, bisect):
        """Shift an offset left or right.

        Args:
            offset: Position in text_before.
            bisect: Either `bisect.bisect_right` or `bisect.bisect_left`.
                It decides which range an offset lying exactly on a range
                boundary belongs to: bisect_right picks the range that
                starts there, bisect_left picks the range before it.

        Returns:
            The corresponding position in text_after.
        """
        if not self.updaters:
            # The diff contained no "=" or "-" step (text_before is empty),
            # so there is no range to consult; leave the offset untouched
            # instead of raising IndexError.
            return offset
        # bisect_left returns 0 for an offset at (or before) the first range
        # start, which would make the index -1 and silently select the LAST
        # range. Clamp so such offsets use the first range.
        index = max(bisect(self.offsets, offset) - 1, 0)
        updater = self.updaters[index]
        return updater(offset)


def annotate_citations(
    plain_text: str,
    annotations: Iterable[tuple[tuple[int, int], Any, Any]],
    source_text: str = "",
    unbalanced_tags: str = "unchecked",
    use_dmp: bool = True,
    annotator: Optional[Callable[[Any, str, Any], str]] = None,
    offset_updater: Optional[SpanUpdater] = None,
) -> str:
    """Given a list of citations and the text from which they were parsed,
    insert annotations into the text surrounding each citation. This could be
    useful for linking the citations to a URL, or otherwise indicating that
    they were successfully parsed or resolved.

    If you pre-processed your text before extracting the citations, this
    function will intelligently reconcile the differences between the original
    source text and the cleaned text using a diffing algorithm, ensuring that
    each annotation is inserted in the correct location.

    Annotations are applied in sorted order. When an annotation overlaps the
    previous one, only the part after the previous annotation is annotated;
    an annotation that is entirely covered is dropped.

    Example:
    >>> plain_text = "foo 1 U.S. 1 bar"
    >>> citations = get_citations(plain_text)
    >>> annotate_citations(plain_text,
    ...     [(citations[0].span(), "<a>", "</a>")])
    'foo <a>1 U.S. 1</a> bar'

    Args:
        plain_text: The text containing the citations. If this text was
            cleaned, you should also pass the `source_text` below.
        annotations: An iterable of tuples, each holding (1) the start and
            end positions of the citation in the text, (2) the text to insert
            before the citation, and (3) the text to insert after the
            citation.
        source_text: If provided, apply annotations to this text instead using
            a diffing algorithm.
        unbalanced_tags: One of "unchecked" (default), "skip" or "wrap".
            unbalanced_tags="skip" will skip inserting annotations that
            result in invalid HTML. unbalanced_tags="wrap" will ensure valid
            HTML by wrapping annotations around any unbalanced tags.
        use_dmp: If `True` (default), use the fast_diff_match_patch_python
            library for diffing. If `False`, use the slower built-in difflib,
            which may be useful for debugging.
        annotator: If provided, should be a function that takes three
            arguments (the text to insert before, the text of the citation,
            and the text to insert after) and returns the annotation. This is
            useful for customizing the annotation action: If you don't pass
            this function, eyecite will simply concatenate the before_text,
            citation_text, and after_text together for each annotation.
        offset_updater: If provided, use this SpanUpdater. Citation finding
            for HTML / XML sources uses a SpanUpdater called `plain_to_markup`
            to find citations; passing it saves this expensive instantiation.
            When it is passed, the annotations are applied to `source_text`.
    Returns:
        The annotated text.
    Raises:
        ValueError: If `unbalanced_tags` is not one of the known options.
    """
    if unbalanced_tags not in ("unchecked", "skip", "wrap"):
        raise ValueError(f"Unknown option '{unbalanced_tags}'")

    # set up offset_updater if we have to move annotations to source_text
    if offset_updater is not None:
        plain_text = source_text
    elif source_text and source_text != plain_text:
        placeholder_text = placeholder_markup(source_text)
        offset_updater = SpanUpdater(
            plain_text, placeholder_text, use_dmp=use_dmp
        )
        plain_text = source_text

    # append text for each annotation to out
    annotations = sorted(annotations)
    out = []
    last_end = 0
    for (start, end), before, after in annotations:
        # if we're applying to source_text, update offsets
        if offset_updater is not None:
            # bisect_right for the start and bisect_left for the end, so an
            # offset on a range boundary resolves to the later range for a
            # start and the earlier range for an end
            start = offset_updater.update(start, bisect_right)
            end = offset_updater.update(end, bisect_left)

        # handle overlaps
        if start < last_end:
            # include partial annotation if possible
            start = last_end
            if start >= end:
                # if annotation is entirely covered, skip
                continue

        span_text = plain_text[start:end]

        # handle HTML tags
        if unbalanced_tags == "unchecked":
            pass
        elif not is_balanced_html(span_text):
            if unbalanced_tags == "wrap":
                # NOTE(review): `after` is passed ahead of `before` here;
                # presumably each tag inside the span gets the annotation
                # closed before it and re-opened after it -- confirm against
                # wrap_html_tags.
                span_text = wrap_html_tags(span_text, after, before)
            else:  # "skip" case
                original_span_text = span_text
                # try adjusting the span first; only give up on this
                # annotation if the result is still unbalanced
                start, end, span_text = maybe_balance_style_tags(
                    start, end, plain_text
                )
                if not is_balanced_html(span_text):
                    logger.warning(
                        "Citation was not annotated due to unbalanced tags %s",
                        original_span_text,
                    )
                    continue

        if annotator is not None:
            annotated_span = annotator(before, span_text, after)
        else:
            annotated_span = before + span_text + after

        # append the untouched text since the previous annotation,
        # then the annotated span itself
        out.extend(
            [
                plain_text[last_end:start],
                annotated_span,
            ]
        )
        last_end = end

    # append text after final citation
    if last_end < len(plain_text):
        out.append(plain_text[last_end:])

    return "".join(out)

Functions

def annotate_citations(plain_text: str, annotations: collections.abc.Iterable[tuple[tuple[int, int], typing.Any, typing.Any]], source_text: str = '', unbalanced_tags: str = 'unchecked', use_dmp: bool = True, annotator: Optional[Callable[[Any, str, Any], str]] = None, offset_updater: Optional[eyecite.annotate.SpanUpdater] = None) ‑> str

Given a list of citations and the text from which they were parsed, insert annotations into the text surrounding each citation. This could be useful for linking the citations to a URL, or otherwise indicating that they were successfully parsed or resolved.

If you pre-processed your text before extracting the citations, this function will intelligently reconcile the differences between the original source text and the cleaned text using a diffing algorithm, ensuring that each annotation is inserted in the correct location.

Example:

>>> plain_text = "foo 1 U.S. 1 bar"
>>> citations = get_citations(plain_text)
>>> annotate_citations("foo 1 U.S. 1 bar",
...     [(citations[0].span(), "<a>", "</a>")])
'foo <a>1 U.S. 1</a> bar'

Args

plain_text: The text containing the citations. If this text was cleaned, you should also pass the source_text below.
annotations: An iterable of tuples, each containing (1) the start and end positions of the citation in the text, (2) the text to insert before the citation, and (3) the text to insert after the citation.
source_text: If provided, apply annotations to this text instead using a diffing algorithm.
unbalanced_tags: If provided, unbalanced_tags="skip" will skip inserting annotations that result in invalid HTML. unbalanced_tags="wrap" will ensure valid HTML by wrapping annotations around any unbalanced tags.
use_dmp: If True (default), use the fast_diff_match_patch_python library for diffing. If False, use the slower built-in difflib, which may be useful for debugging.
annotator: If provided, should be a function that takes three arguments (the text to insert before, the text of the citation, and the text to insert after) and returns the annotation. This is useful for customizing the annotation action: If you don't pass this function, eyecite will simply concatenate the before_text, citation_text, and after_text together for each annotation.
offset_updater: If provided, use this SpanUpdater. Citation finding for HTML / XML sources uses a SpanUpdater called plain_to_markup to find citations; passing it here saves this expensive instantiation.

Returns

The annotated text.

Expand source code

def annotate_citations(
    plain_text: str,
    annotations: Iterable[tuple[tuple[int, int], Any, Any]],
    source_text: str = "",
    unbalanced_tags: str = "unchecked",
    use_dmp: bool = True,
    annotator: Optional[Callable[[Any, str, Any], str]] = None,
    offset_updater: Optional[SpanUpdater] = None,
) -> str:
    """Given a list of citations and the text from which they were parsed,
    insert annotations into the text surrounding each citation. This could be
    useful for linking the citations to a URL, or otherwise indicating that
    they were successfully parsed or resolved.

    If you pre-processed your text before extracting the citations, this
    function will intelligently reconcile the differences between the original
    source text and the cleaned text using a diffing algorithm, ensuring that
    each annotation is inserted in the correct location.

    Annotations are applied in sorted order. When an annotation overlaps the
    previous one, only the part after the previous annotation is annotated;
    an annotation that is entirely covered is dropped.

    Example:
    >>> plain_text = "foo 1 U.S. 1 bar"
    >>> citations = get_citations(plain_text)
    >>> annotate_citations(plain_text,
    ...     [(citations[0].span(), "<a>", "</a>")])
    'foo <a>1 U.S. 1</a> bar'

    Args:
        plain_text: The text containing the citations. If this text was
            cleaned, you should also pass the `source_text` below.
        annotations: An iterable of tuples, each holding (1) the start and
            end positions of the citation in the text, (2) the text to insert
            before the citation, and (3) the text to insert after the
            citation.
        source_text: If provided, apply annotations to this text instead using
            a diffing algorithm.
        unbalanced_tags: One of "unchecked" (default), "skip" or "wrap".
            unbalanced_tags="skip" will skip inserting annotations that
            result in invalid HTML. unbalanced_tags="wrap" will ensure valid
            HTML by wrapping annotations around any unbalanced tags.
        use_dmp: If `True` (default), use the fast_diff_match_patch_python
            library for diffing. If `False`, use the slower built-in difflib,
            which may be useful for debugging.
        annotator: If provided, should be a function that takes three
            arguments (the text to insert before, the text of the citation,
            and the text to insert after) and returns the annotation. This is
            useful for customizing the annotation action: If you don't pass
            this function, eyecite will simply concatenate the before_text,
            citation_text, and after_text together for each annotation.
        offset_updater: If provided, use this SpanUpdater. Citation finding
            for HTML / XML sources uses a SpanUpdater called `plain_to_markup`
            to find citations; passing it saves this expensive instantiation.
            When it is passed, the annotations are applied to `source_text`.
    Returns:
        The annotated text.
    Raises:
        ValueError: If `unbalanced_tags` is not one of the known options.
    """
    if unbalanced_tags not in ("unchecked", "skip", "wrap"):
        raise ValueError(f"Unknown option '{unbalanced_tags}'")

    # set up offset_updater if we have to move annotations to source_text
    if offset_updater is not None:
        plain_text = source_text
    elif source_text and source_text != plain_text:
        placeholder_text = placeholder_markup(source_text)
        offset_updater = SpanUpdater(
            plain_text, placeholder_text, use_dmp=use_dmp
        )
        plain_text = source_text

    # append text for each annotation to out
    annotations = sorted(annotations)
    out = []
    last_end = 0
    for (start, end), before, after in annotations:
        # if we're applying to source_text, update offsets
        if offset_updater is not None:
            # bisect_right for the start and bisect_left for the end, so an
            # offset on a range boundary resolves to the later range for a
            # start and the earlier range for an end
            start = offset_updater.update(start, bisect_right)
            end = offset_updater.update(end, bisect_left)

        # handle overlaps
        if start < last_end:
            # include partial annotation if possible
            start = last_end
            if start >= end:
                # if annotation is entirely covered, skip
                continue

        span_text = plain_text[start:end]

        # handle HTML tags
        if unbalanced_tags == "unchecked":
            pass
        elif not is_balanced_html(span_text):
            if unbalanced_tags == "wrap":
                # NOTE(review): `after` is passed ahead of `before` here;
                # presumably each tag inside the span gets the annotation
                # closed before it and re-opened after it -- confirm against
                # wrap_html_tags.
                span_text = wrap_html_tags(span_text, after, before)
            else:  # "skip" case
                original_span_text = span_text
                # try adjusting the span first; only give up on this
                # annotation if the result is still unbalanced
                start, end, span_text = maybe_balance_style_tags(
                    start, end, plain_text
                )
                if not is_balanced_html(span_text):
                    logger.warning(
                        "Citation was not annotated due to unbalanced tags %s",
                        original_span_text,
                    )
                    continue

        if annotator is not None:
            annotated_span = annotator(before, span_text, after)
        else:
            annotated_span = before + span_text + after

        # append the untouched text since the previous annotation,
        # then the annotated span itself
        out.extend(
            [
                plain_text[last_end:start],
                annotated_span,
            ]
        )
        last_end = end

    # append text after final citation
    if last_end < len(plain_text):
        out.append(plain_text[last_end:])

    return "".join(out)