Source code for architxt.nlp.utils

import warnings
from collections.abc import Generator, Iterable, Sequence

from unidecode import unidecode

from architxt.nlp.model import Entity, Relation

__all__ = ['split_entities', 'split_relations', 'split_sentences']


def split_sentences(text: str) -> list[str]:
    r"""
    Transliterate Unicode characters to ASCII and split the input text into sentences based on line breaks.

    It is common for brat annotation formats to have one sentence per line.

    :param text: The input text to be split into sentences.
    :return: A list of sentences split by line breaks, with Unicode transliterated to ASCII.

    >>> split_sentences("This is à test\nAnothér-test here")
    ['This is a test', 'Another-test here']
    """
    return unidecode(text).split('\n')

def split_entities(entities: Iterable[Entity], sentences: Sequence[str]) -> Generator[list[Entity], None, None]:
    """
    Split a list of `Entity` objects based on their occurrence in different sentences.

    Entities are assigned to sentences based on their start and end positions.
    The function returns a generator of lists, where each list contains the entities corresponding
    to a specific sentence, with the entity positions adjusted to be relative to the sentence.

    :param entities: An iterable of `Entity` objects, each representing a named entity with start and end
        positions relative to the entire text.
    :param sentences: A sequence of sentences corresponding to the text from which the entities are extracted.
    :yield: A list of `Entity` objects for each sentence, with entity positions relative to that sentence.

    >>> e1 = Entity(name="Entity1", start=0, end=5, id="E1")
    >>> e2 = Entity(name="Entity2", start=6, end=15, id="E2")
    >>> e3 = Entity(name="Entity3", start=21, end=25, id="E3")
    >>> result = list(split_entities([e1, e2, e3], ["Hello world.", "This is a test."]))
    >>> len(result)
    2
    >>> len(result[0])
    1
    >>> len(result[1])
    2
    >>> result[0][0].name == "Entity1"
    True
    >>> result[1][0].name == "Entity2"
    True
    >>> result[1][1].name == "Entity3"
    True
    """
    # Sort entities by their start position
    entities = sorted(entities, key=lambda ent: (ent.start, ent.end))

    ent_i = 0  # Index to track the current entity
    sent_i = 0  # Index to track the current sentence
    start = 0  # Cumulative start index of the current sentence within the whole text

    # Iterate through each sentence
    while sent_i < len(sentences):
        sent_entities = []
        end = start + len(sentences[sent_i])  # The end index of the current sentence

        # Gather entities that belong to the current sentence
        while ent_i < len(entities) and entities[ent_i].end <= end:
            entity = entities[ent_i]

            # Calculate entity start and end positions relative to the current sentence
            ent_start = max(entity.start - start, 0)
            ent_end = min(entity.end - start, len(sentences[sent_i]))
            ent_i += 1

            # Add the entity to the list of entities for this sentence
            try:
                sent_entities.append(Entity(start=ent_start, end=ent_end, name=entity.name, id=entity.id))
            except ValueError as error:
                warnings.warn(str(error))

        # Update the start position for the next sentence
        start += len(sentences[sent_i]) + 1  # +1 accounts for the space or punctuation between sentences
        sent_i += 1

        # Yield the entities corresponding to the current sentence
        yield sent_entities

def split_relations(relations: Iterable[Relation], entities: Sequence[Sequence[Entity]]) -> list[list[Relation]]:
    """
    Split relations into sentence-specific relationships.

    Each relation is assigned to the sentence whose entities contain both its source and destination.

    :param relations: An iterable of `Relation`.
    :param entities: A sequence of sequences, where each inner sequence contains `Entity` objects
        corresponding to entities in a sentence.
    :return: A list of lists. Each inner list corresponds to a sentence and contains `Relation` objects
        for that sentence.

    >>> e1 = Entity(name="Entity1", start=0, end=1, id="E1")
    >>> e2 = Entity(name="Entity2", start=2, end=3, id="E2")
    >>> e3 = Entity(name="Entity3", start=4, end=5, id="E3")
    >>> e4 = Entity(name="Entity4", start=6, end=7, id="E4")
    >>> r1 = Relation(src="E1", dst="E2", name="relates_to")
    >>> r2 = Relation(src="E3", dst="E4", name="belongs_to")
    >>> result = split_relations([r1, r2], [[e1, e2], [e3, e4]])
    >>> len(result)
    2
    >>> result[0][0] == r1
    True
    >>> result[1][0] == r2
    True
    """
    # Initialize an empty list of relationships for each sentence
    relationship: list[list[Relation]] = [[] for _ in range(len(entities))]

    # Build a per-sentence dictionary keyed by entity ID for fast membership checks
    entity_index_map = [{entity.id: entity for entity in sentence_entities} for sentence_entities in entities]

    # Iterate through each relation and assign it to the corresponding sentence
    for rel in relations:
        # Find the sentence that contains both the source and destination entities
        sent_i: int | None = None
        for i, entity_map in enumerate(entity_index_map):
            if rel.src in entity_map and rel.dst in entity_map:
                sent_i = i
                break

        # If the relation belongs to a valid sentence, append it to the relationships
        if sent_i is not None:
            relationship[sent_i].append(rel)

    return relationship
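
A minimal usage sketch (not part of the module) of how the three helpers above might be chained on a brat-style annotated document. The sample text, offsets, and labels are hypothetical; the `Entity` and `Relation` constructor signatures follow the doctests above.

    from architxt.nlp.model import Entity, Relation
    from architxt.nlp.utils import split_entities, split_relations, split_sentences

    # Hypothetical document: one sentence per line, offsets relative to the whole text.
    text = "Aspirin treats headache\nParacetamol reduces fever"
    entities = [
        Entity(name="DRUG", start=0, end=7, id="T1"),        # "Aspirin"
        Entity(name="SYMPTOM", start=15, end=23, id="T2"),   # "headache"
        Entity(name="DRUG", start=24, end=35, id="T3"),      # "Paracetamol"
        Entity(name="SYMPTOM", start=44, end=49, id="T4"),   # "fever"
    ]
    relations = [
        Relation(src="T1", dst="T2", name="treats"),
        Relation(src="T3", dst="T4", name="treats"),
    ]

    sentences = split_sentences(text)
    # ['Aspirin treats headache', 'Paracetamol reduces fever']
    sentence_entities = list(split_entities(entities, sentences))
    # Entity offsets are now relative to each sentence.
    sentence_relations = split_relations(relations, sentence_entities)
    # Each relation is grouped with the sentence containing both of its entities.

    for sent, ents, rels in zip(sentences, sentence_entities, sentence_relations):
        print(sent, [e.name for e in ents], [r.name for r in rels])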