Source code for architxt.nlp.utils

import warnings
from collections.abc import Generator, Iterable, Sequence

from unidecode import unidecode

from architxt.nlp.model import Entity, Relation

__all__ = ['split_entities', 'split_relations', 'split_sentences']


def split_sentences(text: str) -> list[str]:
    r"""
    Transliterate Unicode characters to ASCII and split the input text into sentences based on line breaks.

    It is common for brat annotation formats to have one sentence per line.

    :param text: The input text to be split into sentences.
    :return: A list of sentences split by line breaks, with Unicode transliterated to ASCII.

    >>> split_sentences("This is à test\nAnothér-test here")
    ['This is a test', 'Another-test here']
    """
    return unidecode(text).split('\n')

def split_entities(entities: Iterable[Entity], sentences: Sequence[str]) -> Generator[list[Entity], None, None]:
    """
    Split a list of `Entity` objects based on their occurrence in different sentences.

    Entities are assigned to sentences based on their start and end positions.
    The function returns a generator of lists, where each list contains the entities corresponding
    to a specific sentence, with the entity positions adjusted to be relative to the sentence.

    :param entities: An iterable of `Entity` objects, each representing a named entity with start and end
        positions relative to the entire text.
    :param sentences: A sequence of sentences corresponding to the text from which the entities are extracted.
    :yield: A list of `Entity` objects for each sentence, with entity positions relative to that sentence.

    >>> e1 = Entity(name="Entity1", start=0, end=5, id="E1")
    >>> e2 = Entity(name="Entity2", start=6, end=15, id="E2")
    >>> e3 = Entity(name="Entity3", start=21, end=25, id="E3")
    >>> result = list(split_entities([e1, e2, e3], ["Hello world.", "This is a test."]))
    >>> len(result)
    2
    >>> len(result[0])
    1
    >>> len(result[1])
    2
    >>> result[0][0].name == "Entity1"
    True
    >>> result[1][0].name == "Entity2"
    True
    >>> result[1][1].name == "Entity3"
    True
    """
    # Sort entities by their start position
    entities = sorted(entities, key=lambda ent: (ent.start, ent.end))

    ent_i = 0  # Index to track the current entity
    sent_i = 0  # Index to track the current sentence
    start = 0  # Cumulative start index of the current sentence within the whole text

    # Iterate through each sentence
    while sent_i < len(sentences):
        sent_entities = []
        end = start + len(sentences[sent_i])  # The end index of the current sentence

        # Gather entities that belong to the current sentence
        while ent_i < len(entities) and entities[ent_i].end <= end:
            entity = entities[ent_i]

            # Calculate entity start and end positions relative to the current sentence
            ent_start = max(entity.start - start, 0)
            ent_end = min(entity.end - start, len(sentences[sent_i]))
            ent_i += 1

            # Add the entity to the list of entities for this sentence
            try:
                sent_entities.append(Entity(start=ent_start, end=ent_end, name=entity.name, id=entity.id))
            except ValueError as error:
                warnings.warn(str(error))

        # Update the start position for the next sentence
        start += len(sentences[sent_i]) + 1  # +1 accounts for the space or punctuation between sentences
        sent_i += 1

        # Yield the entities corresponding to the current sentence
        yield sent_entities

def split_relations(relations: Iterable[Relation], entities: Sequence[Sequence[Entity]]) -> list[list[Relation]]:
    """
    Split relations into sentence-specific relationships.

    Each relation is assigned to the sentence whose entities contain both its source and destination.

    :param relations: An iterable of `Relation`.
    :param entities: A sequence of sequences, where each inner sequence contains `Entity` objects
        corresponding to entities in a sentence.
    :return: A list of lists. Each inner list corresponds to a sentence and contains `Relation` objects
        for that sentence.

    >>> e1 = Entity(name="Entity1", start=0, end=1, id="E1")
    >>> e2 = Entity(name="Entity2", start=2, end=3, id="E2")
    >>> e3 = Entity(name="Entity3", start=4, end=5, id="E3")
    >>> e4 = Entity(name="Entity4", start=6, end=7, id="E4")
    >>> r1 = Relation(src="E1", dst="E2", name="relates_to")
    >>> r2 = Relation(src="E3", dst="E4", name="belongs_to")
    >>> result = split_relations([r1, r2], [[e1, e2], [e3, e4]])
    >>> len(result)
    2
    >>> result[0][0] == r1
    True
    >>> result[1][0] == r2
    True
    """
    # Initialize an empty list of relationships for each sentence
    relationship: list[list[Relation]] = [[] for _ in range(len(entities))]

    # Build a per-sentence dictionary keyed by entity ID for fast membership checks
    entity_index_map = [{entity.id: entity for entity in sentence_entities} for sentence_entities in entities]

    # Iterate through each relation and assign it to the corresponding sentence
    for rel in relations:
        # Find the sentence that contains both the source and destination entities
        sent_i: int | None = None
        for i, entity_map in enumerate(entity_index_map):
            if rel.src in entity_map and rel.dst in entity_map:
                sent_i = i
                break

        # If the relation belongs to a valid sentence, append it to the relationships
        if sent_i is not None:
            relationship[sent_i].append(rel)

    return relationship
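
A minimal usage sketch (not part of the module) of how the three helpers above might be chained on a brat-style annotated document. The sample text, offsets, and labels are hypothetical; the `Entity` and `Relation` constructor signatures follow the doctests above.

    from architxt.nlp.model import Entity, Relation
    from architxt.nlp.utils import split_entities, split_relations, split_sentences

    # Hypothetical document: one sentence per line, offsets relative to the whole text.
    text = "Aspirin treats headache\nParacetamol reduces fever"
    entities = [
        Entity(name="DRUG", start=0, end=7, id="T1"),        # "Aspirin"
        Entity(name="SYMPTOM", start=15, end=23, id="T2"),   # "headache"
        Entity(name="DRUG", start=24, end=35, id="T3"),      # "Paracetamol"
        Entity(name="SYMPTOM", start=44, end=49, id="T4"),   # "fever"
    ]
    relations = [
        Relation(src="T1", dst="T2", name="treats"),
        Relation(src="T3", dst="T4", name="treats"),
    ]

    sentences = split_sentences(text)
    # ['Aspirin treats headache', 'Paracetamol reduces fever']
    sentence_entities = list(split_entities(entities, sentences))
    # Entity offsets are now relative to each sentence.
    sentence_relations = split_relations(relations, sentence_entities)
    # Each relation is grouped with the sentence containing both of its entities.

    for sent, ents, rels in zip(sentences, sentence_entities, sentence_relations):
        print(sent, [e.name for e in ents], [r.name for r in rels])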