"""Dataset loader for BRAT (BRAT Rapid Annotation Tool) format."""
from __future__ import annotations
from typing import TYPE_CHECKING
from pybrat.parser import BratParser, Example
from pybrat.parser import Entity as BratEntity
from pybrat.parser import Relation as BratRelation
from architxt.nlp.model import AnnotatedSentence, Entity, Relation
from architxt.nlp.utils import split_entities, split_relations, split_sentences
if TYPE_CHECKING:
from collections.abc import Generator, Iterable
from pathlib import Path
__all__ = ['load_brat_dataset']
[docs]
def convert_brat_entities(
    entities: Iterable[BratEntity],
    *,
    allow_list: set[str] | None = None,
    mapping: dict[str, str] | None = None,
) -> Generator[Entity, None, None]:
    """
    Lazily turn `BratEntity` annotations into `Entity` objects, renaming and dropping tags on the way.

    Every tag is upper-cased first, then renamed through ``mapping``; the renamed tag is the one
    compared against ``allow_list``. The produced entity starts at the start of the first span,
    ends at the end of the last span, uses the string form of its span tuple as identifier and
    carries the lower-cased mention as value.

    :param entities: The `BratEntity` objects to convert.
    :param allow_list: Tag names to drop from the output. Despite its name, this set works as an
        exclusion list. If None or empty, nothing is dropped.
    :param mapping: Renaming table from upper-cased tag to new tag. If None, tags are only upper-cased.
    :return: A generator yielding one `Entity` per entity that was not dropped.

    >>> from pybrat.parser import Entity, Relation, Span
    >>> ents = [
    ...     Entity(spans=[Span(start=0, end=5)], type="person", mention="E1"),
    ...     Entity(spans=[Span(start=10, end=15)], type="FREQ", mention="E2"),
    ...     Entity(spans=[Span(start=20, end=25)], type="MOMENT", mention="E3")
    ... ]
    >>> ents = list(convert_brat_entities(ents, allow_list={"MOMENT"}, mapping={"FREQ": "FREQUENCE"}))
    >>> len(ents)
    2
    >>> print(ents[0].name)
    PERSON
    >>> print(ents[1].name)
    FREQUENCE
    """
    excluded = allow_list if allow_list else set()
    renames = mapping if mapping else {}

    for annotation in entities:
        # The entity covers everything from its first span to its last one
        first_offset, last_offset = annotation.spans[0].start, annotation.spans[-1].end

        # Normalise the tag, then apply the optional renaming
        upper_tag = annotation.type.upper()
        name = renames.get(upper_tag, upper_tag)

        # The spans double as the identity of the entity
        span_key = tuple(annotation.spans)

        # Dropped tags produce nothing
        if name in excluded:
            continue

        yield Entity(
            name=name,
            start=first_offset,
            end=last_offset,
            id=str(span_key),
            value=annotation.mention.lower(),
        )
[docs]
def convert_brat_relations(
    relations: Iterable[BratRelation],
    *,
    allow_list: set[str] | None = None,
    mapping: dict[str, str] | None = None,
) -> Generator[Relation, None, None]:
    """
    Lazily turn `BratRelation` annotations into `Relation` objects, renaming and dropping types on the way.

    Every relation type is upper-cased first, then renamed through ``mapping``. A relation is
    dropped when its renamed type is in ``allow_list`` or contains the substring ``INCERTAIN``.
    Both endpoints are referenced by the string form of their span tuple, which is the same
    identifier scheme `convert_brat_entities` uses for entities.

    :param relations: The `BratRelation` objects to convert.
    :param allow_list: Relation types to drop from the output. Despite its name, this set works
        as an exclusion list. If None or empty, only the ``INCERTAIN`` rule applies.
    :param mapping: Renaming table from upper-cased type to new type. If None, types are only upper-cased.
    :return: A generator yielding one `Relation` per relation that was not dropped.

    >>> from pybrat.parser import Entity, Relation, Span
    >>> rels = [
    ...     Relation(
    ...         arg1=Entity(spans=[Span(start=0, end=5)], type='X', mention='E1'),
    ...         arg2=Entity(spans=[Span(start=10, end=15)], type='Y', mention='E2'),
    ...         type="part-of",
    ...     ),
    ...     Relation(
    ...         arg1=Entity(spans=[Span(start=20, end=25)], type='X', mention='E3'),
    ...         arg2=Entity(spans=[Span(start=30, end=35)], type='Z', mention='E3'),
    ...         type="TEMPORALITY",
    ...     ),
    ... ]
    >>> rels = list(convert_brat_relations(rels, allow_list={"TEMPORALITY"}))
    >>> len(rels)
    1
    >>> print(rels[0].name)
    PART-OF
    """
    excluded = allow_list if allow_list else set()
    renames = mapping if mapping else {}

    for link in relations:
        # Endpoints are identified by the spans of the entities they point to
        source_key = str(tuple(link.arg1.spans))
        target_key = str(tuple(link.arg2.spans))

        # Normalise the type, then apply the optional renaming
        upper_type = link.type.upper()
        name = renames.get(upper_type, upper_type)

        # Skip excluded types and anything flagged INCERTAIN
        # (presumably the marker for uncertain annotations -- confirm against the corpus)
        if name in excluded or 'INCERTAIN' in name:
            continue

        yield Relation(src=source_key, dst=target_key, name=name)
[docs]
def convert_brat_example(
    example: Example,
    *,
    entities_filter: set[str] | None = None,
    relations_filter: set[str] | None = None,
    entities_mapping: dict[str, str] | None = None,
    relations_mapping: dict[str, str] | None = None,
) -> Generator[AnnotatedSentence, None, None]:
    """
    Break a BRAT example down into annotated sentences, applying the requested filters and renamings.

    The text is split into sentences when it is a single string; otherwise it is taken as an
    already split collection of sentences. Entities are converted and distributed over the
    sentences, then relations are converted and distributed over those per-sentence entities.
    Sentences that are empty or that end up without any entity are not emitted.

    :param example: The `Example` holding the text, entities and relations of one document.
    :param entities_filter: Entity types to drop from the output. If None, no entity is dropped.
    :param relations_filter: Relation types to drop from the output. If None, no relation is dropped by type.
    :param entities_mapping: Renaming table for entity types. If None, no renaming is applied.
    :param relations_mapping: Renaming table for relation types. If None, no renaming is applied.
    :return: A generator yielding an `AnnotatedSentence` for each retained sentence.
    """
    text = example.text
    sentences = list(split_sentences(text) if isinstance(text, str) else text)

    # Entities are materialised because they are needed twice:
    # once to split the relations and once when pairing with the sentences
    converted_entities = convert_brat_entities(example.entities, allow_list=entities_filter, mapping=entities_mapping)
    entities = list(split_entities(converted_entities, sentences))

    converted_relations = convert_brat_relations(
        example.relations, allow_list=relations_filter, mapping=relations_mapping
    )
    relationships = split_relations(converted_relations, entities)

    # Walk the three sequences in lockstep; iteration stops at the shortest one
    for sentence, sentence_entities, sentence_relations in zip(sentences, entities, relationships, strict=False):
        if not sentence or not sentence_entities:
            continue  # Nothing worth emitting for this sentence

        yield AnnotatedSentence(sentence, sentence_entities, sentence_relations)
[docs]
def load_brat_dataset(
    path: Path,
    *,
    entities_filter: set[str] | None = None,
    relations_filter: set[str] | None = None,
    entities_mapping: dict[str, str] | None = None,
    relations_mapping: dict[str, str] | None = None,
) -> Generator[AnnotatedSentence, None, None]:
    """
    Load a BRAT dataset and stream it as annotated sentences.

    The absolute form of ``path`` is handed to a `BratParser`, and every parsed example is
    converted with `convert_brat_example` using the given filters and mappings.

    :param path: Location of the BRAT dataset to parse.
    :param entities_filter: Entity types to drop from the output. If None, no entity is dropped.
    :param relations_filter: Relation types to drop from the output. If None, no relation is dropped by type.
    :param entities_mapping: Renaming table for entity types. If None, no renaming is applied.
    :param relations_mapping: Renaming table for relation types. If None, no renaming is applied.
    :return: A generator yielding the `AnnotatedSentence` objects of every parsed example, in parsing order.
    """
    # NOTE(review): the ignored types look like BRAT annotation-id prefixes other than text-bound
    # entities and relations, and error='ignore' presumably skips malformed input -- confirm with pybrat.
    parser = BratParser(ignore_types={"*", "E", "N", "AM"}, error='ignore')

    for parsed_example in parser.parse(path.absolute()):
        yield from convert_brat_example(
            parsed_example,
            entities_filter=entities_filter,
            relations_filter=relations_filter,
            entities_mapping=entities_mapping,
            relations_mapping=relations_mapping,
        )