Source code for architxt.nlp.parser.benepar
from collections.abc import Iterable, Iterator
from types import TracebackType
import benepar # noqa: F401
import spacy
from spacy import Language
from architxt.tree import Tree
from . import Parser
__all__ = ['BeneparParser']
DEFAULT_BENEPAR_MODELS = {
'English': 'benepar_en3',
'Chinese': 'benepar_zh2',
'Arabic': 'benepar_ar2',
'German': 'benepar_de2',
'Basque': 'benepar_eu2',
'French': 'benepar_fr2',
'Hebrew': 'benepar_he2',
'Hungarian': 'benepar_hu2',
'Korean': 'benepar_ko2',
'Polish': 'benepar_pl2',
'Swedish': 'benepar_sv2',
}
[docs]
class BeneparParser(Parser):
    """
    Parser that produces trees from a SpaCy pipeline extended with a ``benepar`` pipe.

    One pipeline is built per language on first use and cached on the instance.
    The cache is emptied when the parser is used as a context manager and the
    context exits.
    """

    def __init__(
        self,
        *,
        spacy_models: dict[str, str],
        benepar_models: dict[str, str] | None = None,
    ) -> None:
        """
        Create a benepar parser.

        :param spacy_models: The name of the SpaCy model to load for each language.
        :param benepar_models: The name of the benepar model to use for each language.
            Falls back to ``DEFAULT_BENEPAR_MODELS`` when omitted or empty.
        """
        self.spacy_models = spacy_models
        self.benepar_models = benepar_models or DEFAULT_BENEPAR_MODELS
        # Loaded pipelines keyed by language; filled lazily by `_get_model`.
        self.__models: dict[str, Language] = {}

    def __exit__(
        self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: TracebackType | None
    ) -> None:
        """Drop every cached pipeline when the context exits."""
        self.__models.clear()

    def _get_model(self, language: str) -> Language:
        """
        Return the pipeline for ``language``, loading and caching it on first use.

        :param language: Key looked up in both ``spacy_models`` and ``benepar_models``.
        :return: The SpaCy pipeline with the ``benepar`` pipe added.
        :raises KeyError: If ``language`` is missing from either model mapping.
        """
        if language not in self.__models:
            nlp = spacy.load(self.spacy_models[language], disable={'ner', 'textcat', 'lemmatizer', 'tagger'})
            nlp.add_pipe('benepar', config={'model': self.benepar_models[language]})
            self.__models[language] = nlp

        return self.__models[language]

    def raw_parse(self, sentences: Iterable[str], *, language: str, batch_size: int = 128) -> Iterator[Tree]:
        """
        Parse each input text and lazily yield one tree per text.

        Only the first sentence segmented in each document is parsed, and the
        root label of the resulting tree is replaced by ``'SENT'``.

        :param sentences: The texts to parse.
        :param language: The language used to select the pipeline.
        :param batch_size: The batch size forwarded to the pipeline.
        :return: An iterator over the parsed trees, in input order.
        :raises KeyError: If no model is configured for ``language``.
        :raises RuntimeError: If a document contains no sentence (e.g. empty text).
        """
        nlp = self._get_model(language)

        for doc in nlp.pipe(sentences, batch_size=batch_size):
            # A bare `next(doc.sents)` would let StopIteration escape from this generator,
            # which Python turns into an opaque "generator raised StopIteration" RuntimeError.
            # The exception type is kept, but the message now names the actual problem.
            sent = next(doc.sents, None)
            if sent is None:
                msg = f'Cannot parse a text without any sentence: {doc.text!r}'
                raise RuntimeError(msg)

            tree = Tree.fromstring(sent._.parse_string)
            tree.set_label('SENT')
            yield tree