Source code for architxt.nlp.parser.corenlp

import itertools
from collections.abc import Iterable, Iterator
from types import TracebackType

from nltk import CoreNLPParser as NLTKParser

from architxt.tree import Tree

from . import Parser

__all__ = ['CoreNLPParser']


[docs] class CoreNLPParser(Parser): def __init__( self, *, corenlp_url: str, ) -> None: """ Create a CoreNLP parser. :param corenlp_url: The URL of the CoreNLP server. """ self.corenlp = NLTKParser(url=corenlp_url) def __exit__( self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: TracebackType | None ) -> None: self.corenlp.session.close()
[docs] def raw_parse(self, sentences: Iterable[str], *, language: str, batch_size: int = 128) -> Iterator[Tree]: for batch in itertools.batched(sentences, batch_size): for tree in self.corenlp.raw_parse_sents(batch, properties={'tokenize.language': language}): # CoreNLP return a list of candidates tree, we only select the first one. # A parse tree may contain multiple sentence subtrees we select only one and convert it into a tree. yield Tree.convert(next(tree)[0])