Source code for architxt.cli.loader

import asyncio
import random
from pathlib import Path

import click
import mlflow
import typer
from rich.panel import Panel
from sqlalchemy import create_engine

from architxt.database.loader import read_database, read_document
from architxt.generator import gen_instance
from architxt.nlp import raw_load_corpus
from architxt.nlp.parser.corenlp import CoreNLPParser
from architxt.schema import Schema
from architxt.simplification.tree_rewriting import rewrite

from .utils import console, save_forest, show_metrics, show_schema

__all__ = ['app']

# Entity labels dropped when loading annotated corpora: temporal markers and
# catch-all/coreference categories that carry no schema information.
ENTITIES_FILTER = {'TIME', 'MOMENT', 'DUREE', 'DURATION', 'DATE', 'OTHER_ENTITY', 'OTHER_EVENT', 'COREFERENCE'}
# Relation labels dropped for the same reason (temporal / causal links).
RELATIONS_FILTER = {'TEMPORALITE', 'CAUSE-CONSEQUENCE'}
# Label normalisation map: merges French/English spelling variants and folds
# fine-grained measurement labels (HEIGHT, WEIGHT, ...) into a single VALUE
# category so the extracted schema stays compact.
ENTITIES_MAPPING = {
    'FREQ': 'FREQUENCY',
    'FREQUENCE': 'FREQUENCY',
    'SIGN_SYMPTOM': 'SOSY',
    'VALEUR': 'VALUE',
    'HEIGHT': 'VALUE',
    'WEIGHT': 'VALUE',
    'MASS': 'VALUE',
    'QUANTITATIVE_CONCEPT': 'VALUE',
    'QUALITATIVE_CONCEPT': 'VALUE',
    'DISTANCE': 'VALUE',
    'VOLUME': 'VALUE',
    'AREA': 'VALUE',
    'LAB_VALUE': 'VALUE',
    'TRAITEMENT': 'THERAPEUTIC_PROCEDURE',
    'MEDICATION': 'THERAPEUTIC_PROCEDURE',
    'DOSE': 'DOSAGE',
    'OUTCOME': 'SOSY',
    'EXAMEN': 'DIAGNOSTIC_PROCEDURE',
    'PATHOLOGIE': 'DISEASE_DISORDER',
    'MODE': 'ADMINISTRATION',
}

# Typer sub-application exposed by this module (see __all__); shows help when
# invoked without arguments.
app = typer.Typer(no_args_is_help=True)


@app.command(name='document', help="Extract information of a document file into a formatted tree.")
def load_document(
    file: Path = typer.Argument(..., exists=True, readable=True, help="The document file to read."),
    *,
    raw: bool = typer.Option(
        False, help="Enable raw reading, skipping any transformation to convert it to the metamodel."
    ),
    root_name: str = typer.Option('ROOT', help="The root node name."),
    sample: int | None = typer.Option(None, help="Number of trees to sample from the document.", min=1),
    output: typer.FileBinaryWrite | None = typer.Option(None, help="Path to save the result."),
) -> None:
    """Read and parse a document file into a structured tree.

    The document is converted to a forest of trees, optionally down-sampled,
    optionally saved to ``output``, and its inferred schema is printed.
    """
    forest = list(read_document(file, raw_read=raw, root_name=root_name))

    if sample:
        if sample < len(forest):
            forest = random.sample(forest, sample)
        else:
            # random.sample raises ValueError when the sample exceeds the
            # population; warn and keep everything instead (mirrors the
            # behaviour of the `corpus` command).
            console.print(
                "[yellow] You have specified a sample size larger than the total population, "
                "which would result in fewer results than expected."
            )

    if output is not None:
        save_forest(forest, output)

    schema = Schema.from_forest(forest, keep_unlabelled=False)
    show_schema(schema)
@app.command(name='database', help="Extract the database information into a formatted tree.")
def load_database(
    db_connection: str = typer.Argument(..., help="Database connection string."),
    *,
    simplify_association: bool = typer.Option(True, help="Simplify association tables."),
    sample: int | None = typer.Option(None, help="Number of items to sample from the database.", min=1),
    output: typer.FileBinaryWrite | None = typer.Option(None, help="Path to save the result."),
) -> None:
    """Extract the database schema and relations into a tree format.

    Connects to the database, reads its content as a forest of trees,
    optionally saves the forest to ``output``, and prints the inferred schema.
    """
    engine = create_engine(db_connection)
    try:
        with engine.connect() as connection:
            forest = list(read_database(connection, simplify_association=simplify_association, sample=sample or 0))
    finally:
        # Release the engine's connection pool; `connect()` alone only
        # returns the connection to the pool, it does not close it.
        engine.dispose()

    if output is not None:
        save_forest(forest, output)

    schema = Schema.from_forest(forest, keep_unlabelled=False)
    show_schema(schema)
@app.command(name='corpus', help="Extract a database schema from a corpus.", no_args_is_help=True)
def load_corpus(
    corpus_path: list[typer.FileBinaryRead] = typer.Argument(
        ..., exists=True, readable=True, help="Path to the input corpus."
    ),
    *,
    language: list[str] = typer.Option(['French'], help="Language of the input corpus."),
    corenlp_url: str = typer.Option('http://localhost:9000', help="URL of the CoreNLP server."),
    tau: float = typer.Option(0.7, help="The similarity threshold.", min=0, max=1),
    epoch: int = typer.Option(100, help="Number of iteration for tree rewriting.", min=1),
    min_support: int = typer.Option(20, help="Minimum support for tree patterns.", min=1),
    gen_instances: int = typer.Option(0, help="Number of synthetic instances to generate.", min=0),
    sample: int | None = typer.Option(None, help="Number of sentences to sample from the corpus.", min=1),
    workers: int | None = typer.Option(
        None, help="Number of parallel worker processes to use. Defaults to the number of available CPU cores.", min=1
    ),
    resolver: str | None = typer.Option(
        None,
        help="The entity resolver to use when loading the corpus.",
        click_type=click.Choice(['umls', 'mesh', 'rxnorm', 'go', 'hpo'], case_sensitive=False),
    ),
    output: typer.FileBinaryWrite | None = typer.Option(None, help="Path to save the result."),
    cache: bool = typer.Option(True, help="Enable caching of the analyzed corpus to prevent re-parsing."),
    shuffle: bool = typer.Option(False, help="Shuffle the corpus data before processing to introduce randomness."),
    debug: bool = typer.Option(False, help="Enable debug mode for more verbose output."),
    metrics: bool = typer.Option(False, help="Show metrics of the simplification."),
    log: bool = typer.Option(False, help="Enable logging to MLFlow."),
) -> None:
    """Automatically structure a corpus as a database instance and print the database schema as a CFG."""
    if log:
        console.print(f'[green]MLFlow logging enabled. Logs will be sent to {mlflow.get_tracking_uri()}[/]')
        # NOTE(review): the run is never explicitly ended here; MLflow ends any
        # active run at interpreter exit, which is acceptable for a one-shot CLI.
        mlflow.start_run(description='corpus_processing')
        mlflow.log_params(
            {
                'has_corpus': True,
                'has_instance': bool(gen_instances),
            }
        )

    # Parse the corpus with CoreNLP, applying the module-level entity/relation
    # filters and label normalisation. Any failure is shown as a traceback and
    # converted into a non-zero exit code instead of a raw exception.
    try:
        forest = asyncio.run(
            raw_load_corpus(
                corpus_path,
                language,
                parser=CoreNLPParser(corenlp_url=corenlp_url),
                resolver_name=resolver,
                cache=cache,
                entities_filter=ENTITIES_FILTER,
                relations_filter=RELATIONS_FILTER,
                entities_mapping=ENTITIES_MAPPING,
            )
        )
    except Exception as error:
        console.print_exception()
        raise typer.Exit(code=1) from error

    if sample:
        if sample < len(forest):
            forest = random.sample(list(forest), sample)
        else:
            # Keep the full forest rather than letting random.sample raise.
            console.print(
                "[yellow] You have specified a sample size larger than the total population, "
                "which would result in fewer results than expected."
            )

    # Generate synthetic database instances from a hand-written medical schema
    # and append them to the parsed forest.
    if gen_instances:
        schema = Schema.from_description(
            groups={
                'SOSY': {'SOSY', 'ANATOMIE', 'SUBSTANCE'},
                'TREATMENT': {'SUBSTANCE', 'DOSAGE', 'ADMINISTRATION', 'FREQUENCY'},
                'EXAM': {'DIAGNOSTIC_PROCEDURE', 'ANATOMIE'},
            },
            rels={
                'PRESCRIPTION': ('SOSY', 'TREATMENT'),
                'EXAM_RESULT': ('EXAM', 'SOSY'),
            },
        )
        console.print(Panel(schema.as_cfg(), title="Synthetic Database Schema"))

        with console.status("[cyan]Generating synthetic instances..."):
            forest.extend(gen_instance(schema, size=gen_instances, generate_collections=False))

        console.print(f'[green]Generated {gen_instances} synthetic instances.[/]')

    if shuffle:
        random.shuffle(forest)

    console.print(f'[blue]Rewriting {len(forest)} trees with tau={tau}, epoch={epoch}, min_support={min_support}[/]')
    new_forest = rewrite(forest, tau=tau, epoch=epoch, min_support=min_support, debug=debug, max_workers=workers)

    if output is not None:
        save_forest(new_forest, output)

    # Generate schema
    schema = Schema.from_forest(new_forest, keep_unlabelled=False)
    show_schema(schema)

    if metrics:
        show_metrics(forest, new_forest, schema, tau)