def read_document(
    file: str | Path | BytesIO | BinaryIO,
    *,
    raw_read: bool = False,
    root_name: str = 'ROOT',
) -> Generator[Tree, None, None]:
    """
    Read a document file and yield it as one or more data trees.

    XML are parsed according to
    https://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html

    :param file: The document file to read (a path or a binary file-like object).
    :param raw_read: If enabled, the single tree built from the document is yielded as-is,
        without any transformation applied.
    :param root_name: The root node name.
    :yield: The raw document tree when ``raw_read`` is set, otherwise every tree produced by
        :func:`parse_document_tree`.
    """
    document_tree = read_tree(read_document_file(file), root_name=root_name)

    if raw_read:
        yield document_tree
    else:
        yield from parse_document_tree(document_tree)
def read_document_file(file: str | Path | BytesIO | BinaryIO) -> dict[str, Any] | list[Any]:
    """
    Read and parse a document file like XML, JSON, or CSV.

    A path (``str`` or ``Path``) is opened in binary mode and closed again once parsing is over,
    whether it succeeded or not. A file-like object supplied by the caller is parsed directly
    and is left open.

    :param file: The document database file to read.
    :return: The parsed contents of the file.
    :raises FileNotFoundError: If the file does not exist.
    :raises OSError: If the file cannot be read.
    :raises ValueError: If the file cannot be parsed or is empty.
    """
    data: dict[str, Any] | list[Any]

    if isinstance(file, str | Path):
        # We own this handle, so the context manager guarantees it is released.
        with Path(file).open('rb') as document_db:
            data = parse_file(document_db)
    else:
        data = parse_file(file)

    if data:
        return data

    msg = 'Empty document'
    raise ValueError(msg)
def parse_file(file: BytesIO | BinaryIO) -> dict[str, Any] | list[Any]:
    """
    Parse a document database file like XML, JSON, or CSV.

    Every parser in ``FILE_PARSERS`` is tried in order. When a parser raises, the stream is
    rewound to the position it had on entry before the next parser is attempted.

    :param file: A file-like object opened for reading.
    :return: The parsed content of the file as a Python nested object.
    :raises ValueError: If none of the available parsers are able to process the input file.
    """
    start = file.tell()

    for parser in FILE_PARSERS:
        try:
            parsed = parser(file)
        except Exception:  # noqa: PERF203
            # Any failure just means "not this format": rewind and try the next parser.
            file.seek(start)
        else:
            return parsed

    msg = 'Unsupported file type'
    raise ValueError(msg)
def read_tree(data: dict[str, Any] | list[Any], *, root_name: str = 'ROOT') -> Tree:
    """
    Recursively convert a nested document structure into a tree.

    - Dictionaries become nodes labelled with their (normalised) name; each key names a child.
    - Lists become collection nodes; every item inherits the name of the list.
    - Any other value becomes an entity node holding the value converted to a string.

    Names are normalised by replacing spaces with underscores and lower-casing.

    If a node ends up with exactly one child and that child is a collection, the collection is
    returned directly instead of being nested under another node.

    :param data: The input data structure to be converted into a Tree.
    :param root_name: The label for the current node.
    :return: A nested tree structure corresponding to the input data.
    """
    node_name = root_name.replace(' ', '_').lower()

    named_items: Iterable[tuple[str, Any]]
    if isinstance(data, dict):
        named_items = data.items()
    else:
        # List items carry no name of their own, so they reuse the name of the list.
        named_items = ((node_name, item) for item in data)

    children = []
    for key, value in named_items:
        if isinstance(value, dict | list):
            # Nested structure: recurse, naming the subtree after its key.
            child = read_tree(value, root_name=key)
        else:
            # Leaf value: wrap it in an entity node.
            entity_label = NodeLabel(NodeType.ENT, str(key).replace(' ', '_').lower())
            child = Tree(entity_label, [str(value)])
        children.append(child)

    # Avoid a wrapper node whose only content is a single collection.
    if len(children) == 1 and has_type(children[0], NodeType.COLL):
        return children[0]

    if isinstance(data, list):
        return Tree(NodeLabel(NodeType.COLL, node_name), children)
    return Tree(node_name, children)
def parse_document_tree(tree: Tree) -> Generator[Tree, None, None]:
    """
    Parse a document tree and yield processed subtrees based on collection grouping.

    - If the root node is **not** a collection, the entire tree is processed and at most one
      result is yielded.
    - If the root node **is** a collection, each child subtree is processed individually.

    Results that come back empty are skipped.

    TODO: Enhance tree decomposition for nested collections.
    If no collection exists at the root level, consider splitting at the closest collection
    and duplicating the path to the root for each collection element.

    :param tree: The nested tree to be parsed.
    :yield: Trees representing the database.
    """
    candidates = tree if has_type(tree, NodeType.COLL) else [tree]

    for candidate in candidates:
        # Only the transformed tree is needed here; the anchor group is for recursive callers.
        _, parsed_tree = traverse_tree(candidate)
        if len(parsed_tree) > 0:
            yield parsed_tree
def traverse_tree(tree: Tree) -> tuple[Tree, Tree]:
    """
    Recursively traverse and transform a nested tree into a valid metamodel structure.

    - An entity is wrapped in a group named after the entity.
    - A collection is rebuilt with each child replaced by the anchor group of its traversal.
    - Any other node has its entity children gathered under a single group, and a relation is
      created between that group and every nested subgroup (one per element when the subgroup
      is a collection). Relations already produced deeper in the tree are lifted up alongside.

    :param tree: The tree to traverse and transform.
    :returns: A tuple containing:
        - The group to anchor to for the parent relationship.
        - The transformed tree: a ``'ROOT'`` tree of relations, or the anchor itself when no
          relation was produced.
    """
    if has_type(tree, NodeType.ENT):
        # A lone entity gets its own group so that it can take part in relations.
        wrapper = Tree(NodeLabel(NodeType.GROUP, tree.label().name), [Tree.convert(tree)])
        return wrapper, wrapper

    if has_type(tree, NodeType.COLL):
        rebuilt = Tree(tree.label(), [traverse_tree(child)[0] for child in tree])
        return rebuilt, rebuilt

    # Split direct children into entities (converted) and everything else, keeping order.
    entities = []
    nested = []
    for subtree in tree:
        if has_type(subtree, NodeType.ENT):
            entities.append(Tree.convert(subtree))
        else:
            nested.append(subtree)

    # Group node holding the entities of this level.
    group_node = Tree(NodeLabel(NodeType.GROUP, tree.label()), entities)

    relationship_nodes: list[Tree] = []
    for child in nested:
        anchor, transformed = traverse_tree(child)

        if transformed.label() == 'ROOT':
            # The child already produced relations of its own: lift them to this level.
            relationship_nodes.extend(Tree.convert(grandchild) for grandchild in transformed)

        # Relate this group to each element of a collection, or to the single child group.
        targets = anchor if has_type(anchor, NodeType.COLL) else [anchor]
        for target in targets:
            rel_label = NodeLabel(NodeType.REL, f'{group_node.label().name}<->{target.label().name}')
            relationship_nodes.append(Tree(rel_label, [group_node.copy(deep=True), Tree.convert(target)]))

    # Without any relation the group itself is the transformed tree.
    if not relationship_nodes:
        return group_node, group_node
    return group_node, Tree('ROOT', relationship_nodes)