from __future__ import annotations from typing import Any, Dict, List, Tuple, Type from bigtree.node.dagnode import DAGNode from bigtree.utils.exceptions import optional_dependencies_pandas try: import pandas as pd except ImportError: # pragma: no cover pd = None __all__ = ["list_to_dag", "dict_to_dag", "dataframe_to_dag"] @optional_dependencies_pandas def list_to_dag( relations: List[Tuple[str, str]], node_type: Type[DAGNode] = DAGNode, ) -> DAGNode: """Construct DAG from list of tuples containing parent-child names. Note that node names must be unique. Examples: >>> from bigtree import list_to_dag, dag_iterator >>> relations_list = [("a", "c"), ("a", "d"), ("b", "c"), ("c", "d"), ("d", "e")] >>> dag = list_to_dag(relations_list) >>> [(parent.node_name, child.node_name) for parent, child in dag_iterator(dag)] [('a', 'd'), ('c', 'd'), ('d', 'e'), ('a', 'c'), ('b', 'c')] Args: relations (List[Tuple[str, str]]): list containing tuple of parent-child names node_type (Type[DAGNode]): node type of DAG to be created, defaults to ``DAGNode`` Returns: (DAGNode) """ if not len(relations): raise ValueError("Input list does not contain any data, check `relations`") relation_data = pd.DataFrame(relations, columns=["parent", "child"]) return dataframe_to_dag( relation_data, child_col="child", parent_col="parent", node_type=node_type ) def dict_to_dag( relation_attrs: Dict[str, Any], parent_key: str = "parents", node_type: Type[DAGNode] = DAGNode, ) -> DAGNode: """Construct DAG from nested dictionary, ``key``: child name, ``value``: dictionary of parent names, attribute name, and attribute value. Note that node names must be unique. Examples: >>> from bigtree import dict_to_dag, dag_iterator >>> relation_dict = { ... "a": {"step": 1}, ... "b": {"step": 1}, ... "c": {"parents": ["a", "b"], "step": 2}, ... "d": {"parents": ["a", "c"], "step": 2}, ... "e": {"parents": ["d"], "step": 3}, ... } >>> dag = dict_to_dag(relation_dict, parent_key="parents") >>> [(parent.node_name, child.node_name) for parent, child in dag_iterator(dag)] [('a', 'd'), ('c', 'd'), ('d', 'e'), ('a', 'c'), ('b', 'c')] Args: relation_attrs (Dict[str, Any]): dictionary containing node, node parents, and node attribute information, key: child name, value: dictionary of parent names, node attribute, and attribute value parent_key (str): key of dictionary to retrieve list of parents name, defaults to 'parent' node_type (Type[DAGNode]): node type of DAG to be created, defaults to ``DAGNode`` Returns: (DAGNode) """ if not len(relation_attrs): raise ValueError("Dictionary does not contain any data, check `relation_attrs`") # Convert dictionary to dataframe data = pd.DataFrame(relation_attrs).T.rename_axis("_tmp_child").reset_index() if parent_key not in data: raise ValueError( f"Parent key {parent_key} not in dictionary, check `relation_attrs` and `parent_key`" ) data = data.explode(parent_key) return dataframe_to_dag( data, child_col="_tmp_child", parent_col=parent_key, node_type=node_type, ) @optional_dependencies_pandas def dataframe_to_dag( data: pd.DataFrame, child_col: str = "", parent_col: str = "", attribute_cols: List[str] = [], node_type: Type[DAGNode] = DAGNode, ) -> DAGNode: """Construct DAG from pandas DataFrame. Note that node names must be unique. - `child_col` and `parent_col` specify columns for child name and parent name to construct DAG. - `attribute_cols` specify columns for node attribute for child name. - If columns are not specified, `child_col` takes first column, `parent_col` takes second column, and all other columns are `attribute_cols`. Examples: >>> import pandas as pd >>> from bigtree import dataframe_to_dag, dag_iterator >>> relation_data = pd.DataFrame([ ... ["a", None, 1], ... ["b", None, 1], ... ["c", "a", 2], ... ["c", "b", 2], ... ["d", "a", 2], ... ["d", "c", 2], ... ["e", "d", 3], ... ], ... columns=["child", "parent", "step"] ... ) >>> dag = dataframe_to_dag(relation_data) >>> [(parent.node_name, child.node_name) for parent, child in dag_iterator(dag)] [('a', 'd'), ('c', 'd'), ('d', 'e'), ('a', 'c'), ('b', 'c')] Args: data (pd.DataFrame): data containing path and node attribute information child_col (str): column of data containing child name information, defaults to '' if not set, it will take the first column of data parent_col (str): column of data containing parent name information, defaults to '' if not set, it will take the second column of data attribute_cols (List[str]): columns of data containing child node attribute information, if not set, it will take all columns of data except `child_col` and `parent_col` node_type (Type[DAGNode]): node type of DAG to be created, defaults to ``DAGNode`` Returns: (DAGNode) """ data = data.copy() if not len(data.columns): raise ValueError("Data does not contain any columns, check `data`") if not len(data): raise ValueError("Data does not contain any rows, check `data`") if not child_col: child_col = data.columns[0] elif child_col not in data.columns: raise ValueError(f"Child column not in data, check `child_col`: {child_col}") if not parent_col: parent_col = data.columns[1] elif parent_col not in data.columns: raise ValueError(f"Parent column not in data, check `parent_col`: {parent_col}") if not len(attribute_cols): attribute_cols = list(data.columns) attribute_cols.remove(child_col) attribute_cols.remove(parent_col) elif any([col not in data.columns for col in attribute_cols]): raise ValueError( f"One or more attribute column(s) not in data, check `attribute_cols`: {attribute_cols}" ) data_check = data.copy()[[child_col, parent_col] + attribute_cols].drop_duplicates( subset=[child_col] + attribute_cols ) _duplicate_check = ( data_check[child_col] .value_counts() .to_frame("counts") .rename_axis(child_col) .reset_index() ) _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1] if len(_duplicate_check): raise ValueError( f"There exists duplicate child name with different attributes\n" f"Check {_duplicate_check}" ) if sum(data[child_col].isnull()): raise ValueError(f"Child name cannot be empty, check column: {child_col}") node_dict: Dict[str, DAGNode] = dict() parent_node = DAGNode() for row in data.reset_index(drop=True).to_dict(orient="index").values(): child_name = row[child_col] parent_name = row[parent_col] node_attrs = row.copy() del node_attrs[child_col] del node_attrs[parent_col] node_attrs = {k: v for k, v in node_attrs.items() if not pd.isnull(v)} child_node = node_dict.get(child_name, node_type(child_name)) child_node.set_attrs(node_attrs) node_dict[child_name] = child_node if not pd.isnull(parent_name): parent_node = node_dict.get(parent_name, node_type(parent_name)) node_dict[parent_name] = parent_node child_node.parents = [parent_node] return parent_node