187 lines
6.8 KiB
Python
187 lines
6.8 KiB
Python
from typing import List, Tuple, Type
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from bigtree.node.dagnode import DAGNode
|
|
|
|
# Public names exported by this module (what `from <module> import *` exposes).
__all__ = ["list_to_dag", "dict_to_dag", "dataframe_to_dag"]
|
|
|
|
|
|
def list_to_dag(
    relations: List[Tuple[str, str]],
    node_type: Type[DAGNode] = DAGNode,
) -> DAGNode:
    """Build a DAG out of a list of ``(parent, child)`` name pairs.

    Node names must be unique. The pairs are loaded into a two-column DataFrame and the
    actual construction is delegated to `dataframe_to_dag`.

    >>> from bigtree import list_to_dag, dag_iterator
    >>> relations_list = [("a", "c"), ("a", "d"), ("b", "c"), ("c", "d"), ("d", "e")]
    >>> dag = list_to_dag(relations_list)
    >>> [(parent.node_name, child.node_name) for parent, child in dag_iterator(dag)]
    [('a', 'd'), ('c', 'd'), ('d', 'e'), ('a', 'c'), ('b', 'c')]

    Args:
        relations (list): list of tuples, each holding a parent name followed by a child name
        node_type (Type[DAGNode]): node class used to create the DAG nodes, defaults to DAGNode

    Returns:
        (DAGNode)

    Raises:
        ValueError: if `relations` is empty
    """
    if len(relations) == 0:
        raise ValueError("Input list does not contain any data, check `relations`")

    # Tuples are (parent, child), so the column order below must match that order.
    parent_col, child_col = "parent", "child"
    edges = pd.DataFrame(relations, columns=[parent_col, child_col])
    return dataframe_to_dag(
        edges,
        child_col=child_col,
        parent_col=parent_col,
        node_type=node_type,
    )
|
|
|
|
|
|
def dict_to_dag(
    relation_attrs: dict,
    parent_key: str = "parents",
    node_type: Type[DAGNode] = DAGNode,
) -> DAGNode:
    """Construct DAG from nested dictionary.

    ``key``: child name, ``value``: dict of parent names, attribute name and attribute value.
    Note that node names must be unique.

    >>> from bigtree import dict_to_dag, dag_iterator
    >>> relation_dict = {
    ...     "a": {"step": 1},
    ...     "b": {"step": 1},
    ...     "c": {"parents": ["a", "b"], "step": 2},
    ...     "d": {"parents": ["a", "c"], "step": 2},
    ...     "e": {"parents": ["d"], "step": 3},
    ... }
    >>> dag = dict_to_dag(relation_dict, parent_key="parents")
    >>> [(parent.node_name, child.node_name) for parent, child in dag_iterator(dag)]
    [('a', 'd'), ('c', 'd'), ('d', 'e'), ('a', 'c'), ('b', 'c')]

    Args:
        relation_attrs (dict): dictionary containing node, node parents, and node attribute information,
            key: child name, value: dict of parent names, node attribute and attribute value
        parent_key (str): key of dictionary to retrieve list of parents name, defaults to "parents"
        node_type (Type[DAGNode]): node type of DAG to be created, defaults to DAGNode

    Returns:
        (DAGNode)

    Raises:
        ValueError: if `relation_attrs` is empty
        AssertionError: if no entry of `relation_attrs` contains `parent_key`
    """
    if not len(relation_attrs):
        raise ValueError("Dictionary does not contain any data, check `relation_attrs`")

    # Convert dictionary to dataframe: after the transpose there is one row per child name
    # and one column per key found in the inner dictionaries.
    data = pd.DataFrame(relation_attrs).T.rename_axis("_tmp_child").reset_index()

    # Raised explicitly instead of via an `assert` statement so the check is not stripped
    # under `python -O`; the exception type and message are kept for existing callers.
    if parent_key not in data:
        raise AssertionError(
            f"Parent key {parent_key} not in dictionary, check `relation_attrs` and `parent_key`"
        )

    # Expand each list of parents into one row per (child, parent) pair.
    data = data.explode(parent_key)
    return dataframe_to_dag(
        data,
        child_col="_tmp_child",
        parent_col=parent_key,
        node_type=node_type,
    )
|
|
|
|
|
|
def dataframe_to_dag(
    data: pd.DataFrame,
    child_col: str = None,
    parent_col: str = None,
    attribute_cols: list = [],
    node_type: Type[DAGNode] = DAGNode,
) -> DAGNode:
    """Construct DAG from pandas DataFrame.

    Note that node names must be unique.

    `child_col` and `parent_col` specify columns for child name and parent name to construct DAG.
    `attribute_cols` specify columns for node attribute for child name.
    If columns are not specified, `child_col` takes first column, `parent_col` takes second column, and all other
    columns are `attribute_cols`.

    >>> import pandas as pd
    >>> from bigtree import dataframe_to_dag, dag_iterator
    >>> relation_data = pd.DataFrame([
    ...     ["a", None, 1],
    ...     ["b", None, 1],
    ...     ["c", "a", 2],
    ...     ["c", "b", 2],
    ...     ["d", "a", 2],
    ...     ["d", "c", 2],
    ...     ["e", "d", 3],
    ... ],
    ...     columns=["child", "parent", "step"]
    ... )
    >>> dag = dataframe_to_dag(relation_data)
    >>> [(parent.node_name, child.node_name) for parent, child in dag_iterator(dag)]
    [('a', 'd'), ('c', 'd'), ('d', 'e'), ('a', 'c'), ('b', 'c')]

    Args:
        data (pandas.DataFrame): data containing parent-child relation and node attribute information
        child_col (str): column of data containing child name information, defaults to None
            if not set, it will take the first column of data
        parent_col (str): column of data containing parent name information, defaults to None
            if not set, it will take the second column of data
        attribute_cols (list): columns of data containing child node attribute information,
            if not set, it will take all columns of data except `child_col` and `parent_col`
        node_type (Type[DAGNode]): node type of DAG to be created, defaults to DAGNode

    Returns:
        (DAGNode): the parent node of the last row that has a parent; if no row has a parent,
            the node created for the last row

    Raises:
        ValueError: if `data` has no columns or no rows, if a child name appears with conflicting
            attribute values, or if any child name is null
    """
    if not len(data.columns):
        raise ValueError("Data does not contain any columns, check `data`")
    if not len(data):
        raise ValueError("Data does not contain any rows, check `data`")

    if not child_col:
        child_col = data.columns[0]
    if not parent_col:
        parent_col = data.columns[1]
    if not len(attribute_cols):
        # Rebinds to a fresh list, so neither the shared default nor a caller's list is mutated.
        attribute_cols = list(data.columns)
        attribute_cols.remove(child_col)
        attribute_cols.remove(parent_col)

    # A child may span several rows (one per parent), but its attributes must agree on every row.
    data_check = data[[child_col] + attribute_cols].drop_duplicates()
    _duplicate_check = (
        data_check[child_col]
        .value_counts()
        .to_frame("counts")
        .rename_axis(child_col)
        .reset_index()
    )
    _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1]
    if len(_duplicate_check):
        raise ValueError(
            f"There exists duplicate child name with different attributes\nCheck {_duplicate_check}"
        )
    if data[child_col].isnull().any():
        raise ValueError(f"Child name cannot be empty, check {child_col}")

    # Only the requested attribute columns become node attributes; the key columns never do,
    # even if they were listed in `attribute_cols`.
    attr_names = [col for col in attribute_cols if col not in (child_col, parent_col)]

    node_dict = dict()
    parent_node = None
    child_node = None

    for row in data.reset_index(drop=True).to_dict(orient="index").values():
        child_name = row[child_col]
        parent_name = row[parent_col]
        # Null cells are skipped so that missing values do not become attributes.
        node_attrs = {
            col: row[col] for col in attr_names if not pd.isnull(row[col])
        }

        # Reuse the node if this name was already seen, as a child or as a parent.
        child_node = node_dict.get(child_name)
        if child_node is None:
            child_node = node_type(child_name)
            node_dict[child_name] = child_node
        child_node.set_attrs(node_attrs)

        if not pd.isnull(parent_name):
            parent_node = node_dict.get(parent_name)
            if parent_node is None:
                parent_node = node_type(parent_name)
                node_dict[parent_name] = parent_node
            # NOTE(review): presumably the `parents` setter adds to the existing parents rather
            # than replacing them (a child can appear on several rows) -- confirm against DAGNode.
            child_node.parents = [parent_node]

    # With no parent in any row there is no parent node to hand back; return the last node
    # created instead of None so the declared return type holds.
    return parent_node if parent_node is not None else child_node
|