915 lines
32 KiB
Python
915 lines
32 KiB
Python
import re
|
|
from collections import OrderedDict
|
|
from typing import List, Tuple, Type
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from bigtree.node.node import Node
|
|
from bigtree.tree.export import tree_to_dataframe
|
|
from bigtree.tree.search import find_children, find_name
|
|
from bigtree.utils.exceptions import DuplicatedNodeError, TreeError
|
|
|
|
# Public API of this module: every tree-construction helper defined below.
__all__ = [
    # Extend an existing tree
    "add_path_to_tree",
    "add_dict_to_tree_by_path",
    "add_dict_to_tree_by_name",
    "add_dataframe_to_tree_by_path",
    "add_dataframe_to_tree_by_name",
    # Build a new tree
    "str_to_tree",
    "list_to_tree",
    "list_to_tree_by_relation",
    "dict_to_tree",
    "nested_dict_to_tree",
    "dataframe_to_tree",
    "dataframe_to_tree_by_relation",
]
|
|
|
|
|
|
def add_path_to_tree(
    tree: Node,
    path: str,
    sep: str = "/",
    duplicate_name_allowed: bool = True,
    node_attrs: dict = {},
) -> Node:
    """Grow an existing tree *in-place* along one path, return the node at the end of the path.

    `path` is a string of `Node` names joined by `sep`.

    - For example: "a/b" refers to Node("b") whose parent is Node("a").
    - `sep` describes the input `path` only and can differ from the separator of the existing tree.
    - Leading and trailing `sep` characters are stripped, so "/a/b" and "a/b" are equivalent if sep is "/".
    - The first name in `path` must be the name of the root node of `tree`.

    Nodes missing along the path are created using the class of the root node.
    `node_attrs` is passed to `set_attrs` of the last node of the path only.

    >>> from bigtree import Node, add_path_to_tree, print_tree
    >>> root = Node("a")
    >>> add_path_to_tree(root, "a/b/c")
    Node(/a/b/c, )
    >>> print_tree(root)
    a
    └── b
        └── c

    Args:
        tree (Node): existing tree
        path (str): path to be added to tree
        sep (str): path separator for input `path`
        duplicate_name_allowed (bool): indicator if nodes with duplicated `Node` name is allowed, defaults to True
        node_attrs (dict): attributes to add to node, key: attribute name, value: attribute value, optional

    Returns:
        (Node)

    Raises:
        ValueError: if `path` is empty
        TreeError: if `path` does not start with the name of the root node
        DuplicatedNodeError: if `duplicate_name_allowed` is False and a node of the same name
            already exists at a different location in the tree
    """
    if len(path) == 0:
        raise ValueError("Path is empty, check `path`")

    root = tree.root
    root_sep = root.sep
    node_cls = root.__class__
    parts = path.strip(sep).split(sep)
    if parts[0] != root.node_name:
        raise TreeError(
            f"Error: Path does not have same root node, expected {root.node_name}, received {parts[0]}\n"
            f"Check your input paths or verify that path separator `sep` is set correctly"
        )

    # Walk down from the root, reusing nodes that exist and creating the ones that do not
    current = root
    parent = root
    for depth, name in enumerate(parts[1:], start=1):
        # Path of the node being visited, expressed with the separator of the existing tree
        partial_path = root_sep.join(parts[: depth + 1])
        if duplicate_name_allowed:
            # Only the direct children of the current parent are candidates
            current = find_children(parent, name)
        else:
            # Names must be unique tree-wide, so search the whole tree
            current = find_name(root, name)
            if current and not current.path_name.endswith(partial_path):
                raise DuplicatedNodeError(
                    f"Node {name} already exists, try setting `duplicate_name_allowed` to True "
                    f"to allow `Node` with same node name"
                )
        if not current:
            current = node_cls(name)
            current.parent = parent
        parent = current

    current.set_attrs(node_attrs)
    return current
|
|
|
|
|
|
def add_dict_to_tree_by_path(
    tree: Node,
    path_attrs: dict,
    sep: str = "/",
    duplicate_name_allowed: bool = True,
) -> Node:
    """Add nodes and attributes to an existing tree *in-place*, return root of tree.

    Input is a nested dictionary, ``key``: path, ``value``: dict of attribute name and attribute value.
    Every entry is added through `add_path_to_tree`.

    Path should contain `Node` name, separated by `sep`.

    - For example: Path string "a/b" refers to Node("b") with parent Node("a").
    - Path separator `sep` is for the input paths and can be different from that of existing tree.

    Path can start from root node `name`, or start with `sep`.

    - For example: Path string can be "/a/b" or "a/b", if sep is "/".

    All paths should start from the same root node.

    - For example: Path strings should be "a/b", "a/c", "a/b/d" etc. and should not start with another root node.

    >>> from bigtree import Node, add_dict_to_tree_by_path, print_tree
    >>> root = Node("a")
    >>> path_dict = {
    ...     "a": {"age": 90},
    ...     "a/b": {"age": 65},
    ...     "a/c": {"age": 60},
    ...     "a/b/d": {"age": 40},
    ...     "a/b/e": {"age": 35},
    ...     "a/c/f": {"age": 38},
    ...     "a/b/e/g": {"age": 10},
    ...     "a/b/e/h": {"age": 6},
    ... }
    >>> root = add_dict_to_tree_by_path(root, path_dict)
    >>> print_tree(root)
    a
    ├── b
    │   ├── d
    │   └── e
    │       ├── g
    │       └── h
    └── c
        └── f

    Args:
        tree (Node): existing tree
        path_attrs (dict): dictionary containing node path and attribute information,
            key: node path, value: dict of node attribute name and attribute value
        sep (str): path separator for input `path_attrs`
        duplicate_name_allowed (bool): indicator if nodes with duplicated `Node` name is allowed, defaults to True

    Returns:
        (Node)

    Raises:
        ValueError: if `path_attrs` is empty
    """
    if len(path_attrs) == 0:
        raise ValueError("Dictionary does not contain any data, check `path_attrs`")

    root = tree.root
    for node_path, attrs in path_attrs.items():
        add_path_to_tree(
            root,
            node_path,
            sep=sep,
            duplicate_name_allowed=duplicate_name_allowed,
            node_attrs=attrs,
        )
    return root
|
|
|
|
|
|
def add_dict_to_tree_by_name(
    tree: Node, path_attrs: dict, join_type: str = "left"
) -> Node:
    """Add attributes to tree by node name, return *new* root of tree.

    Input is a nested dictionary, ``key``: name, ``value``: dict of attribute name and attribute value.
    The dictionary is converted to a DataFrame and handed to `add_dataframe_to_tree_by_name`.

    Function can return all existing tree nodes or only tree nodes that are in the input dictionary keys.
    Input dictionary keys that are not existing node names will be ignored.
    Note that if multiple nodes have the same name, attributes will be added to all nodes sharing same name.

    >>> from bigtree import Node, add_dict_to_tree_by_name, print_tree
    >>> root = Node("a")
    >>> b = Node("b", parent=root)
    >>> name_dict = {
    ...     "a": {"age": 90},
    ...     "b": {"age": 65},
    ... }
    >>> root = add_dict_to_tree_by_name(root, name_dict)
    >>> print_tree(root, attr_list=["age"])
    a [age=90]
    └── b [age=65]

    Args:
        tree (Node): existing tree
        path_attrs (dict): dictionary containing node name and attribute information,
            key: node name, value: dict of node attribute name and attribute value
        join_type (str): join type with attribute, default of 'left' takes existing tree nodes,
            if join_type is set to 'inner' it will only take tree nodes that are in `path_attrs` key and drop others

    Returns:
        (Node)

    Raises:
        ValueError: if `join_type` is not 'inner' or 'left', or if `path_attrs` is empty
    """
    if join_type not in ("inner", "left"):
        raise ValueError("`join_type` must be one of 'inner' or 'left'")

    if len(path_attrs) == 0:
        raise ValueError("Dictionary does not contain any data, check `path_attrs`")

    # One row per node name, one column per attribute, names exposed as column "NAME"
    attrs_by_name = pd.DataFrame(path_attrs).T
    name_frame = attrs_by_name.rename_axis("NAME").reset_index()
    return add_dataframe_to_tree_by_name(tree, data=name_frame, join_type=join_type)
|
|
|
|
|
|
def add_dataframe_to_tree_by_path(
    tree: Node,
    data: pd.DataFrame,
    path_col: str = "",
    attribute_cols: list = [],
    sep: str = "/",
    duplicate_name_allowed: bool = True,
) -> Node:
    """Add nodes and attributes to tree *in-place*, return root of tree.

    `path_col` and `attribute_cols` specify columns for node path and attributes to add to existing tree.
    If columns are not specified, `path_col` takes first column and all other columns are `attribute_cols`.
    Only `attribute_cols` are added as node attributes. The input `data` is not modified.

    Path in path column should contain `Node` name, separated by `sep`.

    - For example: Path string "a/b" refers to Node("b") with parent Node("a").
    - Path separator `sep` is for the input `path_col` and can be different from that of existing tree.

    Path in path column can start from root node `name`, or start with `sep`.

    - For example: Path string can be "/a/b" or "a/b", if sep is "/".

    All paths should start from the same root node.

    - For example: Path strings should be "a/b", "a/c", "a/b/d" etc. and should not start with another root node.

    >>> import pandas as pd
    >>> from bigtree import Node, add_dataframe_to_tree_by_path, print_tree
    >>> root = Node("a")
    >>> path_data = pd.DataFrame([
    ...     ["a", 90],
    ...     ["a/b", 65],
    ...     ["a/c", 60],
    ...     ["a/b/d", 40],
    ...     ["a/b/e", 35],
    ...     ["a/c/f", 38],
    ...     ["a/b/e/g", 10],
    ...     ["a/b/e/h", 6],
    ... ],
    ...     columns=["PATH", "age"]
    ... )
    >>> root = add_dataframe_to_tree_by_path(root, path_data)
    >>> print_tree(root, attr_list=["age"])
    a [age=90]
    ├── b [age=65]
    │   ├── d [age=40]
    │   └── e [age=35]
    │       ├── g [age=10]
    │       └── h [age=6]
    └── c [age=60]
        └── f [age=38]

    Args:
        tree (Node): existing tree
        data (pandas.DataFrame): data containing node path and attribute information
        path_col (str): column of data containing `path_name` information,
            if not set, it will take the first column of data
        attribute_cols (list): columns of data containing node attribute information,
            if not set, it will take all columns of data except `path_col`
        sep (str): path separator for input `path_col`
        duplicate_name_allowed (bool): indicator if nodes with duplicated `Node` name is allowed, defaults to True

    Returns:
        (Node)

    Raises:
        ValueError: if `data` has no columns or no rows, or if the same path appears with different attributes
    """
    if not len(data.columns):
        raise ValueError("Data does not contain any columns, check `data`")
    if not len(data):
        raise ValueError("Data does not contain any rows, check `data`")

    if not path_col:
        path_col = data.columns[0]
    if not len(attribute_cols):
        attribute_cols = list(data.columns)
        attribute_cols.remove(path_col)

    tree_root = tree.root

    # Work on a private copy limited to the requested columns: the caller's DataFrame is
    # left untouched and columns outside `attribute_cols` never become node attributes.
    # Resetting the index keeps `to_dict(orient="index")` usable when the input index
    # contains duplicates.
    data = data[[path_col] + list(attribute_cols)].copy().reset_index(drop=True)
    data[path_col] = data[path_col].str.strip(sep)

    # Identical rows are fine; the same path with different attributes is ambiguous.
    # Values are compared as strings so unhashable attribute values can be de-duplicated.
    data2 = data.astype(str).drop_duplicates()
    _duplicate_check = (
        data2[path_col]
        .value_counts()
        .to_frame("counts")
        .rename_axis(path_col)
        .reset_index()
    )
    _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1]
    if len(_duplicate_check):
        raise ValueError(
            f"There exists duplicate path with different attributes\nCheck {_duplicate_check}"
        )

    for row in data.to_dict(orient="index").values():
        node_attrs = row.copy()
        del node_attrs[path_col]
        # Null values are skipped rather than stored as attributes
        node_attrs = {k: v for k, v in node_attrs.items() if not np.all(pd.isnull(v))}
        add_path_to_tree(
            tree_root,
            row[path_col],
            sep=sep,
            duplicate_name_allowed=duplicate_name_allowed,
            node_attrs=node_attrs,
        )
    return tree_root
|
|
|
|
|
|
def add_dataframe_to_tree_by_name(
    tree: Node,
    data: pd.DataFrame,
    name_col: str = "",
    attribute_cols: list = [],
    join_type: str = "left",
) -> Node:
    """Add attributes to tree by node name, return *new* root of tree.

    `name_col` and `attribute_cols` specify columns for node name and attributes to add to existing tree.
    If columns are not specified, the first column will be taken as name column and all other columns as attributes.
    Only `attribute_cols` are joined onto the tree; other columns of `data` are ignored.

    Function can return all existing tree nodes or only tree nodes that are in the input data node names.
    Input data node names that are not existing node names will be ignored.
    Note that if multiple nodes have the same name, attributes will be added to all nodes sharing same name.

    The existing tree is exported with `tree_to_dataframe`, joined with the attribute data on the
    name column, and rebuilt with `dataframe_to_tree`.

    >>> import pandas as pd
    >>> from bigtree import Node, add_dataframe_to_tree_by_name, print_tree
    >>> root = Node("a")
    >>> b = Node("b", parent=root)
    >>> name_data = pd.DataFrame([
    ...     ["a", 90],
    ...     ["b", 65],
    ... ],
    ...     columns=["NAME", "age"]
    ... )
    >>> root = add_dataframe_to_tree_by_name(root, name_data)
    >>> print_tree(root, attr_list=["age"])
    a [age=90]
    └── b [age=65]

    Args:
        tree (Node): existing tree
        data (pandas.DataFrame): data containing node name and attribute information
        name_col (str): column of data containing `name` information,
            if not set, it will take the first column of data
        attribute_cols (list): column(s) of data containing node attribute information,
            if not set, it will take all columns of data except `name_col`
        join_type (str): join type with attribute, default of 'left' takes existing tree nodes,
            if join_type is set to 'inner' it will only take tree nodes with attributes and drop the other nodes

    Returns:
        (Node)

    Raises:
        ValueError: if `join_type` is not 'inner' or 'left', if `data` has no columns or no rows,
            or if the same name appears with different attributes
    """
    if join_type not in ["inner", "left"]:
        raise ValueError("`join_type` must be one of 'inner' or 'left'")

    if not len(data.columns):
        raise ValueError("Data does not contain any columns, check `data`")
    if not len(data):
        raise ValueError("Data does not contain any rows, check `data`")

    if not name_col:
        name_col = data.columns[0]
    if not len(attribute_cols):
        attribute_cols = list(data.columns)
        attribute_cols.remove(name_col)

    # Attribute data, limited to the name column and the requested attribute columns
    path_col = "PATH"
    attr_data = data[[name_col] + list(attribute_cols)]

    # Identical rows are fine; the same name with different attributes is ambiguous.
    # Values are compared as strings so unhashable attribute values can be de-duplicated.
    data2 = attr_data.astype(str).drop_duplicates()
    _duplicate_check = (
        data2[name_col]
        .value_counts()
        .to_frame("counts")
        .rename_axis(name_col)
        .reset_index()
    )
    _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1]
    if len(_duplicate_check):
        raise ValueError(
            f"There exists duplicate name with different attributes\nCheck {_duplicate_check}"
        )

    # Tree data
    tree_root = tree.root
    sep = tree_root.sep
    node_type = tree_root.__class__
    data_tree = tree_to_dataframe(
        tree_root, name_col=name_col, path_col=path_col, all_attrs=True
    )
    # Incoming attributes replace attributes of the same name already present on the tree
    common_cols = list(set(data_tree.columns).intersection(attribute_cols))
    data_tree = data_tree.drop(columns=common_cols)

    # Join attributes onto the exported tree; only the path and attributes are needed to rebuild
    data_tree_attrs = pd.merge(data_tree, attr_data, on=name_col, how=join_type)
    data_tree_attrs = data_tree_attrs.drop(columns=name_col)

    return dataframe_to_tree(
        data_tree_attrs, path_col=path_col, sep=sep, node_type=node_type
    )
|
|
|
|
|
|
def str_to_tree(
    tree_string: str,
    tree_prefix_list: List[str] = [],
    node_type: Type[Node] = Node,
) -> Node:
    r"""Construct tree from tree string

    The first line is the root node. Every following line is a node whose depth is derived from
    the length of the prefix in front of its name; the prefix length of the first child line is
    taken as the width of one level.

    Entries of `tree_prefix_list` are matched literally (they are escaped before being used in a
    regular expression), so prefixes such as ``"|--"`` or ``"+--"`` are safe to use.

    >>> from bigtree import str_to_tree, print_tree
    >>> tree_str = 'a\n├── b\n│   ├── d\n│   └── e\n│       ├── g\n│       └── h\n└── c\n    └── f'
    >>> root = str_to_tree(tree_str, tree_prefix_list=["├──", "└──"])
    >>> print_tree(root)
    a
    ├── b
    │   ├── d
    │   └── e
    │       ├── g
    │       └── h
    └── c
        └── f

    Args:
        tree_string (str): String to construct tree
        tree_prefix_list (list): List of prefix to mark the end of tree branch/stem and start of node name, optional.
            If not specified, it will infer unicode characters and whitespace as prefix.
        node_type (Type[Node]): node type of tree to be created, defaults to Node

    Returns:
        (Node)

    Raises:
        ValueError: if `tree_string` is empty, if a line has no prefix in front of the node name,
            or if a prefix length is not a multiple of the prefix length of the first child line
    """
    tree_string = tree_string.strip("\n")
    if not len(tree_string):
        raise ValueError("Tree string does not contain any data, check `tree_string`")
    tree_list = tree_string.split("\n")
    tree_root = node_type(tree_list[0])

    # Build the prefix matcher once; prefixes are escaped so they are treated as literal text
    prefix_pattern = None
    if len(tree_prefix_list):
        prefix_pattern = re.compile(
            "|".join(re.escape(prefix) for prefix in tree_prefix_list)
        )

    # Infer prefix length
    prefix_length = None
    cur_parent = tree_root
    for node_str in tree_list[1:]:
        if prefix_pattern is not None:
            # Node name is whatever follows the last prefix on the line
            node_name = prefix_pattern.split(node_str)[-1].lstrip()
        else:
            # Without explicit prefixes, every non-ASCII character is treated as part of the prefix
            node_name = node_str.encode("ascii", "ignore").decode("ascii").lstrip()

        # Find node parent
        node_prefix_length = node_str.index(node_name)
        if not node_prefix_length:
            # A zero-length prefix would place the node at root level, which has no parent
            raise ValueError(
                f"Invalid prefix, prefix should be unicode character or whitespace, "
                f"otherwise specify one or more prefixes in `tree_prefix_list`, check: {node_str}"
            )
        if not prefix_length:
            prefix_length = node_prefix_length
        if node_prefix_length % prefix_length:
            raise ValueError(
                f"Tree string have different prefix length, check branch: {node_str}"
            )
        # Climb up until the current parent sits exactly one level above the new node
        while cur_parent.depth > node_prefix_length / prefix_length:
            cur_parent = cur_parent.parent

        # Link node
        child_node = node_type(node_name)
        child_node.parent = cur_parent
        cur_parent = child_node

    return tree_root
|
|
|
|
|
|
def list_to_tree(
    paths: list,
    sep: str = "/",
    duplicate_name_allowed: bool = True,
    node_type: Type[Node] = Node,
) -> Node:
    """Construct tree from list of path strings.

    The root node is named after the first component of the first path; every distinct path is
    then added with `add_path_to_tree`.

    Path should contain `Node` name, separated by `sep`.

    - For example: Path string "a/b" refers to Node("b") with parent Node("a").

    Path can start from root node `name`, or start with `sep`.

    - For example: Path string can be "/a/b" or "a/b", if sep is "/".

    All paths should start from the same root node.

    - For example: Path strings should be "a/b", "a/c", "a/b/d" etc. and should not start with another root node.

    >>> from bigtree import list_to_tree, print_tree
    >>> path_list = ["a/b", "a/c", "a/b/d", "a/b/e", "a/c/f", "a/b/e/g", "a/b/e/h"]
    >>> root = list_to_tree(path_list)
    >>> print_tree(root)
    a
    ├── b
    │   ├── d
    │   └── e
    │       ├── g
    │       └── h
    └── c
        └── f

    Args:
        paths (list): list containing path strings
        sep (str): path separator for input `paths` and created tree, defaults to `/`
        duplicate_name_allowed (bool): indicator if nodes with duplicated `Node` name is allowed, defaults to True
        node_type (Type[Node]): node type of tree to be created, defaults to Node

    Returns:
        (Node)

    Raises:
        ValueError: if `paths` is empty
    """
    if len(paths) == 0:
        raise ValueError("Path list does not contain any data, check `paths`")

    # Drop repeated paths while keeping first-seen order
    unique_paths = list(OrderedDict.fromkeys(paths))

    # The root is named after the first component of the first path
    first_component = unique_paths[0].lstrip(sep).split(sep)[0]
    root_node = node_type(first_component)
    root_node.sep = sep

    for single_path in unique_paths:
        add_path_to_tree(
            root_node,
            single_path,
            sep=sep,
            duplicate_name_allowed=duplicate_name_allowed,
        )

    # Separator is assigned again once the tree is fully built
    root_node.sep = sep
    return root_node
|
|
|
|
|
|
def list_to_tree_by_relation(
    relations: List[Tuple[str, str]],
    node_type: Type[Node] = Node,
) -> Node:
    """Construct tree from list of tuple containing parent-child names.

    The tuples are loaded into a two-column DataFrame (parent, child) and the tree is built by
    `dataframe_to_tree_by_relation`.

    Note that node names must be unique since tree is created from parent-child names,
    except for leaf nodes - names of leaf nodes may be repeated as there is no confusion.

    >>> from bigtree import list_to_tree_by_relation, print_tree
    >>> relations_list = [("a", "b"), ("a", "c"), ("b", "d"), ("b", "e"), ("c", "f"), ("e", "g"), ("e", "h")]
    >>> root = list_to_tree_by_relation(relations_list)
    >>> print_tree(root)
    a
    ├── b
    │   ├── d
    │   └── e
    │       ├── g
    │       └── h
    └── c
        └── f

    Args:
        relations (list): list containing tuple containing parent-child names
        node_type (Type[Node]): node type of tree to be created, defaults to Node

    Returns:
        (Node)

    Raises:
        ValueError: if `relations` is empty
    """
    if len(relations) == 0:
        raise ValueError("Path list does not contain any data, check `relations`")

    # Each tuple is (parent name, child name)
    relation_frame = pd.DataFrame(relations, columns=["parent", "child"])
    return dataframe_to_tree_by_relation(
        relation_frame,
        child_col="child",
        parent_col="parent",
        node_type=node_type,
    )
|
|
|
|
|
|
def dict_to_tree(
    path_attrs: dict,
    sep: str = "/",
    duplicate_name_allowed: bool = True,
    node_type: Type[Node] = Node,
) -> Node:
    """Construct tree from nested dictionary using path,
    ``key``: path, ``value``: dict of attribute name and attribute value.

    The dictionary is converted to a DataFrame whose first column ("PATH") holds the paths,
    and the tree is built by `dataframe_to_tree`.

    Path should contain `Node` name, separated by `sep`.

    - For example: Path string "a/b" refers to Node("b") with parent Node("a").

    Path can start from root node `name`, or start with `sep`.

    - For example: Path string can be "/a/b" or "a/b", if sep is "/".

    All paths should start from the same root node.

    - For example: Path strings should be "a/b", "a/c", "a/b/d" etc. and should not start with another root node.

    >>> from bigtree import dict_to_tree, print_tree
    >>> path_dict = {
    ...     "a": {"age": 90},
    ...     "a/b": {"age": 65},
    ...     "a/c": {"age": 60},
    ...     "a/b/d": {"age": 40},
    ...     "a/b/e": {"age": 35},
    ...     "a/c/f": {"age": 38},
    ...     "a/b/e/g": {"age": 10},
    ...     "a/b/e/h": {"age": 6},
    ... }
    >>> root = dict_to_tree(path_dict)
    >>> print_tree(root, attr_list=["age"])
    a [age=90]
    ├── b [age=65]
    │   ├── d [age=40]
    │   └── e [age=35]
    │       ├── g [age=10]
    │       └── h [age=6]
    └── c [age=60]
        └── f [age=38]

    Args:
        path_attrs (dict): dictionary containing path and node attribute information,
            key: path, value: dict of tree attribute and attribute value
        sep (str): path separator of input `path_attrs` and created tree, defaults to `/`
        duplicate_name_allowed (bool): indicator if nodes with duplicated `Node` name is allowed, defaults to True
        node_type (Type[Node]): node type of tree to be created, defaults to Node

    Returns:
        (Node)

    Raises:
        ValueError: if `path_attrs` is empty
    """
    if len(path_attrs) == 0:
        raise ValueError("Dictionary does not contain any data, check `path_attrs`")

    # One row per path, one column per attribute, paths exposed as column "PATH"
    attrs_by_path = pd.DataFrame(path_attrs).T
    path_frame = attrs_by_path.rename_axis("PATH").reset_index()
    return dataframe_to_tree(
        path_frame,
        sep=sep,
        duplicate_name_allowed=duplicate_name_allowed,
        node_type=node_type,
    )
|
|
|
|
|
|
def nested_dict_to_tree(
    node_attrs: dict,
    name_key: str = "name",
    child_key: str = "children",
    node_type: Type[Node] = Node,
) -> Node:
    """Construct tree from nested recursive dictionary.

    - ``key``: `name_key`, `child_key`, or any attributes key.
    - ``value`` of `name_key` (str): node name.
    - ``value`` of `child_key` (list): list of dict containing `name_key` and `child_key` (recursive).

    Every key other than `name_key` and `child_key` is passed to `node_type` as a keyword argument.
    The input dictionaries are copied before keys are removed, so they are not modified.

    >>> from bigtree import nested_dict_to_tree, print_tree
    >>> path_dict = {
    ...     "name": "a",
    ...     "age": 90,
    ...     "children": [
    ...         {"name": "b",
    ...          "age": 65,
    ...          "children": [
    ...              {"name": "d", "age": 40},
    ...              {"name": "e", "age": 35, "children": [
    ...                  {"name": "g", "age": 10},
    ...              ]},
    ...          ]},
    ...     ],
    ... }
    >>> root = nested_dict_to_tree(path_dict)
    >>> print_tree(root, attr_list=["age"])
    a [age=90]
    └── b [age=65]
        ├── d [age=40]
        └── e [age=35]
            └── g [age=10]

    Args:
        node_attrs (dict): dictionary containing node, children, and node attribute information,
            key: `name_key` and `child_key`
            value of `name_key` (str): node name
            value of `child_key` (list): list of dict containing `name_key` and `child_key` (recursive)
        name_key (str): key of node name, value is type str
        child_key (str): key of child list, value is type list
        node_type (Type[Node]): node type of tree to be created, defaults to Node

    Returns:
        (Node)
    """

    def _build_subtree(spec, parent=None):
        # Shallow copy so popping keys leaves the caller's dictionary intact
        attrs = spec.copy()
        name = attrs.pop(name_key)
        children = attrs.pop(child_key, [])
        # Whatever is left in `attrs` becomes keyword arguments of the node
        new_node = node_type(name, parent=parent, **attrs)
        for child_spec in children:
            _build_subtree(child_spec, parent=new_node)
        return new_node

    return _build_subtree(node_attrs)
|
|
|
|
|
|
def dataframe_to_tree(
    data: pd.DataFrame,
    path_col: str = "",
    attribute_cols: list = [],
    sep: str = "/",
    duplicate_name_allowed: bool = True,
    node_type: Type[Node] = Node,
) -> Node:
    """Construct tree from pandas DataFrame using path, return root of tree.

    `path_col` and `attribute_cols` specify columns for node path and attributes to construct tree.
    If columns are not specified, `path_col` takes first column and all other columns are `attribute_cols`.
    The input `data` is not modified.

    Path in path column can start from root node `name`, or start with `sep`.

    - For example: Path string can be "/a/b" or "a/b", if sep is "/".

    Path in path column should contain `Node` name, separated by `sep`.

    - For example: Path string "a/b" refers to Node("b") with parent Node("a").

    All paths should start from the same root node.

    - For example: Path strings should be "a/b", "a/c", "a/b/d" etc. and should not start with another root node.

    >>> import pandas as pd
    >>> from bigtree import dataframe_to_tree, print_tree
    >>> path_data = pd.DataFrame([
    ...     ["a", 90],
    ...     ["a/b", 65],
    ...     ["a/c", 60],
    ...     ["a/b/d", 40],
    ...     ["a/b/e", 35],
    ...     ["a/c/f", 38],
    ...     ["a/b/e/g", 10],
    ...     ["a/b/e/h", 6],
    ... ],
    ...     columns=["PATH", "age"]
    ... )
    >>> root = dataframe_to_tree(path_data)
    >>> print_tree(root, attr_list=["age"])
    a [age=90]
    ├── b [age=65]
    │   ├── d [age=40]
    │   └── e [age=35]
    │       ├── g [age=10]
    │       └── h [age=6]
    └── c [age=60]
        └── f [age=38]

    Args:
        data (pandas.DataFrame): data containing path and node attribute information
        path_col (str): column of data containing `path_name` information,
            if not set, it will take the first column of data
        attribute_cols (list): columns of data containing node attribute information,
            if not set, it will take all columns of data except `path_col`
        sep (str): path separator of input `path_col` and created tree, defaults to `/`
        duplicate_name_allowed (bool): indicator if nodes with duplicated `Node` name is allowed, defaults to True
        node_type (Type[Node]): node type of tree to be created, defaults to Node

    Returns:
        (Node)

    Raises:
        ValueError: if `data` has no columns or no rows, or if the same path appears with different attributes
    """
    if not len(data.columns):
        raise ValueError("Data does not contain any columns, check `data`")
    if not len(data):
        raise ValueError("Data does not contain any rows, check `data`")

    if not path_col:
        path_col = data.columns[0]
    if not len(attribute_cols):
        attribute_cols = list(data.columns)
        attribute_cols.remove(path_col)

    # Work on a private copy limited to the requested columns so the caller's DataFrame
    # is left untouched
    data = data[[path_col] + list(attribute_cols)].copy()
    data[path_col] = data[path_col].str.strip(sep)

    # Identical rows are fine; the same path with different attributes is ambiguous.
    # Values are compared as strings so unhashable attribute values can be de-duplicated.
    data2 = data.astype(str).drop_duplicates()
    _duplicate_check = (
        data2[path_col]
        .value_counts()
        .to_frame("counts")
        .rename_axis(path_col)
        .reset_index()
    )
    _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1]
    if len(_duplicate_check):
        raise ValueError(
            f"There exists duplicate path with different attributes\nCheck {_duplicate_check}"
        )

    # The root is named after the first component of the first path
    root_name = data[path_col].values[0].split(sep)[0]
    root_node = node_type(root_name)
    # Column choices are forwarded explicitly so a path column that is not the first
    # column of `data` is still honoured
    add_dataframe_to_tree_by_path(
        root_node,
        data,
        path_col=path_col,
        attribute_cols=attribute_cols,
        sep=sep,
        duplicate_name_allowed=duplicate_name_allowed,
    )
    root_node.sep = sep
    return root_node
|
|
|
|
|
|
def dataframe_to_tree_by_relation(
    data: pd.DataFrame,
    child_col: str = "",
    parent_col: str = "",
    attribute_cols: list = [],
    node_type: Type[Node] = Node,
) -> Node:
    """Construct tree from pandas DataFrame using parent and child names, return root of tree.

    Note that node names must be unique since tree is created from parent-child names,
    except for leaf nodes - names of leaf nodes may be repeated as there is no confusion.

    `child_col` and `parent_col` specify columns for child name and parent name to construct tree.
    `attribute_cols` specify columns for node attribute for child name.
    If columns are not specified, `child_col` takes first column, `parent_col` takes second column, and all other
    columns are `attribute_cols`. Only `attribute_cols` are added as node attributes.

    The root node is the child of the row whose parent is null; if there is no such row, it is the
    single parent name that never appears as a child.

    >>> import pandas as pd
    >>> from bigtree import dataframe_to_tree_by_relation, print_tree
    >>> relation_data = pd.DataFrame([
    ...     ["a", None, 90],
    ...     ["b", "a", 65],
    ...     ["c", "a", 60],
    ...     ["d", "b", 40],
    ...     ["e", "b", 35],
    ...     ["f", "c", 38],
    ...     ["g", "e", 10],
    ...     ["h", "e", 6],
    ... ],
    ...     columns=["child", "parent", "age"]
    ... )
    >>> root = dataframe_to_tree_by_relation(relation_data)
    >>> print_tree(root, attr_list=["age"])
    a [age=90]
    ├── b [age=65]
    │   ├── d [age=40]
    │   └── e [age=35]
    │       ├── g [age=10]
    │       └── h [age=6]
    └── c [age=60]
        └── f [age=38]

    Args:
        data (pandas.DataFrame): data containing path and node attribute information
        child_col (str): column of data containing child name information,
            if not set, it will take the first column of data
        parent_col (str): column of data containing parent name information,
            if not set, it will take the second column of data
        attribute_cols (list): columns of data containing node attribute information,
            if not set, it will take all columns of data except `child_col` and `parent_col`
        node_type (Type[Node]): node type of tree to be created, defaults to Node

    Returns:
        (Node)

    Raises:
        ValueError: if `data` has no columns or no rows, if a non-leaf child name appears under
            different parents, or if the root node cannot be determined uniquely
    """
    if not len(data.columns):
        raise ValueError("Data does not contain any columns, check `data`")
    if not len(data):
        raise ValueError("Data does not contain any rows, check `data`")

    if not child_col:
        child_col = data.columns[0]
    if not parent_col:
        parent_col = data.columns[1]
    if not len(attribute_cols):
        attribute_cols = list(data.columns)
        attribute_cols.remove(child_col)
        attribute_cols.remove(parent_col)

    # Work on a private copy limited to the relevant columns (duplicates removed, order kept)
    # so columns outside `attribute_cols` never become node attributes. Resetting the index
    # keeps `to_dict(orient="index")` usable when the input index contains duplicates.
    keep_cols = list(OrderedDict.fromkeys([child_col, parent_col, *attribute_cols]))
    data = data[keep_cols].copy().reset_index(drop=True)

    data_check = data[[child_col, parent_col]].drop_duplicates()
    # Filter for child nodes that are parent of other nodes
    data_check = data_check[data_check[child_col].isin(data_check[parent_col])]
    _duplicate_check = (
        data_check[child_col]
        .value_counts()
        .to_frame("counts")
        .rename_axis(child_col)
        .reset_index()
    )
    _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1]
    if len(_duplicate_check):
        raise ValueError(
            f"There exists duplicate child with different parent where the child is also a parent node.\n"
            f"Duplicated node names should not happen, but can only exist in leaf nodes to avoid confusion.\n"
            f"Check {_duplicate_check}"
        )

    # If parent-child contains None -> root
    root_row = data[data[parent_col].isnull()]
    root_names = list(root_row[child_col])
    if not len(root_names):
        # Otherwise the root is the parent that is never listed as a child
        root_names = list(set(data[parent_col]) - set(data[child_col]))
    if len(root_names) != 1:
        raise ValueError(f"Unable to determine root node\nCheck {root_names}")
    root_name = root_names[0]
    root_node = node_type(root_name)

    def retrieve_attr(row):
        """Turn one data row into node keyword arguments: `name` plus non-null attributes."""
        node_attrs = row.copy()
        # Remove the relation columns first, then set "name"; this keeps the name intact
        # even when `child_col` or `parent_col` is itself called "name"
        node_name = node_attrs.pop(child_col)
        node_attrs.pop(parent_col, None)
        node_attrs["name"] = node_name
        # Null values are skipped rather than stored as attributes
        return {k: v for k, v in node_attrs.items() if not np.all(pd.isnull(v))}

    def recursive_create_child(parent_node):
        """Create every node whose parent name matches `parent_node`, then descend into it."""
        child_rows = data[data[parent_col] == parent_node.node_name]

        for row in child_rows.to_dict(orient="index").values():
            child_node = node_type(**retrieve_attr(row))
            child_node.parent = parent_node
            recursive_create_child(child_node)

    # Create root node attributes
    if len(root_row):
        row = list(root_row.to_dict(orient="index").values())[0]
        root_node.set_attrs(retrieve_attr(row))
    recursive_create_child(root_node)
    return root_node
|