1328 lines
48 KiB
Python
1328 lines
48 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from collections import OrderedDict, defaultdict
|
|
from typing import Any, Dict, Iterable, List, Optional, Tuple, Type
|
|
|
|
from bigtree.node.node import Node
|
|
from bigtree.tree.export import tree_to_dataframe
|
|
from bigtree.tree.search import find_child_by_name, find_name
|
|
from bigtree.utils.constants import NewickCharacter, NewickState
|
|
from bigtree.utils.exceptions import (
|
|
DuplicatedNodeError,
|
|
TreeError,
|
|
optional_dependencies_pandas,
|
|
)
|
|
|
|
try:
|
|
import pandas as pd
|
|
except ImportError: # pragma: no cover
|
|
pd = None
|
|
|
|
# Names exported by ``from <module> import *``: the ``add_*`` helpers extend an
# existing tree, the ``*_to_tree`` helpers build a new tree from another representation.
__all__ = [
    "add_path_to_tree",
    "add_dict_to_tree_by_path",
    "add_dict_to_tree_by_name",
    "add_dataframe_to_tree_by_path",
    "add_dataframe_to_tree_by_name",
    "str_to_tree",
    "list_to_tree",
    "list_to_tree_by_relation",
    "dict_to_tree",
    "nested_dict_to_tree",
    "dataframe_to_tree",
    "dataframe_to_tree_by_relation",
    "newick_to_tree",
]
|
|
|
|
|
|
def add_path_to_tree(
    tree: Node,
    path: str,
    sep: str = "/",
    duplicate_name_allowed: bool = True,
    node_attrs: Dict[str, Any] = {},
) -> Node:
    """Add nodes and attributes to existing tree *in-place*, return node of path added.

    Path should contain ``Node`` name, separated by `sep`.

    - For example: Path string "a/b" refers to Node("b") with parent Node("a").
    - Path separator `sep` is for the input `path` and can differ from existing tree.

    Path can start from root node `name`, or start with `sep`.

    - For example: Path string can be "/a/b" or "a/b", if sep is "/".

    The path must start from the root node of the existing tree.

    - For example: Path strings should be "a/b", "a/c", "a/b/d" etc., and should not start with another root node.

    Intermediate nodes that do not exist yet are created without attributes;
    `node_attrs` is applied to the last node of the path only.
    The `node_attrs` dictionary passed in is never modified.

    Examples:
        >>> from bigtree import add_path_to_tree, Node
        >>> root = Node("a")
        >>> add_path_to_tree(root, "a/b/c")
        Node(/a/b/c, )
        >>> root.show()
        a
        └── b
            └── c

    Args:
        tree (Node): existing tree
        path (str): path to be added to tree
        sep (str): path separator for input `path`
        duplicate_name_allowed (bool): indicator if nodes with duplicate ``Node`` name is allowed, defaults to True
        node_attrs (Dict[str, Any]): attributes to add to node, key: attribute name, value: attribute value, optional

    Returns:
        (Node)

    Raises:
        ValueError: if `path` is empty
        TreeError: if the first component of `path` is not the name of the tree root
        DuplicatedNodeError: if `duplicate_name_allowed` is False and a node with the same
            name already exists at a different path
    """
    if not len(path):
        raise ValueError("Path is empty, check `path`")

    # Work on a private copy: the "name" key may be popped below, and neither the
    # caller's dictionary nor the shared default argument must be altered by that.
    node_attrs = dict(node_attrs)

    tree_root = tree.root
    tree_sep = tree_root.sep
    node_type = tree_root.__class__
    branch = path.lstrip(sep).rstrip(sep).split(sep)
    if branch[0] != tree_root.node_name:
        raise TreeError(
            f"Path does not have same root node, expected {tree_root.node_name}, received {branch[0]}\n"
            "Check your input paths or verify that path separator `sep` is set correctly"
        )

    # Grow tree, one path component at a time (component 0 is the root itself)
    node = tree_root
    parent_node = tree_root
    last_idx = len(branch) - 1
    for idx in range(1, len(branch)):
        node_name = branch[idx]
        # Path of this component expressed with the tree's own separator
        node_path = tree_sep.join(branch[: idx + 1])
        if not duplicate_name_allowed:
            # Search by name from the root; a hit at any other path is a conflict
            node = find_name(tree_root, node_name)
            if node and not node.path_name.endswith(node_path):
                raise DuplicatedNodeError(
                    f"Node {node_name} already exists, try setting `duplicate_name_allowed` to True "
                    "to allow `Node` with same node name"
                )
        else:
            node = find_child_by_name(parent_node, node_name)
        if not node:
            if idx == last_idx:
                # A "name" attribute, if given, overrides the name taken from the path
                node_name = node_attrs.pop("name", node_name)
                node = node_type(node_name, **node_attrs)
            else:
                node = node_type(node_name)
            node.parent = parent_node
        parent_node = node
    # Also covers the cases where the final node already existed (or is the root)
    node.set_attrs(node_attrs)
    return node
|
|
|
|
|
|
def add_dict_to_tree_by_path(
    tree: Node,
    path_attrs: Dict[str, Dict[str, Any]],
    sep: str = "/",
    duplicate_name_allowed: bool = True,
) -> Node:
    """Grow an existing tree *in-place* from a dictionary of paths, return root of tree.

    Each ``key`` of `path_attrs` is a node path and each ``value`` is a dictionary of
    attribute name to attribute value for the node at that path.

    Path should contain ``Node`` name, separated by `sep`.

    - For example: Path string "a/b" refers to Node("b") with parent Node("a").
    - Path separator `sep` is for the input `path` and can differ from existing tree.

    Path can start from root node `name`, or start with `sep`.

    - For example: Path string can be "/a/b" or "a/b", if sep is "/".

    All paths should start from the same root node.

    - For example: Path strings should be "a/b", "a/c", "a/b/d" etc. and should not start with another root node.

    Examples:
        >>> from bigtree import Node, add_dict_to_tree_by_path
        >>> root = Node("a")
        >>> path_dict = {
        ...     "a": {"age": 90},
        ...     "a/b": {"age": 65},
        ...     "a/c": {"age": 60},
        ...     "a/b/d": {"age": 40},
        ...     "a/b/e": {"age": 35},
        ...     "a/c/f": {"age": 38},
        ...     "a/b/e/g": {"age": 10},
        ...     "a/b/e/h": {"age": 6},
        ... }
        >>> root = add_dict_to_tree_by_path(root, path_dict)
        >>> root.show()
        a
        ├── b
        │   ├── d
        │   └── e
        │       ├── g
        │       └── h
        └── c
            └── f

    Args:
        tree (Node): existing tree
        path_attrs (Dict[str, Dict[str, Any]]): dictionary containing node path and attribute information,
            key: node path, value: dict of node attribute name and attribute value
        sep (str): path separator for input `path_attrs`
        duplicate_name_allowed (bool): indicator if nodes with duplicate ``Node`` name is allowed, defaults to True

    Returns:
        (Node)

    Raises:
        ValueError: if `path_attrs` is empty
    """
    if len(path_attrs) == 0:
        raise ValueError("Dictionary does not contain any data, check `path_attrs`")

    root = tree.root
    # Every entry is delegated to the single-path helper, always starting from the root
    for node_path, attrs in path_attrs.items():
        add_path_to_tree(
            root,
            node_path,
            sep=sep,
            duplicate_name_allowed=duplicate_name_allowed,
            node_attrs=attrs,
        )
    return root
|
|
|
|
|
|
@optional_dependencies_pandas
def add_dict_to_tree_by_name(
    tree: Node, name_attrs: Dict[str, Dict[str, Any]], join_type: str = "left"
) -> Node:
    """Add attributes to tree by node name, return *new* root of tree.

    Each ``key`` of `name_attrs` is a node name and each ``value`` is a dictionary of
    attribute name to attribute value.

    Depending on `join_type`, the result holds all existing tree nodes or only the tree nodes
    whose names are keys of the input dictionary.
    Input dictionary keys that are not existing node names will be ignored.
    Note that if multiple nodes have the same name, attributes will be added to all nodes sharing the same name.

    Examples:
        >>> from bigtree import Node, add_dict_to_tree_by_name
        >>> root = Node("a")
        >>> b = Node("b", parent=root)
        >>> name_dict = {
        ...     "a": {"age": 90},
        ...     "b": {"age": 65},
        ... }
        >>> root = add_dict_to_tree_by_name(root, name_dict)
        >>> root.show(attr_list=["age"])
        a [age=90]
        └── b [age=65]

    Args:
        tree (Node): existing tree
        name_attrs (Dict[str, Dict[str, Any]]): dictionary containing node name and attribute information,
            key: node name, value: dict of node attribute name and attribute value
        join_type (str): join type with attribute, default of 'left' takes existing tree nodes,
            if join_type is set to 'inner' it will only take tree nodes that are in `name_attrs` key and drop others

    Returns:
        (Node)

    Raises:
        ValueError: if `join_type` is not 'inner' or 'left', or if `name_attrs` is empty
    """
    if join_type not in ("inner", "left"):
        raise ValueError("`join_type` must be one of 'inner' or 'left'")

    if len(name_attrs) == 0:
        raise ValueError("Dictionary does not contain any data, check `name_attrs`")

    # One row per node name: transpose so the dictionary keys become the index,
    # then expose that index as a regular "NAME" column.
    attrs_by_name = pd.DataFrame(name_attrs).T
    data = attrs_by_name.rename_axis("NAME").reset_index()
    return add_dataframe_to_tree_by_name(tree, data=data, join_type=join_type)
|
|
|
|
|
|
def add_dataframe_to_tree_by_path(
    tree: Node,
    data: pd.DataFrame,
    path_col: str = "",
    attribute_cols: List[str] = [],
    sep: str = "/",
    duplicate_name_allowed: bool = True,
) -> Node:
    """Add nodes and attributes to tree *in-place*, return root of tree.

    `path_col` and `attribute_cols` specify columns for node path and attributes to add to existing tree.
    If columns are not specified, `path_col` takes first column and all other columns are `attribute_cols`.
    Only the columns in `attribute_cols` are added as node attributes.

    Path in path column should contain ``Node`` name, separated by `sep`.

    - For example: Path string "a/b" refers to Node("b") with parent Node("a").
    - Path separator `sep` is for the input `path` and can differ from existing tree.

    Path in path column can start from root node `name`, or start with `sep`.

    - For example: Path string can be "/a/b" or "a/b", if sep is "/".

    All paths should start from the same root node.

    - For example: Path strings should be "a/b", "a/c", "a/b/d" etc. and should not start with another root node.

    Examples:
        >>> import pandas as pd
        >>> from bigtree import add_dataframe_to_tree_by_path, Node
        >>> root = Node("a")
        >>> path_data = pd.DataFrame([
        ...     ["a", 90],
        ...     ["a/b", 65],
        ...     ["a/c", 60],
        ...     ["a/b/d", 40],
        ...     ["a/b/e", 35],
        ...     ["a/c/f", 38],
        ...     ["a/b/e/g", 10],
        ...     ["a/b/e/h", 6],
        ... ],
        ...     columns=["PATH", "age"]
        ... )
        >>> root = add_dataframe_to_tree_by_path(root, path_data)
        >>> root.show(attr_list=["age"])
        a [age=90]
        ├── b [age=65]
        │   ├── d [age=40]
        │   └── e [age=35]
        │       ├── g [age=10]
        │       └── h [age=6]
        └── c [age=60]
            └── f [age=38]

    Args:
        tree (Node): existing tree
        data (pd.DataFrame): data containing node path and attribute information
        path_col (str): column of data containing `path_name` information,
            if not set, it will take the first column of data
        attribute_cols (List[str]): columns of data containing node attribute information,
            if not set, it will take all columns of data except `path_col`
        sep (str): path separator for input `path_col`
        duplicate_name_allowed (bool): indicator if nodes with duplicate ``Node`` name is allowed, defaults to True

    Returns:
        (Node)

    Raises:
        ValueError: if `data` has no columns or no rows, or if the same path appears with
            different attribute values
    """
    # Copy so that normalising the path column below does not touch the caller's frame
    data = data.copy()

    if not len(data.columns):
        raise ValueError("Data does not contain any columns, check `data`")
    if not len(data):
        raise ValueError("Data does not contain any rows, check `data`")

    if not path_col:
        path_col = data.columns[0]
    if not len(attribute_cols):
        # Rebinds to a fresh list, so the default argument is never mutated
        attribute_cols = list(data.columns)
        attribute_cols.remove(path_col)

    tree_root = tree.root
    data[path_col] = data[path_col].str.lstrip(sep).str.rstrip(sep)

    # A path may be repeated only if all of its attribute values are identical.
    # Values are compared as strings so unhashable cell values do not break the check.
    data2 = data[[path_col] + attribute_cols].astype(str).drop_duplicates()
    _duplicate_check = (
        data2[path_col]
        .value_counts()
        .to_frame("counts")
        .rename_axis(path_col)
        .reset_index()
    )
    _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1]
    if len(_duplicate_check):
        raise ValueError(
            f"There exists duplicate path with different attributes\nCheck {_duplicate_check}"
        )

    for row in data.to_dict(orient="index").values():
        # Restrict to the requested attribute columns (previously every non-path
        # column was attached, ignoring `attribute_cols`); None values are skipped.
        node_attrs = {
            col: row[col]
            for col in attribute_cols
            if col != path_col and row[col] is not None
        }
        add_path_to_tree(
            tree_root,
            row[path_col],
            sep=sep,
            duplicate_name_allowed=duplicate_name_allowed,
            node_attrs=node_attrs,
        )
    return tree_root
|
|
|
|
|
|
@optional_dependencies_pandas
def add_dataframe_to_tree_by_name(
    tree: Node,
    data: pd.DataFrame,
    name_col: str = "",
    attribute_cols: List[str] = [],
    join_type: str = "left",
) -> Node:
    """Add attributes to tree, return *new* root of tree.

    `name_col` and `attribute_cols` specify columns for node name and attributes to add to existing tree.
    If columns are not specified, the first column will be taken as name column and all other columns as attributes.
    Only the columns in `attribute_cols` are added as node attributes; existing node attributes
    with the same names are replaced.

    Function can return all existing tree nodes or only tree nodes that are in the input data node names.
    Input data node names that are not existing node names will be ignored.
    Note that if multiple nodes have the same name, attributes will be added to all nodes sharing same name.

    Examples:
        >>> import pandas as pd
        >>> from bigtree import add_dataframe_to_tree_by_name, Node
        >>> root = Node("a")
        >>> b = Node("b", parent=root)
        >>> name_data = pd.DataFrame([
        ...     ["a", 90],
        ...     ["b", 65],
        ... ],
        ...     columns=["NAME", "age"]
        ... )
        >>> root = add_dataframe_to_tree_by_name(root, name_data)
        >>> root.show(attr_list=["age"])
        a [age=90]
        └── b [age=65]

    Args:
        tree (Node): existing tree
        data (pd.DataFrame): data containing node name and attribute information
        name_col (str): column of data containing `name` information,
            if not set, it will take the first column of data
        attribute_cols (List[str]): column(s) of data containing node attribute information,
            if not set, it will take all columns of data except `name_col`
        join_type (str): join type with attribute, default of 'left' takes existing tree nodes,
            if join_type is set to 'inner' it will only take tree nodes with attributes and drop the other nodes

    Returns:
        (Node)

    Raises:
        ValueError: if `join_type` is not 'inner' or 'left', if `data` has no columns or no rows,
            or if the same name appears with different attribute values
    """
    data = data.copy()

    if join_type not in ["inner", "left"]:
        raise ValueError("`join_type` must be one of 'inner' or 'left'")

    if not len(data.columns):
        raise ValueError("Data does not contain any columns, check `data`")
    if not len(data):
        raise ValueError("Data does not contain any rows, check `data`")

    if not name_col:
        name_col = data.columns[0]
    if not len(attribute_cols):
        # Rebinds to a fresh list, so the default argument is never mutated
        attribute_cols = list(data.columns)
        attribute_cols.remove(name_col)

    # Attribute data: only the name column and the requested attribute columns take part,
    # so unrelated columns of `data` can neither leak into the tree nor clash in the merge.
    path_col = "PATH"
    attribute_data = data[[name_col] + attribute_cols]

    # A name may be repeated only if all of its attribute values are identical.
    # Values are compared as strings so unhashable cell values do not break the check.
    data2 = attribute_data.astype(str).drop_duplicates()
    _duplicate_check = (
        data2[name_col]
        .value_counts()
        .to_frame("counts")
        .rename_axis(name_col)
        .reset_index()
    )
    _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1]
    if len(_duplicate_check):
        raise ValueError(
            f"There exists duplicate name with different attributes\nCheck {_duplicate_check}"
        )

    # Tree data: flatten the existing tree into one row per node (name, path, attributes)
    tree_root = tree.root
    sep = tree_root.sep
    node_type = tree_root.__class__
    data_tree = tree_to_dataframe(
        tree_root, name_col=name_col, path_col=path_col, all_attrs=True
    )
    # Attributes being supplied replace the ones already on the tree
    common_cols = list(set(data_tree.columns).intersection(attribute_cols))
    data_tree = data_tree.drop(columns=common_cols)

    # Join on node name, then rebuild a new tree from the paths
    data_tree_attrs = pd.merge(data_tree, attribute_data, on=name_col, how=join_type)
    data_tree_attrs = data_tree_attrs.drop(columns=name_col)

    return dataframe_to_tree(
        data_tree_attrs, path_col=path_col, sep=sep, node_type=node_type
    )
|
|
|
|
|
|
def str_to_tree(
    tree_string: str,
    tree_prefix_list: List[str] = [],
    node_type: Type[Node] = Node,
) -> Node:
    r"""Construct tree from tree string

    The first line of the string is the root node. Every following line is one node, and its
    depth is derived from the length of the prefix in front of the node name, which must be a
    multiple of the prefix length of the first child line.

    Entries of `tree_prefix_list` are matched literally, so prefixes containing characters
    that are special in regular expressions (such as ``+--`` or ``|--``) can be used as they are.

    Examples:
        >>> from bigtree import str_to_tree
        >>> tree_str = 'a\n├── b\n│   ├── d\n│   └── e\n│       ├── g\n│       └── h\n└── c\n    └── f'
        >>> root = str_to_tree(tree_str, tree_prefix_list=["├──", "└──"])
        >>> root.show()
        a
        ├── b
        │   ├── d
        │   └── e
        │       ├── g
        │       └── h
        └── c
            └── f

    Args:
        tree_string (str): String to construct tree
        tree_prefix_list (List[str]): List of prefix to mark the end of tree branch/stem and start of node name, optional.
            If not specified, it will infer unicode characters and whitespace as prefix.
        node_type (Type[Node]): node type of tree to be created, defaults to ``Node``

    Returns:
        (Node)

    Raises:
        ValueError: if `tree_string` is empty, if the first child line has no prefix, or if a
            line has a prefix length that is not a multiple of the first prefix length
    """
    tree_string = tree_string.strip("\n")
    if not len(tree_string):
        raise ValueError("Tree string does not contain any data, check `tree_string`")
    tree_list = tree_string.split("\n")
    tree_root = node_type(tree_list[0])

    # Build the prefix matcher once; escape each prefix so it is matched as literal text
    prefix_pattern = None
    if len(tree_prefix_list):
        prefix_pattern = re.compile(
            "|".join(re.escape(prefix) for prefix in tree_prefix_list)
        )

    # Infer prefix length
    prefix_length = None
    cur_parent = tree_root
    for node_str in tree_list[1:]:
        if prefix_pattern is not None:
            # Node name is whatever follows the last prefix on the line
            node_name = prefix_pattern.split(node_str)[-1].lstrip()
        else:
            # Drop non-ASCII (box-drawing) characters, then the leading whitespace
            node_name = node_str.encode("ascii", "ignore").decode("ascii").lstrip()

        # Find node parent
        if not prefix_length:
            # The first child line defines the width of one indentation level
            prefix_length = node_str.index(node_name)
            if not prefix_length:
                raise ValueError(
                    "Invalid prefix, prefix should be unicode character or whitespace, "
                    f"otherwise specify one or more prefixes in `tree_prefix_list`, check: {node_str}"
                )
        node_prefix_length = node_str.index(node_name)
        if node_prefix_length % prefix_length:
            raise ValueError(
                f"Tree string have different prefix length, check branch: {node_str}"
            )
        # Walk back up until the current parent sits one level above the new node
        while cur_parent.depth > node_prefix_length / prefix_length:
            cur_parent = cur_parent.parent

        # Link node
        child_node = node_type(node_name)
        child_node.parent = cur_parent
        cur_parent = child_node

    return tree_root
|
|
|
|
|
|
def list_to_tree(
    paths: Iterable[str],
    sep: str = "/",
    duplicate_name_allowed: bool = True,
    node_type: Type[Node] = Node,
) -> Node:
    """Construct tree from list of path strings.

    Path should contain ``Node`` name, separated by `sep`.

    - For example: Path string "a/b" refers to Node("b") with parent Node("a").

    Path can start from root node `name`, or start with `sep`.

    - For example: Path string can be "/a/b" or "a/b", if sep is "/".

    All paths should start from the same root node.

    - For example: Path strings should be "a/b", "a/c", "a/b/d" etc. and should not start with another root node.

    `paths` may be any iterable, including a one-shot iterator or generator; it is consumed once.

    Examples:
        >>> from bigtree import list_to_tree
        >>> path_list = ["a/b", "a/c", "a/b/d", "a/b/e", "a/c/f", "a/b/e/g", "a/b/e/h"]
        >>> root = list_to_tree(path_list)
        >>> root.show()
        a
        ├── b
        │   ├── d
        │   └── e
        │       ├── g
        │       └── h
        └── c
            └── f

    Args:
        paths (Iterable[str]): list containing path strings
        sep (str): path separator for input `paths` and created tree, defaults to `/`
        duplicate_name_allowed (bool): indicator if nodes with duplicate ``Node`` name is allowed, defaults to True
        node_type (Type[Node]): node type of tree to be created, defaults to ``Node``

    Returns:
        (Node)

    Raises:
        ValueError: if `paths` is None or yields no path
    """
    # Remove duplicates (keeping first-seen order) BEFORE testing for emptiness:
    # a generator is always truthy, so testing the raw input would let an empty
    # one through and fail later with an IndexError on `paths[0]`.
    paths = [] if paths is None else list(OrderedDict.fromkeys(paths))
    if not paths:
        raise ValueError("Path list does not contain any data, check `paths`")

    # Construct root node from the first component of the first path
    root_name = paths[0].lstrip(sep).split(sep)[0]
    root_node = node_type(root_name)
    # The separator is set before growing the tree because add_path_to_tree reads it from the root
    root_node.sep = sep

    for path in paths:
        add_path_to_tree(
            root_node, path, sep=sep, duplicate_name_allowed=duplicate_name_allowed
        )
    # Kept from the original implementation: re-assign the separator on the finished tree
    root_node.sep = sep
    return root_node
|
|
|
|
|
|
@optional_dependencies_pandas
def list_to_tree_by_relation(
    relations: Iterable[Tuple[str, str]],
    allow_duplicates: bool = False,
    node_type: Type[Node] = Node,
) -> Node:
    """Construct tree from list of (parent name, child name) tuples.

    Since tree is created from parent-child names, only names of leaf nodes may be repeated.
    Error will be thrown if names of intermediate nodes are repeated as there will be confusion.
    This error can be ignored by setting `allow_duplicates` to be True.

    Examples:
        >>> from bigtree import list_to_tree_by_relation
        >>> relations_list = [("a", "b"), ("a", "c"), ("b", "d"), ("b", "e"), ("c", "f"), ("e", "g"), ("e", "h")]
        >>> root = list_to_tree_by_relation(relations_list)
        >>> root.show()
        a
        ├── b
        │   ├── d
        │   └── e
        │       ├── g
        │       └── h
        └── c
            └── f

    Args:
        relations (Iterable[Tuple[str, str]]): list containing tuple containing parent-child names
        allow_duplicates (bool): allow duplicate intermediate nodes such that child node will
            be tagged to multiple parent nodes, defaults to False
        node_type (Type[Node]): node type of tree to be created, defaults to ``Node``

    Returns:
        (Node)

    Raises:
        ValueError: if `relations` is empty
    """
    if not relations:
        raise ValueError("Path list does not contain any data, check `relations`")

    # Each tuple is (parent, child); the relation-based DataFrame builder does the rest
    parent_col, child_col = "parent", "child"
    relation_data = pd.DataFrame(relations, columns=[parent_col, child_col])
    return dataframe_to_tree_by_relation(
        relation_data,
        child_col=child_col,
        parent_col=parent_col,
        allow_duplicates=allow_duplicates,
        node_type=node_type,
    )
|
|
|
|
|
|
@optional_dependencies_pandas
def dict_to_tree(
    path_attrs: Dict[str, Any],
    sep: str = "/",
    duplicate_name_allowed: bool = True,
    node_type: Type[Node] = Node,
) -> Node:
    """Construct tree from a dictionary keyed by path,
    ``key``: path, ``value``: dict of attribute name and attribute value.

    Path should contain ``Node`` name, separated by `sep`.

    - For example: Path string "a/b" refers to Node("b") with parent Node("a").

    Path can start from root node `name`, or start with `sep`.

    - For example: Path string can be "/a/b" or "a/b", if sep is "/".

    All paths should start from the same root node.

    - For example: Path strings should be "a/b", "a/c", "a/b/d" etc. and should not start with another root node.

    Examples:
        >>> from bigtree import dict_to_tree
        >>> path_dict = {
        ...     "a": {"age": 90},
        ...     "a/b": {"age": 65},
        ...     "a/c": {"age": 60},
        ...     "a/b/d": {"age": 40},
        ...     "a/b/e": {"age": 35},
        ...     "a/c/f": {"age": 38},
        ...     "a/b/e/g": {"age": 10},
        ...     "a/b/e/h": {"age": 6},
        ... }
        >>> root = dict_to_tree(path_dict)
        >>> root.show(attr_list=["age"])
        a [age=90]
        ├── b [age=65]
        │   ├── d [age=40]
        │   └── e [age=35]
        │       ├── g [age=10]
        │       └── h [age=6]
        └── c [age=60]
            └── f [age=38]

    Args:
        path_attrs (Dict[str, Any]): dictionary containing path and node attribute information,
            key: path, value: dict of tree attribute and attribute value
        sep (str): path separator of input `path_attrs` and created tree, defaults to `/`
        duplicate_name_allowed (bool): indicator if nodes with duplicate ``Node`` name is allowed, defaults to True
        node_type (Type[Node]): node type of tree to be created, defaults to ``Node``

    Returns:
        (Node)

    Raises:
        ValueError: if `path_attrs` is empty
    """
    if len(path_attrs) == 0:
        raise ValueError("Dictionary does not contain any data, check `path_attrs`")

    # One row per path: transpose so the dictionary keys become the index,
    # then expose that index as a regular "PATH" column (the first column).
    attrs_by_path = pd.DataFrame(path_attrs).T
    data = attrs_by_path.rename_axis("PATH").reset_index()
    return dataframe_to_tree(
        data,
        sep=sep,
        duplicate_name_allowed=duplicate_name_allowed,
        node_type=node_type,
    )
|
|
|
|
|
|
def nested_dict_to_tree(
    node_attrs: Dict[str, Any],
    name_key: str = "name",
    child_key: str = "children",
    node_type: Type[Node] = Node,
) -> Node:
    """Construct tree from nested recursive dictionary.

    - ``key``: `name_key`, `child_key`, or any attributes key.
    - ``value`` of `name_key` (str): node name.
    - ``value`` of `child_key` (List[Dict[str, Any]]): list of dict containing `name_key` and `child_key` (recursive).

    Every other key of a dictionary is passed to `node_type` as a keyword argument.
    The input dictionaries are not modified.

    Examples:
        >>> from bigtree import nested_dict_to_tree
        >>> path_dict = {
        ...     "name": "a",
        ...     "age": 90,
        ...     "children": [
        ...         {"name": "b",
        ...          "age": 65,
        ...          "children": [
        ...              {"name": "d", "age": 40},
        ...              {"name": "e", "age": 35, "children": [
        ...                  {"name": "g", "age": 10},
        ...              ]},
        ...          ]},
        ...     ],
        ... }
        >>> root = nested_dict_to_tree(path_dict)
        >>> root.show(attr_list=["age"])
        a [age=90]
        └── b [age=65]
            ├── d [age=40]
            └── e [age=35]
                └── g [age=10]

    Args:
        node_attrs (Dict[str, Any]): dictionary containing node, children, and node attribute information,
            key: `name_key` and `child_key`
            value of `name_key` (str): node name
            value of `child_key` (List[Dict[str, Any]]): list of dict containing `name_key` and `child_key` (recursive)
        name_key (str): key of node name, value is type str
        child_key (str): key of child list, value is type list
        node_type (Type[Node]): node type of tree to be created, defaults to ``Node``

    Returns:
        (Node)

    Raises:
        ValueError: if `node_attrs` is empty
        TypeError: if the value stored under `child_key` is not a list
    """
    if not node_attrs:
        raise ValueError("Dictionary does not contain any data, check `node_attrs`")

    def _build_subtree(spec: Dict[str, Any], parent: Optional[Node] = None) -> Node:
        """Create the node described by `spec` under `parent`, then its descendants.

        Args:
            spec (Dict[str, Any]): description of the node to create
            parent (Node): node to attach the new node to, defaults to None for the root

        Returns:
            (Node)
        """
        # Shallow copy so popping the structural keys leaves the caller's dict intact
        attrs = spec.copy()
        name = attrs.pop(name_key)
        children = attrs.pop(child_key, [])
        if not isinstance(children, List):
            raise TypeError(
                f"child_key {child_key} should be List type, received {children}"
            )
        current = node_type(name, parent=parent, **attrs)
        for child_spec in children:
            _build_subtree(child_spec, parent=current)
        return current

    return _build_subtree(node_attrs)
|
|
|
|
|
|
def dataframe_to_tree(
    data: pd.DataFrame,
    path_col: str = "",
    attribute_cols: List[str] = [],
    sep: str = "/",
    duplicate_name_allowed: bool = True,
    node_type: Type[Node] = Node,
) -> Node:
    """Construct tree from pandas DataFrame using path, return root of tree.

    `path_col` and `attribute_cols` specify columns for node path and attributes to construct tree.
    If columns are not specified, `path_col` takes first column and all other columns are `attribute_cols`.

    Path in path column can start from root node `name`, or start with `sep`.

    - For example: Path string can be "/a/b" or "a/b", if sep is "/".

    Path in path column should contain ``Node`` name, separated by `sep`.

    - For example: Path string "a/b" refers to Node("b") with parent Node("a").

    All paths should start from the same root node.

    - For example: Path strings should be "a/b", "a/c", "a/b/d" etc. and should not start with another root node.

    Examples:
        >>> import pandas as pd
        >>> from bigtree import dataframe_to_tree
        >>> path_data = pd.DataFrame([
        ...     ["a", 90],
        ...     ["a/b", 65],
        ...     ["a/c", 60],
        ...     ["a/b/d", 40],
        ...     ["a/b/e", 35],
        ...     ["a/c/f", 38],
        ...     ["a/b/e/g", 10],
        ...     ["a/b/e/h", 6],
        ... ],
        ...     columns=["PATH", "age"]
        ... )
        >>> root = dataframe_to_tree(path_data)
        >>> root.show(attr_list=["age"])
        a [age=90]
        ├── b [age=65]
        │   ├── d [age=40]
        │   └── e [age=35]
        │       ├── g [age=10]
        │       └── h [age=6]
        └── c [age=60]
            └── f [age=38]

    Args:
        data (pd.DataFrame): data containing path and node attribute information
        path_col (str): column of data containing `path_name` information,
            if not set, it will take the first column of data
        attribute_cols (List[str]): columns of data containing node attribute information,
            if not set, it will take all columns of data except `path_col`
        sep (str): path separator of input `path_col` and created tree, defaults to `/`
        duplicate_name_allowed (bool): indicator if nodes with duplicate ``Node`` name is allowed, defaults to True
        node_type (Type[Node]): node type of tree to be created, defaults to ``Node``

    Returns:
        (Node)

    Raises:
        ValueError: if `data` has no columns or no rows, or if the same path appears with
            different attribute values
    """
    # Private copy: the path column is normalised in place further down
    data = data.copy()

    if len(data.columns) == 0:
        raise ValueError("Data does not contain any columns, check `data`")
    if len(data) == 0:
        raise ValueError("Data does not contain any rows, check `data`")

    if not path_col:
        path_col = data.columns[0]
    if len(attribute_cols) == 0:
        attribute_cols = list(data.columns)
        attribute_cols.remove(path_col)

    # Strip leading/trailing separators so "/a/b" and "a/b" are treated alike
    stripped_paths = data[path_col].str.lstrip(sep)
    data[path_col] = stripped_paths.str.rstrip(sep)

    # A path may be repeated only if all of its attribute values are identical
    unique_rows = data[[path_col] + attribute_cols].astype(str).drop_duplicates()
    path_counts = unique_rows[path_col].value_counts().to_frame("counts")
    _duplicate_check = path_counts.rename_axis(path_col).reset_index()
    _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1]
    if len(_duplicate_check):
        raise ValueError(
            f"There exists duplicate path with different attributes\nCheck {_duplicate_check}"
        )

    # The root is named after the first component of the first path
    first_path = data[path_col].values[0]
    root_name = first_path.split(sep)[0]

    # If the root has a row of its own, its attributes go to the constructor
    root_rows = data[data[path_col] == root_name]
    root_kwargs: Dict[str, Any] = {}
    if len(root_rows):
        records = root_rows[attribute_cols].to_dict(orient="index")
        root_kwargs = next(iter(records.values()))
        # A "name" attribute, if present, overrides the name taken from the path
        root_name = root_kwargs.pop("name", root_name)
    root_node = node_type(root_name, **root_kwargs)

    add_dataframe_to_tree_by_path(
        root_node,
        data,
        path_col=path_col,
        attribute_cols=attribute_cols,
        sep=sep,
        duplicate_name_allowed=duplicate_name_allowed,
    )
    root_node.sep = sep
    return root_node
|
|
|
|
|
|
def dataframe_to_tree_by_relation(
    data: pd.DataFrame,
    child_col: str = "",
    parent_col: str = "",
    attribute_cols: List[str] = [],
    allow_duplicates: bool = False,
    node_type: Type[Node] = Node,
) -> Node:
    """Construct tree from pandas DataFrame using parent and child names, return root of tree.

    Since tree is created from parent-child names, only names of leaf nodes may be repeated.
    Error will be thrown if names of intermediate nodes are repeated as there will be confusion.
    This error can be ignored by setting `allow_duplicates` to be True.

    `child_col` and `parent_col` specify columns for child name and parent name to construct tree.
    `attribute_cols` specify columns for node attribute for child name.
    If columns are not specified, `child_col` takes first column, `parent_col` takes second column, and all other
    columns are `attribute_cols`. Only the columns in `attribute_cols` are added as node attributes.

    The root is the child of the row whose parent is null; if there is no such row, it is the
    single parent name that never appears as a child.

    Examples:
        >>> import pandas as pd
        >>> from bigtree import dataframe_to_tree_by_relation
        >>> relation_data = pd.DataFrame([
        ...     ["a", None, 90],
        ...     ["b", "a", 65],
        ...     ["c", "a", 60],
        ...     ["d", "b", 40],
        ...     ["e", "b", 35],
        ...     ["f", "c", 38],
        ...     ["g", "e", 10],
        ...     ["h", "e", 6],
        ... ],
        ...     columns=["child", "parent", "age"]
        ... )
        >>> root = dataframe_to_tree_by_relation(relation_data)
        >>> root.show(attr_list=["age"])
        a [age=90]
        ├── b [age=65]
        │   ├── d [age=40]
        │   └── e [age=35]
        │       ├── g [age=10]
        │       └── h [age=6]
        └── c [age=60]
            └── f [age=38]

    Args:
        data (pd.DataFrame): data containing path and node attribute information
        child_col (str): column of data containing child name information,
            if not set, it will take the first column of data
        parent_col (str): column of data containing parent name information,
            if not set, it will take the second column of data
        attribute_cols (List[str]): columns of data containing node attribute information,
            if not set, it will take all columns of data except `child_col` and `parent_col`
        allow_duplicates (bool): allow duplicate intermediate nodes such that child node will
            be tagged to multiple parent nodes, defaults to False
        node_type (Type[Node]): node type of tree to be created, defaults to ``Node``

    Returns:
        (Node)

    Raises:
        ValueError: if `data` has no columns or no rows, if a repeated child name is also a parent
            (unless `allow_duplicates` is True), or if there is not exactly one candidate root
    """
    data = data.copy()

    if not len(data.columns):
        raise ValueError("Data does not contain any columns, check `data`")
    if not len(data):
        raise ValueError("Data does not contain any rows, check `data`")

    if not child_col:
        child_col = data.columns[0]
    if not parent_col:
        parent_col = data.columns[1]
    if not len(attribute_cols):
        # Rebinds to a fresh list, so the default argument is never mutated
        attribute_cols = list(data.columns)
        attribute_cols.remove(child_col)
        attribute_cols.remove(parent_col)

    data_check = data[[child_col, parent_col]].drop_duplicates()
    # Filter for child nodes that are parent of other nodes
    if not allow_duplicates:
        data_check = data_check[data_check[child_col].isin(data_check[parent_col])]
        _duplicate_check = (
            data_check[child_col]
            .value_counts()
            .to_frame("counts")
            .rename_axis(child_col)
            .reset_index()
        )
        _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1]
        if len(_duplicate_check):
            raise ValueError(
                "There exists duplicate child with different parent where the child is also a parent node.\n"
                "Duplicated node names should not happen, but can only exist in leaf nodes to avoid confusion.\n"
                f"Check {_duplicate_check}"
            )

    # If parent-child contains None -> root
    root_row = data[data[parent_col].isnull()]
    root_names = list(root_row[child_col])
    if not len(root_names):
        # Otherwise the root is a parent that is never listed as a child
        root_names = list(set(data[parent_col]) - set(data[child_col]))
    if len(root_names) != 1:
        raise ValueError(
            f"Unable to determine root node\nPossible root nodes: {root_names}"
        )
    root_name = root_names[0]
    root_node_data = data[data[child_col] == root_name]
    if len(root_node_data):
        root_node_kwargs = list(
            root_node_data[attribute_cols].to_dict(orient="index").values()
        )[0]
        # A "name" attribute column, if present, overrides the root name
        root_name = root_node_kwargs.pop("name", root_name)
        root_node = node_type(root_name, **root_node_kwargs)
    else:
        root_node = node_type(root_name)

    def _retrieve_attr(_row: Dict[str, Any]) -> Dict[str, Any]:
        """Build the node keyword arguments for one row of `data`.

        Only columns listed in `attribute_cols` are used, None values are dropped, and the
        node name is taken from the child column.

        Args:
            _row (Dict[str, Any]): one row of data, keyed by column name

        Returns:
            (Dict[str, Any])
        """
        _node_attrs = {
            col: _row[col]
            for col in attribute_cols
            if col != "name" and _row[col] is not None
        }
        # Set the name last and from the child column directly, so it is also correct
        # when the child column itself is called "name"
        node_name = _row[child_col]
        if node_name is not None:
            _node_attrs["name"] = node_name
        return _node_attrs

    def _recursive_add_child(parent_node: Node) -> None:
        """Recursive add child to tree, given current node.

        Args:
            parent_node (Node): parent node
        """
        child_rows = data[data[parent_col] == parent_node.node_name]

        for row in child_rows.to_dict(orient="index").values():
            child_node = node_type(**_retrieve_attr(row))
            child_node.parent = parent_node
            _recursive_add_child(child_node)

    # Create root node attributes
    if len(root_row):
        row = list(root_row.to_dict(orient="index").values())[0]
        root_node.set_attrs(_retrieve_attr(row))
    _recursive_add_child(root_node)
    return root_node
|
|
|
|
|
|
def newick_to_tree(
    tree_string: str,
    length_attr: str = "length",
    attr_prefix: str = "&&NHX:",
    node_type: Type[Node] = Node,
) -> Node:
    """Construct tree from Newick notation, return root of tree.

    In the Newick Notation (or New Hampshire Notation)

    - Tree is represented in round brackets i.e., `(child1,child2,child3)parent`.
    - If there are nested tree, they will be in nested round brackets i.e., `((grandchild1)child1,(grandchild2,grandchild3)child2)parent`.
    - If there is length attribute, they will be beside the name i.e., `(child1:0.5,child2:0.1)parent`.
    - If there are other attributes, attributes are represented in square brackets i.e., `(child1:0.5[S:human],child2:0.1[S:human])parent[S:parent]`.

    Variations supported

    - Support special characters (`[`, `]`, `(`, `)`, `:`, `,`) in node name, attribute name, and attribute values if
        they are enclosed in single quotes i.e., '(name:!)'.
    - If there are no node names, it will be auto-filled with convention `nodeN` with N representing a number.

    Examples:
        >>> from bigtree import newick_to_tree
        >>> root = newick_to_tree("((d,e)b,c)a")
        >>> root.show()
        a
        ├── b
        │   ├── d
        │   └── e
        └── c

        >>> root = newick_to_tree("((d:40,e:35)b:65,c:60)a", length_attr="age")
        >>> root.show(attr_list=["age"])
        a
        ├── b [age=65]
        │   ├── d [age=40]
        │   └── e [age=35]
        └── c [age=60]

        >>> root = newick_to_tree(
        ...     "((d:40[&&NHX:species=human],e:35[&&NHX:species=human])b:65[&&NHX:species=human],c:60[&&NHX:species=human])a[&&NHX:species=human]",
        ...     length_attr="age",
        ... )
        >>> root.show(all_attrs=True)
        a [species=human]
        ├── b [age=65, species=human]
        │   ├── d [age=40, species=human]
        │   └── e [age=35, species=human]
        └── c [age=60, species=human]

    Args:
        tree_string (str): Newick notation to construct tree
        length_attr (str): attribute name to store node length, optional, defaults to 'length'
        attr_prefix (str): prefix before all attributes, within square bracket, used to detect attributes, defaults to "&&NHX:"
        node_type (Type[Node]): node type of tree to be created, defaults to ``Node``

    Returns:
        (Node)

    Raises:
        ValueError: if `tree_string` is empty, if a structural character appears in a parser state
            that does not allow it, if a quote is not closed, if brackets are unbalanced,
            or if a length value cannot be converted to a number
    """
    if not len(tree_string):
        raise ValueError("Tree string does not contain any data, check `tree_string`")

    # Store results (for tracking)
    # Completed nodes are parked by depth until the node one level up is created and adopts them
    depth_nodes: Dict[int, List[Node]] = defaultdict(list)
    unlabelled_node_counter: int = 0
    current_depth: int = 1
    tree_string_idx: int = 0

    # Store states (for assertions and checks)
    current_state: NewickState = NewickState.PARSE_STRING
    current_node: Optional[Node] = None
    cumulative_string: str = ""  # node name, length, or attribute name being read
    cumulative_string_value: str = ""  # attribute value being read

    def _create_node(
        _new_node: Optional[Node],
        _cumulative_string: str,
        _unlabelled_node_counter: int,
        _depth_nodes: Dict[int, List[Node]],
        _current_depth: int,
    ) -> Tuple[Node, int]:
        """Create node at checkpoint.

        Args:
            _new_node (Optional[Node]): existing node (to add length attribute), or nothing (to create a node)
            _cumulative_string (str): cumulative string, contains either node name or length attribute
            _unlabelled_node_counter (int): number of unlabelled nodes, updates and returns counter
            _depth_nodes (Dict[int, List[Node]]): list of nodes at each depth
            _current_depth (int): depth of current node or node to be created

        Returns:
            (Tuple[Node, int])
        """
        if not _new_node:
            # No node yet: the accumulated text is the node name (auto-named if empty)
            if not _cumulative_string:
                _cumulative_string = f"node{_unlabelled_node_counter}"
                _unlabelled_node_counter += 1
            _new_node = node_type(_cumulative_string)
            _depth_nodes[_current_depth].append(_new_node)
        elif _cumulative_string:
            # Node already exists: the accumulated text is its length
            _new_node.set_attrs(
                {
                    length_attr: (
                        int(_cumulative_string)
                        if _cumulative_string.isdigit()
                        else float(_cumulative_string)
                    )
                }
            )

        # Adopt the nodes parked one level deeper, then clear that level for the next sibling group
        if len(_depth_nodes[_current_depth + 1]):
            _new_node.children = _depth_nodes[_current_depth + 1]  # type: ignore
            del _depth_nodes[_current_depth + 1]
        return _new_node, _unlabelled_node_counter

    def _raise_value_error(tree_idx: int) -> None:
        """Raise value error.

        Args:
            tree_idx (int): index in `tree_string` reported in the error message

        Raises:
            ValueError
        """
        raise ValueError(
            f"String not properly closed, check `tree_string` at index {tree_idx}"
        )

    while tree_string_idx < len(tree_string):
        character = tree_string[tree_string_idx]
        if character == NewickCharacter.OPEN_BRACKET:
            # Check and/or change state
            state_title = "Node creation start"
            if current_state not in [NewickState.PARSE_STRING]:
                _raise_value_error(tree_string_idx)
            # Logic
            current_depth += 1
            if current_node:
                _raise_value_error(tree_string_idx)
            if cumulative_string:
                _raise_value_error(tree_string_idx)
            assert (
                not cumulative_string_value
            ), f"{state_title}, should not have cumulative_string_value"
            tree_string_idx += 1
            continue

        if character in [
            NewickCharacter.CLOSE_BRACKET,
            NewickCharacter.ATTR_START,
            NewickCharacter.NODE_SEP,
        ]:
            # Check and/or change state
            state_title = "Node creation end / Node attribute start"
            if current_state not in [
                NewickState.PARSE_STRING,
                NewickState.PARSE_ATTRIBUTE_NAME,
            ]:
                _raise_value_error(tree_string_idx)
            # Logic
            if character == NewickCharacter.ATTR_START:
                current_state = NewickState.PARSE_ATTRIBUTE_NAME
                # Skip over the attribute prefix (if present) so it is not read as part of the name
                if tree_string[tree_string_idx + 1 :].startswith(  # noqa: E203
                    attr_prefix
                ):
                    tree_string_idx += len(attr_prefix)
            current_node, unlabelled_node_counter = _create_node(
                current_node,
                cumulative_string,
                unlabelled_node_counter,
                depth_nodes,
                current_depth,
            )
            if character == NewickCharacter.CLOSE_BRACKET:
                current_depth -= 1
                current_node = None
            if character == NewickCharacter.NODE_SEP:
                current_node = None
            cumulative_string = ""
            assert (
                not cumulative_string_value
            ), f"{state_title}, should not have cumulative_string_value"
            tree_string_idx += 1
            continue

        if character == NewickCharacter.ATTR_END:
            # Check and/or change state
            state_title = "Node attribute end"
            if current_state not in [NewickState.PARSE_ATTRIBUTE_VALUE]:
                _raise_value_error(tree_string_idx)
            current_state = NewickState.PARSE_STRING
            # Logic
            assert current_node, f"{state_title}, should have current_node"
            current_node.set_attrs({cumulative_string: cumulative_string_value})
            cumulative_string = ""
            cumulative_string_value = ""
            tree_string_idx += 1
            continue

        if character == NewickCharacter.ATTR_KEY_VALUE:
            # Check and/or change state
            state_title = "Node attribute creation"
            if current_state not in [NewickState.PARSE_ATTRIBUTE_NAME]:
                _raise_value_error(tree_string_idx)
            current_state = NewickState.PARSE_ATTRIBUTE_VALUE
            # Logic
            assert current_node, f"{state_title}, should have current_node"
            if not cumulative_string:
                _raise_value_error(tree_string_idx)
            assert (
                not cumulative_string_value
            ), f"{state_title}, should not have cumulative_string_value"
            tree_string_idx += 1
            continue

        if character == NewickCharacter.ATTR_QUOTE:
            # Logic
            # Quoted text is taken verbatim up to the next quote, so special characters inside are not parsed
            quote_end_idx = tree_string.find(
                NewickCharacter.ATTR_QUOTE, tree_string_idx + 1
            )
            if quote_end_idx == -1:
                _raise_value_error(tree_string_idx)
            if current_state in [
                NewickState.PARSE_STRING,
                NewickState.PARSE_ATTRIBUTE_NAME,
            ]:
                # Quoted text must be the whole name, not a continuation of unquoted text
                if cumulative_string:
                    _raise_value_error(tree_string_idx)
                cumulative_string = tree_string[
                    tree_string_idx + 1 : quote_end_idx  # noqa: E203
                ]
            else:
                if cumulative_string_value:
                    _raise_value_error(tree_string_idx)
                cumulative_string_value = tree_string[
                    tree_string_idx + 1 : quote_end_idx  # noqa: E203
                ]
            # Resume parsing right after the closing quote
            tree_string_idx = quote_end_idx + 1
            continue

        if character == NewickCharacter.SEP:
            # Check and/or change state
            state_title = "Node length creation / Node attribute creation"
            if current_state not in [
                NewickState.PARSE_STRING,
                NewickState.PARSE_ATTRIBUTE_VALUE,
            ]:
                _raise_value_error(tree_string_idx)
            # Logic
            if current_state == NewickState.PARSE_STRING:
                # Separator after a node name: create the node now, the length follows
                if current_node:
                    _raise_value_error(tree_string_idx)
                current_node, unlabelled_node_counter = _create_node(
                    current_node,
                    cumulative_string,
                    unlabelled_node_counter,
                    depth_nodes,
                    current_depth,
                )
                cumulative_string = ""
                assert (
                    not cumulative_string_value
                ), f"{state_title}, should not have cumulative_string_value"
                tree_string_idx += 1
                continue
            else:
                # Separator after an attribute value: store the pair and read the next attribute name
                current_state = NewickState.PARSE_ATTRIBUTE_NAME
                assert current_node, f"{state_title}, should have current_node"
                current_node.set_attrs({cumulative_string: cumulative_string_value})
                cumulative_string = ""
                cumulative_string_value = ""
                tree_string_idx += 1
                continue

        # Any other character is accumulated into the name/length/attribute name, or the attribute value
        if current_state == NewickState.PARSE_ATTRIBUTE_VALUE:
            cumulative_string_value += character
        else:
            cumulative_string += character
        tree_string_idx += 1

    # Every opening bracket must have been closed by the end of the string
    if current_depth != 1:
        _raise_value_error(tree_string_idx)

    # Final root node
    # If the root was already created (e.g., it had a length or attributes), reuse it
    if len(depth_nodes[current_depth]):
        current_node = depth_nodes[current_depth][0]
    current_node, unlabelled_node_counter = _create_node(
        current_node,
        cumulative_string,
        unlabelled_node_counter,
        depth_nodes,
        current_depth,
    )
    return current_node
|