Implemented a few new features such as create_group_hierarchy of any depth, get_parent_child_relationships, and display_group_hierarchy_on_treemap. Additionally, unified the read_hdf5_as_dataframe variants into a single function and documented it better.

This commit is contained in:
2023-10-31 14:30:34 +01:00
parent 87b256d93e
commit cc52fafc44

View File

@ -1,73 +1,34 @@
import pandas as pd import pandas as pd
import h5py import h5py
import os import os
import sys #import sys
#from itertools import product
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import plotly.express as px
def is_wrapped(value): import plotly.graph_objects as go
"""returns True if value is contained in a 1 by 1 array, or False otherwise.""" from plotly.subplots import make_subplots
if not isinstance(value,np.ndarray):
return False
elif sum(value.shape)==2:
return True
else:
return False
def read_hdf5_as_dataframe(filename): def read_hdf5_as_dataframe(filename):
with h5py.File(filename,'r') as file: """ Reconstruct a Matlab Table encoded in a .h5 file as a Pandas DataFrame. The input h5. file
contains as many groups as rows in the Matlab Table, and each group stores dataset-like variables in the Table as
# Define group's attributes and datasets. This should hold Datasets while categorical and numerical variables in the table are represented as attributes of each group.
# for all groups. TODO: implement verification and noncompliance error if needed.
group_list = list(file.keys()) Note:DataFrame is constructed columnwise to ensure homogenous data columns.
group_attrs = list(file[group_list[0]].attrs.keys())
# Parameters:
column_attr_names = [item[item.find('_')+1::] for item in group_attrs]
column_attr_names_idx = [int(item[4:(item.find('_'))]) for item in group_attrs] filename (str): .h5 file's name. It may include location-path information.
Returns:
output_dataframe (pd.DataFrame): Matlab's Table as a Pandas DataFrame
"""
group_datasets = list(file[group_list[0]].keys()) #contructs dataframe by filling out entries columnwise. This way we can ensure homogenous data columns"""
#
column_dataset_names = [file[group_list[0]][item].attrs['column_name'] for item in group_datasets]
column_dataset_names_idx = [int(item[2:]) for item in group_datasets]
# Define data_frame as group_attrs + group_datasets
#pd_series_index = group_attrs + group_datasets
pd_series_index = column_attr_names + column_dataset_names
output_dataframe = pd.DataFrame(columns=pd_series_index,index=group_list)
for group_key in group_list:
# Print group_name
#print(group_key)
tmp_row = []
for attr_key in group_attrs:
#print(type(file[group_key].attrs[attr_key]))
df_entry = file[group_key].attrs[attr_key][()]
tmp_row.append(df_entry)
for ds_key in group_datasets:
# Check dataset's type by uncommenting the line below
# print(type(file[group_key][ds_key][()]))
# Append to list the value of the file at dataset /group/ds
#tmp_row.append(file[group_key][ds_key][()])
#tmp_row.append(file[group_key+'/'+ds_key][()])
tmp_row.append(file[group_key+'/'+ds_key][()])
# Create pandas Series/measurement
row = pd.Series(data=tmp_row,index=pd_series_index, name = group_key)
output_dataframe.loc[group_key,:] = row
return output_dataframe
def read_hdf5_as_dataframe_v2(filename):
"""contructs dataframe by filling out entries columnwise. This way we can ensure homogenous data columns"""
with h5py.File(filename,'r') as file: with h5py.File(filename,'r') as file:
@ -96,10 +57,13 @@ def read_hdf5_as_dataframe_v2(filename):
for meas_prop in group_attrs + group_datasets: for meas_prop in group_attrs + group_datasets:
if meas_prop in group_attrs: if meas_prop in group_attrs:
column_label = meas_prop[meas_prop.find('_')+1:] column_label = meas_prop[meas_prop.find('_')+1:]
# Create numerical or categorical column from group's attributes
tmp_col = [file[group_key].attrs[meas_prop][()][0] for group_key in group_list] tmp_col = [file[group_key].attrs[meas_prop][()][0] for group_key in group_list]
else: else:
# Create dataset column from group's datasets
column_label = file[group_list[0] + '/' + meas_prop].attrs['column_name'] column_label = file[group_list[0] + '/' + meas_prop].attrs['column_name']
tmp_col = [file[group_key + '/' + meas_prop][()][0] for group_key in group_list] #tmp_col = [file[group_key + '/' + meas_prop][()][0] for group_key in group_list]
tmp_col = [file[group_key + '/' + meas_prop][()] for group_key in group_list]
output_dataframe.loc[:,column_label] = tmp_col output_dataframe.loc[:,column_label] = tmp_col
@ -114,9 +78,7 @@ def is_nested_hierarchy(df) -> bool:
"""receives a dataframe with categorical columns and checks whether rows form a nested group hierarchy. """receives a dataframe with categorical columns and checks whether rows form a nested group hierarchy.
That is, from bottom to top, subsequent hierarchical levels contain nested groups. The lower level groups belong to exactly one group in the higher level group. That is, from bottom to top, subsequent hierarchical levels contain nested groups. The lower level groups belong to exactly one group in the higher level group.
""" """
# TODO: generalize the code to check for deeper group hierachies. # TODO: generalize the code to check for deeper group hierachies.
def are_nested(df, col, col_nxt): def are_nested(df, col, col_nxt):
""" Checks whether low level LL groups can be separated in terms of high level HL groups. """ Checks whether low level LL groups can be separated in terms of high level HL groups.
That is, elements of low-level groups do not belong to more than one HL group.""" That is, elements of low-level groups do not belong to more than one HL group."""
@ -150,20 +112,10 @@ def get_attr_names(input_data):
raise ValueError("input_data must be a pd.DataFrame") raise ValueError("input_data must be a pd.DataFrame")
return input_data.columns return input_data.columns
def create_group_hierarchy(obj, df, columns):
from itertools import product """
def set_group_hierarchy(file: h5py.File, df):
args = [df[col].unique().tolist() for col in df.columns]
group_paths = ['/'+'/'.join(item) for item in list(product(*args))]
return group_paths
def create_group_hierarchy(obj, columns, df):
"""
Input: Input:
obj (h5py.File or h5py.Group) obj (h5py.File or h5py.Group)
columns (list of strs): denote categorical columns in df to be used to define hdf5 file group hierarchy columns (list of strs): denote categorical columns in df to be used to define hdf5 file group hierarchy
@ -179,22 +131,104 @@ def create_group_hierarchy(obj, columns, df):
for group_name in unique_values: for group_name in unique_values:
group = obj.require_group(group_name) group = obj.require_group(group_name)
group.attrs.create('column_name', columns[0])
sub_df = df[df[columns[0]]==group_name] # same as df.loc[df[columns[0]]==group_name,:] sub_df = df[df[columns[0]]==group_name] # same as df.loc[df[columns[0]]==group_name,:]
group.attrs.create('count',sub_df.shape[0])
#if group_name == 'MgO powder,H2O,HCl': #if group_name == 'MgO powder,H2O,HCl':
# print('Here:',sub_df.shape) # print('Here:',sub_df.shape)
create_group_hierarchy(group, columns[1::], sub_df) create_group_hierarchy(group, sub_df, columns[1::])
def get_parent_child_relationships(file: h5py.File):
    """Collect the parent/child structure of every group in an HDF5 file.

    Walks the whole file and records, for each group, its absolute path,
    its parent's absolute path, and its 'count' attribute (the number of
    rows assigned to that group by create_group_hierarchy).

    Returns:
        tuple of three parallel lists: (group paths, parent paths, counts),
        suitable for feeding a treemap trace.
    """
    group_paths = []
    parent_paths = []
    counts = []

    def collect(name, item):
        # Only groups form the hierarchy; datasets are ignored.
        if isinstance(item, h5py.Group):
            group_paths.append(item.name)
            parent_paths.append(item.parent.name)
            counts.append(item.attrs['count'])

    file.visititems(collect)
    return group_paths, parent_paths, counts
def get_groups_at_a_level(file: "h5py.File", level: int):
    """Collect the absolute paths of all groups at a given depth of the file.

    Parameters:
        file (h5py.File): open HDF5 file handle (any object exposing
            visititems works).
        level (int): hierarchy depth. visititems passes names relative to
            the root, so a node's depth equals the number of '/' separators
            in its relative name; the root's direct children are level 0.
            (Fixed: the annotation previously said `str`, but the value is
            compared against an int count.)

    Returns:
        list of str: absolute paths of the groups found at that depth.
    """
    groups = []

    def node_selector(name, obj):
        # Depth below the root == number of '/' in the relative name.
        if name.count('/') == level:
            groups.append(obj.name)

    file.visititems(node_selector)
    return groups
def format_group_names(names: list):
    """Strip the parent path from each absolute group name.

    Names longer than one character keep only the text after their last
    '/'; single-character names (i.e. the root '/') are kept unchanged.

    Parameters:
        names (list of str): absolute HDF5 group paths.

    Returns:
        pd.DataFrame: one column 'formated_names' with the short names,
        indexed by the original full paths.
    """
    short_names = [
        name if len(name) <= 1 else name.rsplit('/', 1)[-1]
        for name in names
    ]
    return pd.DataFrame(short_names, columns=['formated_names'], index=names)
def display_group_hierarchy_on_treemap(filename: str):
    """Render the group hierarchy of an HDF5 file as a Plotly treemap.

    Each rectangle is one HDF5 group, labelled with its absolute path and
    sized by the group's 'count' attribute (see
    get_parent_child_relationships). Opens the figure in the default Plotly
    renderer; returns nothing.
    """
    with h5py.File(filename,'r') as file:
        # Parallel lists: group paths, their parents' paths, and row counts.
        nodes, parents, values = get_parent_child_relationships(file)
        #formating_df = format_group_names(nodes + ["/"])
        # Treemap traces require a subplot cell of 'domain' type.
        fig = make_subplots(1, 1, specs=[[{"type": "domain"}]],)
        fig.add_trace(go.Treemap(
            labels=nodes, #formating_df['formated_names'][nodes],
            parents=parents,#formating_df['formated_names'][parents],
            values=values,
            # 'total': a parent's area equals its own 'count', which already
            # includes the counts of its children.
            branchvalues='total',
            customdata= pd.Series(nodes),
            #marker=dict(
            #    colors=df_all_trees['color'],
            #    colorscale='RdBu',
            #    cmid=average_score),
            #hovertemplate='<b>%{label} </b> <br> Number of files: %{value}<br> Success rate: %{color:.2f}',
            hovertemplate='<b>%{label} </b> <br> Count: %{value} <br> Path: %{customdata}',
            name=''
            ))
        fig.update_layout(width = 800, height= 600, margin = dict(t=50, l=25, r=25, b=25))
        fig.show()
def create_hdf5_file(filename, input_data, approach : str, group_by_funcs : list, extract_attrs_func = None): def create_hdf5_file(ofilename, input_data, approach : str, group_by_funcs : list, extract_attrs_func = None):
""" Creates an hdf5 file with at most three group levels, bottom, middle, and top level groups, where the top level group is the root '/' group. """ Creates an hdf5 file with as many levels as indicated by len(group_by_funcs).
Top level denotes the root group/directory and bottom level denotes measurement level groups.
Parameters:
input_data (pd.DataFrame | file-system path) : input_data (pd.DataFrame | file-system path) :
group_by_funcs (list of callables or strs) : returns a pd.Series, from input_data elements to group labels. input data elements with same label belong to the same group. group_by_funcs (list of callables or strs) : contains a list of callables or dataframe's column names that will be used
to partition or group files from top to bottom.
Callables in the list must assign a categorical value to each file in a file list, internally represented as a DataFrame,
and they thus return a pd.Series of categorical values.
On the other hand, strings in the list refer to the name of categorical columns in the input_data (when this is a DataFrame)
Returns:
""" """
# Check whether input_data is a valid file system path or a dataframe # Check whether input_data is a valid file-system path or a DataFrame
check_possible_path = lambda x : os.path.exists(input_data) if isinstance(input_data,str) else False check_possible_path = lambda x : os.path.exists(input_data) if isinstance(input_data,str) else False
if check_possible_path(input_data): if check_possible_path(input_data):
@ -229,9 +263,11 @@ def create_hdf5_file(filename, input_data, approach : str, group_by_funcs : list
# # TODO: extend to more than 2 callable elements. # # TODO: extend to more than 2 callable elements.
# raise ValueError("group_by_funcs can only contain at most two grouping elements.") # raise ValueError("group_by_funcs can only contain at most two grouping elements.")
with h5py.File(filename, 'w') as f: with h5py.File(ofilename, 'w') as file:
create_group_hierarchy(f, grouping_cols, df) create_group_hierarchy(file, df, grouping_cols)
file.attrs.create(name='depth', data=len(grouping_cols)-1)
#join_path = lambda x,y: '/' + x + '/' + y #join_path = lambda x,y: '/' + x + '/' + y
#for group_name in df[grouping_cols[0]].unique(): #for group_name in df[grouping_cols[0]].unique():
@ -241,9 +277,16 @@ def create_hdf5_file(filename, input_data, approach : str, group_by_funcs : list
# # Explicitly, grp = f.create_group(group_name), subgrp = grp.create_group(subgroup_name) # # Explicitly, grp = f.create_group(group_name), subgrp = grp.create_group(subgroup_name)
# print(join_path(group_name,subgroup_name)) # print(join_path(group_name,subgroup_name))
# f.create_group(join_path(group_name,subgroup_name)) # f.create_group(join_path(group_name,subgroup_name))
# Get groups at the bottom of the hierarchy
bottom_level_groups = get_groups_at_a_level(file, file.attrs['depth'])
nodes, parents, values = get_parent_child_relationships(file)
print(':)') print(':)')
#fig = px.treemap(values=values,names=nodes, parents= parents)
#fig.update_traces(root_color="lightgrey")
#fig.update_layout(width = 800, height=600, margin = dict(t=50, l=25, r=25, b=25))
#fig.show()
else: else:
raise ValueError("'approach' must take values in ['top-down','bottom-up']") raise ValueError("'approach' must take values in ['top-down','bottom-up']")
@ -256,13 +299,18 @@ def create_hdf5_file(filename, input_data, approach : str, group_by_funcs : list
# #
# Add datasets to groups and the groups and the group's attributes # Add datasets to groups and the groups and the group's attributes
return 0
def augment_with_filetype(df):
    """Add a 'filetype' column to df: each filename's extension without the
    leading dot ('' when the name has no extension). Mutates df in place
    and returns it."""
    extensions = [os.path.splitext(fname)[1][1:] for fname in df['filename']]
    df['filetype'] = extensions
    return df
def augment_with_filenumber(df):
    """Add a 'filenumber' column to df: the filename prefix up to (not
    including) the first '_'. Mutates df in place and returns it."""
    numbers = [fname[:fname.find('_')] for fname in df['filename']]
    df['filenumber'] = numbers
    return df
def group_by_df_column(df, column_name: str):
    """Return the column `column_name` of df to be used as grouping labels:
    rows sharing a value belong to the same group.

    Parameters:
        df (pd.DataFrame): table of files/measurements.
        column_name (str): name of an existing column of df.

    Returns:
        pd.Series: the requested column.

    Raises:
        ValueError: if column_name is not a column of df.
    """
    # Idiomatic membership test (`not in` instead of `not x in y`).
    if column_name not in df.columns:
        raise ValueError("column_name must be in the columns of df.")
    return df[column_name]
def main(): def main():
# input data frame # input data frame
input_data = read_hdf5_as_dataframe_v2('input_files\\BeamTimeMetaData.h5') input_data = read_hdf5_as_dataframe('input_files\\BeamTimeMetaData.h5')
# Rename column 'name' with 'filename'. get_filetype finds filetypes based on extension of filenames assumed to be located at the column 'filename'. # Rename column 'name' with 'filename'. get_filetype finds filetypes based on extension of filenames assumed to be located at the column 'filename'.
input_data = input_data.rename(columns = {'name':'filename'}) input_data = input_data.rename(columns = {'name':'filename'})
# Add column with filetypes to input_data # Add column with filetypes to input_data
input_data['filetype'] = get_filetype(input_data) input_data = augment_with_filenumber(input_data)
input_data = augment_with_filetype(input_data)
#input_data['filetype'] = get_filetype(input_data)
print(input_data['filetype'].unique()) print(input_data['filetype'].unique())
# Reduce input_data to files of ibw type # Reduce input_data to files of ibw type
input_data = input_data.loc[input_data['filetype']=='ibw', : ] input_data = input_data.loc[input_data['filetype']=='ibw', : ]
@ -296,8 +346,7 @@ def main():
print(item) print(item)
sample_name.append(item[0:item.find('(')]) sample_name.append(item[0:item.find('(')])
sample_quality.append(item[item.find('(')+1:len(item)-1]) sample_quality.append(item[item.find('(')+1:len(item)-1])
else: else:
if item=='': if item=='':
sample_name.append('Not yet annotated') sample_name.append('Not yet annotated')
sample_quality.append('unevaluated') sample_quality.append('unevaluated')
@ -307,21 +356,28 @@ def main():
input_data['sample'] = sample_name input_data['sample'] = sample_name
input_data['data_quality'] = sample_quality input_data['data_quality'] = sample_quality
#input_data = input_data.loc[input_data['sample']!='' , : ]
#group_by_func = lambda df: [item[0:item.find('_')] for item in df['name']]
#group_by_func
group_by_sample = lambda x : group_by_df_column(x,'sample') group_by_sample = lambda x : group_by_df_column(x,'sample')
group_by_type = lambda x : group_by_df_column(x,'filetype') group_by_type = lambda x : group_by_df_column(x,'filetype')
group_by_filenumber = lambda x : group_by_df_column(x,'filenumber')
#fig = px.treemap(values=[10,4,3,3,2],names=[1,2,3,4,5], parents=[None,1,1,1,2],hover_name=['si senhor',':)',':)',':)','bottom'])
#fig = px.treemap(input_data,path=[px.Constant("BeamtimeMetadata.h5"),'sample','filenumber'])
#fig.update_traces(root_color = "lightgrey")
#fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
#fig.show()
success = create_hdf5_file('test.h5',input_data, 'top-down', group_by_funcs = [group_by_sample, group_by_filenumber])
display_group_hierarchy_on_treemap('test.h5')
print(':)')
df = create_hdf5_file('test.h5',input_data, 'top-down', group_by_funcs = [group_by_sample, group_by_filenumber,group_by_type]) #success = create_hdf5_file('test_v2.h5',input_data, 'top-down', group_by_funcs = ['sample','filenumber','filetype'])
df['file_group'] #df['file_group']
print(df.head()) #print(df.head())