Updated create_hdf5_file.py, enabling either bottom-up or top-down construction of group hierarchies. So far, the implementation works for top-down construction, where group hierarchies can be of any depth and are defined by categorical columns.

This commit is contained in:
2023-10-26 14:44:53 +02:00
parent d95a08caaa
commit 87b256d93e

View File

@ -4,6 +4,8 @@ import os
import sys
import numpy as np
import matplotlib.pyplot as plt
def is_wrapped(value):
"""returns True if value is contained in a 1 by 1 array, or False otherwise."""
if not isinstance(value,np.ndarray):
@ -104,36 +106,95 @@ def read_hdf5_as_dataframe_v2(filename):
return output_dataframe
def is_group_hierarchy_valid(df) -> bool:
def is_callable_list(x : list):
    """Return True if every element of ``x`` is callable (True for an empty list)."""
    # all() accepts a generator directly; no intermediate list needed.
    return all(callable(item) for item in x)
def is_str_list(x : list):
    """Return True if every element of ``x`` is a str (True for an empty list)."""
    # all() accepts a generator directly; no intermediate list needed.
    return all(isinstance(item, str) for item in x)
def is_nested_hierarchy(df) -> bool:
    """Check whether the categorical columns of ``df`` define a nested group hierarchy.

    Columns are interpreted left to right as increasingly high hierarchy levels.
    The hierarchy is nested when every group at one level belongs to exactly one
    group at the next level up.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe containing only the categorical grouping columns, ordered
        from the lowest hierarchy level (first column) to the highest (last).

    Returns
    -------
    bool
        True if every pair of consecutive columns forms nested groups.
    """
    def are_nested(df, col, col_nxt):
        """Return True if each group in ``col`` maps to a single group in ``col_nxt``.

        After integer relabeling, a low-level group whose members span more than
        one high-level group has a fractional mean membership, which cannot equal
        any of the integer high-level labels.
        """
        memberships = df[col_nxt].unique().tolist()
        # Average high-level label per low-level group; a non-integer average
        # reveals membership in more than one high-level group.
        col_avg_memberships = df.groupby(col).mean()[col_nxt].unique()
        return all(avg in memberships for avg in col_avg_memberships)

    df_tmp = df.copy()
    # Relabel each column's categories as consecutive integers so that group
    # membership can be tested numerically via group means.
    for column_name in df_tmp.columns:
        category_index = pd.Series(np.arange(len(df_tmp[column_name].unique())),
                                   index=df_tmp[column_name].unique())
        df_tmp[column_name] = category_index[df_tmp[column_name].tolist()].tolist()
    # Use positional column access instead of hard-coded 'level_i_groups' names,
    # so the check also works for user-supplied grouping columns.
    return all(are_nested(df_tmp, df_tmp.columns[i], df_tmp.columns[i + 1])
               for i in range(len(df_tmp.columns) - 1))
def create_hdf5_file(input_data, group_by_funcs : list, extract_attrs_func = None):
def get_attr_names(input_data):
    """Return the attribute (column) names of ``input_data``.

    Raises
    ------
    ValueError
        If ``input_data`` is not a pandas DataFrame.
    """
    # TODO: extend this to file-system paths
    if isinstance(input_data, pd.DataFrame):
        return input_data.columns
    raise ValueError("input_data must be a pd.DataFrame")
from itertools import product
def set_group_hierarchy(file: h5py.File, df):
    """Build all candidate HDF5 group paths from the categorical columns of ``df``.

    Paths are the cartesian product of each column's unique values, joined with
    '/'. Note: ``file`` is currently unused; paths are derived from ``df`` alone.
    """
    unique_per_column = [df[column].unique().tolist() for column in df.columns]
    return ['/' + '/'.join(combo) for combo in product(*unique_per_column)]
def create_group_hierarchy(obj, columns, df):
    """Recursively create nested HDF5 groups under ``obj``.

    Parameters
    ----------
    obj : h5py.File or h5py.Group
        Parent object under which the groups are created.
    columns : list of str
        Categorical columns of ``df`` defining the hierarchy, from the topmost
        level (first element) downwards.
    df : pd.DataFrame
        Data whose rows are partitioned into the groups.
    """
    # Base case: no more hierarchy levels to create.
    if not columns:
        return
    # One subgroup per category of the current (topmost remaining) column.
    # Assumes the column's values are strings usable as group names — TODO confirm.
    unique_values = df[columns[0]].unique()
    for group_name in unique_values:
        # require_group is idempotent: it reuses the group if it already exists.
        group = obj.require_group(group_name)
        # Restrict to the rows belonging to this group before recursing.
        sub_df = df[df[columns[0]] == group_name]
        create_group_hierarchy(group, columns[1:], sub_df)
def create_hdf5_file(filename, input_data, approach : str, group_by_funcs : list, extract_attrs_func = None):
""" Creates an hdf5 file with at most three group levels, bottom, middle, and top level groups, where the top level group is the root '/' group.
input_data (pd.DataFrame | file-system path) :
group_by_funcs (list of callables) : returns a pd.Series, from input_data elements to group labels. input data elements with same label belong to the same group.
group_by_funcs (list of callables or strs) : returns a pd.Series, from input_data elements to group labels. input data elements with same label belong to the same group.
"""
if not all([callable(func) for func in group_by_funcs]):
raise ValueError("'group_by_funcs' must be a list of callables (or functions) that takes input_data as input an returns valid categorical output.")
# Check whether input_data is a valid file system path or a dataframe
check_possible_path = lambda x : os.path.exists(input_data) if isinstance(input_data,str) else False
if check_possible_path(input_data):
@ -144,24 +205,51 @@ def create_hdf5_file(input_data, group_by_funcs : list, extract_attrs_func = Non
else:
raise ValueError("input_data must be either a valid file-system path or a dataframe.")
list_of_group_cols = []
for i, func in enumerate(group_by_funcs):
list_of_group_cols.append('level_'+str(i)+'_groups')
df['level_'+str(i)+'_groups'] = func(df)
# Check the length of group_by_funcs list is at most 2
if len(group_by_funcs) > 2:
# TODO: extend to more than 2 callable elements.
raise ValueError("group_by_funcs can only contain at most two callable elements.")
if not is_group_hierarchy_valid(df.loc[:,list_of_group_cols]):
raise ValueError("group_by_funcs do not define a valid group hierarchy. Please reprocess the input_data or choose different grouping functions.")
#
if is_callable_list(group_by_funcs):
grouping_cols = []
for i, func in enumerate(group_by_funcs):
grouping_cols.append('level_'+str(i)+'_groups')
df['level_'+str(i)+'_groups'] = func(df)
elif is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
grouping_cols = group_by_funcs
else:
raise ValueError("'group_by_funcs' must be a list of callables (or str) that takes input_data as input an returns a valid categorical output.")
if approach == 'botton-up':
# TODO: implement botton-up approach
if is_nested_hierarchy(df.loc[:,grouping_cols]):
print('Do something')
else:
raise ValueError("group_by_funcs do not define a valid group hierarchy. Please reprocess the input_data or choose different grouping functions.")
elif approach == 'top-down':
# Check the length of group_by_funcs list is at most 2
#if len(group_by_funcs) > 2:
# # TODO: extend to more than 2 callable elements.
# raise ValueError("group_by_funcs can only contain at most two grouping elements.")
with h5py.File(filename, 'w') as f:
create_group_hierarchy(f, grouping_cols, df)
#join_path = lambda x,y: '/' + x + '/' + y
#for group_name in df[grouping_cols[0]].unique():
# group_filter = df[grouping_cols[0]]==group_name
# for subgroup_name in df.loc[group_filter,grouping_cols[1]].unique():
# # Create group subgroup folder structure implicitly.
# # Explicitly, grp = f.create_group(group_name), subgrp = grp.create_group(subgroup_name)
# print(join_path(group_name,subgroup_name))
# f.create_group(join_path(group_name,subgroup_name))
print(':)')
else:
raise ValueError("'approach' must take values in ['top-down','bottom-up']")
#for i, value in enumerate(df['level_'+str(0)+'_groups'].unique().tolist()):
# 2. Validate group hierarchy, lower level groups must be embedded in higher level groups
# 3. Create hdf5 file with groups defined by the 'file_group' column
@ -199,16 +287,37 @@ def main():
print(input_data['filetype'].unique())
# Reduce input_data to files of ibw type
input_data = input_data.loc[input_data['filetype']=='ibw', : ]
input_data = input_data.loc[input_data['sample']!='' , : ]
#input_data = input_data.loc[input_data['sample']!='' , : ]
sample_name = []
sample_quality = []
for item in input_data['sample']:
if item.find('(')!=-1:
print(item)
sample_name.append(item[0:item.find('(')])
sample_quality.append(item[item.find('(')+1:len(item)-1])
else:
if item=='':
sample_name.append('Not yet annotated')
sample_quality.append('unevaluated')
else:
sample_name.append(item)
sample_quality.append('good data')
input_data['sample'] = sample_name
input_data['data_quality'] = sample_quality
#input_data = input_data.loc[input_data['sample']!='' , : ]
#group_by_func = lambda df: [item[0:item.find('_')] for item in df['name']]
#group_by_func
group_by_sample = lambda x : group_by_df_column(x,'sample')
df = create_hdf5_file(input_data,[group_by_filenumber,group_by_sample])
group_by_type = lambda x : group_by_df_column(x,'filetype')
df = create_hdf5_file('test.h5',input_data, 'top-down', group_by_funcs = [group_by_sample, group_by_filenumber,group_by_type])
df['file_group']