diff --git a/hdf5_lib.py b/hdf5_lib.py
index 9b1e630..6c48e50 100644
--- a/hdf5_lib.py
+++ b/hdf5_lib.py
@@ -4,6 +4,8 @@
 import os
 import sys
 import numpy as np
+import matplotlib.pyplot as plt
+
 def is_wrapped(value):
     """returns True if value is contained in a 1 by 1 array, or False otherwise."""
     if not isinstance(value,np.ndarray):
@@ -104,36 +106,95 @@ def read_hdf5_as_dataframe_v2(filename):
 
     return output_dataframe
 
-def is_group_hierarchy_valid(df) -> bool:
+def is_callable_list(x : list):
+    return all([callable(item) for item in x])
+def is_str_list(x : list):
+    return all([isinstance(item,str) for item in x])
+def is_nested_hierarchy(df) -> bool:
+    """receives a dataframe with categorical columns and checks whether rows form a nested group hierarchy.
+    That is, from bottom to top, subsequent hierarchical levels contain nested groups. The lower level groups belong to exactly one group in the higher level group.
+    """
     # TODO: generalize the code to check for deeper group hierachies.
+    def are_nested(df, col, col_nxt):
+        """ Checks whether low level LL groups can be separated in terms of high level HL groups.
+        That is, elements of low-level groups do not belong to more than one HL group."""
+
+        # Compute higher level group names/categories
+        memberships = df[col_nxt].unique().tolist()
+
+        # Compute upper-level group memberships of low-level groups
+        col_avg_memberships = df.groupby(col).mean()[col_nxt].unique()
+
+        # Check whether all low-level groups have an actual hlg membership. That is, their avg. hlg membership is in the hlg membership.
+        return all([col_avg_memberships[group_idx] in memberships for group_idx in range(len(col_avg_memberships))])
+
     df_tmp = df.copy()
 
     # Create relabeling map
     for column_name in df_tmp.columns:
         category_index = pd.Series(np.arange(len(df_tmp[column_name].unique())), index=df_tmp[column_name].unique())
         df_tmp[column_name] = category_index[df_tmp[column_name].tolist()].tolist()
 
-    # Extract level 1 group names
-    l1_group_names = df_tmp['level_1_groups'].unique().tolist()
-    return all([item in l1_group_names for item in df_tmp.groupby('level_0_groups').mean()['level_1_groups'].unique()])
+    df_tmp.plot()
+
+    return all([are_nested(df_tmp,'level_'+str(i)+'_groups','level_'+str(i+1)+'_groups') for i in range(len(df_tmp.columns)-1)])
 
-def create_hdf5_file(input_data, group_by_funcs : list, extract_attrs_func = None):
+def get_attr_names(input_data):
+
+    # TODO: extend this to file-system paths
+    if not isinstance(input_data,pd.DataFrame):
+        raise ValueError("input_data must be a pd.DataFrame")
+
+    return input_data.columns
+
+from itertools import product
+
+def set_group_hierarchy(file: h5py.File, df):
+
+    args = [df[col].unique().tolist() for col in df.columns]
+    group_paths = ['/'+'/'.join(item) for item in list(product(*args))]
+
+    return group_paths
+
+def create_group_hierarchy(obj, columns, df):
+
+    """
+
+    Input:
+    obj (h5py.File or h5py.Group)
+    columns (list of strs): denote categorical columns in df to be used to define hdf5 file group hierarchy
+
+    """
+
+    if not columns:
+        return
+
+    # Determine categories associated with first categorical column
+    unique_values = df[columns[0]].unique()
+
+    for group_name in unique_values:
+
+        group = obj.require_group(group_name)
+        sub_df = df[df[columns[0]]==group_name] # same as df.loc[df[columns[0]]==group_name,:]
+        #if group_name == 'MgO powder,H2O,HCl':
+        #    print('Here:',sub_df.shape)
+        create_group_hierarchy(group, columns[1::], sub_df)
+
+
+
+def create_hdf5_file(filename, input_data, approach : str,
+                     group_by_funcs : list, extract_attrs_func = None):
     """ Creates an hdf5 file with at most three group levels, bottom, middle, and top level groups, where the top level group is the root '/' group.
 
     input_data (pd.DataFrame | file-system path) :
-    group_by_funcs (list of callables) : returns a pd.Series, from input_data elements to group labels. input data elements with same label belong to the same group.
+    group_by_funcs (list of callables or strs) : returns a pd.Series, from input_data elements to group labels. input data elements with same label belong to the same group.
 
     """
-
-    if not all([callable(func) for func in group_by_funcs]):
-        raise ValueError("'group_by_funcs' must be a list of callables (or functions) that takes input_data as input an returns valid categorical output.")
 
     # Check whether input_data is a valid file system path or a dataframe
-
     check_possible_path = lambda x : os.path.exists(input_data) if isinstance(input_data,str) else False
 
     if check_possible_path(input_data):
@@ -144,24 +205,51 @@ def create_hdf5_file(input_data, group_by_funcs : list, extract_attrs_func = Non
     else:
         raise ValueError("input_data must be either a valid file-system path or a dataframe.")
 
-    list_of_group_cols = []
-    for i, func in enumerate(group_by_funcs):
-        list_of_group_cols.append('level_'+str(i)+'_groups')
-        df['level_'+str(i)+'_groups'] = func(df)
-
-    # Check the length of group_by_funcs list is at most 2
-    if len(group_by_funcs) > 2:
-        # TODO: extend to more than 2 callable elements.
-        raise ValueError("group_by_funcs can only contain at most two callable elements.")
-
-    if not is_group_hierarchy_valid(df.loc[:,list_of_group_cols]):
-        raise ValueError("group_by_funcs do not define a valid group hierarchy. Please reprocess the input_data or choose different grouping functions.")
-
+    #
+    if is_callable_list(group_by_funcs):
+        grouping_cols = []
+        for i, func in enumerate(group_by_funcs):
+            grouping_cols.append('level_'+str(i)+'_groups')
+            df['level_'+str(i)+'_groups'] = func(df)
+    elif is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
+        grouping_cols = group_by_funcs
+    else:
+        raise ValueError("'group_by_funcs' must be a list of callables (or str) that take input_data as input and return a valid categorical output.")
+    if approach == 'bottom-up':
+        # TODO: implement bottom-up approach
+        if is_nested_hierarchy(df.loc[:,grouping_cols]):
+            print('Do something')
+        else:
+            raise ValueError("group_by_funcs do not define a valid group hierarchy. Please reprocess the input_data or choose different grouping functions.")
+
+    elif approach == 'top-down':
+        # Check the length of group_by_funcs list is at most 2
+        #if len(group_by_funcs) > 2:
+        #    # TODO: extend to more than 2 callable elements.
+        #    raise ValueError("group_by_funcs can only contain at most two grouping elements.")
+
+        with h5py.File(filename, 'w') as f:
+
+            create_group_hierarchy(f, grouping_cols, df)
+
+            #join_path = lambda x,y: '/' + x + '/' + y
+            #for group_name in df[grouping_cols[0]].unique():
+            #    group_filter = df[grouping_cols[0]]==group_name
+            #    for subgroup_name in df.loc[group_filter,grouping_cols[1]].unique():
+            #        # Create group subgroup folder structure implicitly.
+            #        # Explicitly, grp = f.create_group(group_name), subgrp = grp.create_group(subgroup_name)
+            #        print(join_path(group_name,subgroup_name))
+            #        f.create_group(join_path(group_name,subgroup_name))
+
+        print(':)')
+
+    else:
+        raise ValueError("'approach' must take values in ['top-down','bottom-up']")
+
     #for i, value in enumerate(df['level_'+str(0)+'_groups'].unique().tolist()):
 
-    # 2. Validate group hierarchy, lower level groups must be embedded in higher level groups
     # 3. Create hdf5 file with groups defined by the 'file_group' column
@@ -199,16 +287,37 @@ def main():
     print(input_data['filetype'].unique())
     # Reduce input_data to files of ibw type
     input_data = input_data.loc[input_data['filetype']=='ibw', : ]
-    input_data = input_data.loc[input_data['sample']!='' , : ]
+    #input_data = input_data.loc[input_data['sample']!='' , : ]
+    sample_name = []
+    sample_quality = []
+    for item in input_data['sample']:
+        if item.find('(')!=-1:
+            print(item)
+            sample_name.append(item[0:item.find('(')])
+            sample_quality.append(item[item.find('(')+1:len(item)-1])
+        else:
+
+            if item=='':
+                sample_name.append('Not yet annotated')
+                sample_quality.append('unevaluated')
+            else:
+                sample_name.append(item)
+                sample_quality.append('good data')
+    input_data['sample'] = sample_name
+    input_data['data_quality'] = sample_quality
+    #input_data = input_data.loc[input_data['sample']!='' , : ]
 
     #group_by_func = lambda df: [item[0:item.find('_')] for item in df['name']]
     #group_by_func
     group_by_sample = lambda x : group_by_df_column(x,'sample')
-    df = create_hdf5_file(input_data,[group_by_filenumber,group_by_sample])
+    group_by_type = lambda x : group_by_df_column(x,'filetype')
+
+
+    df = create_hdf5_file('test.h5',input_data, 'top-down', group_by_funcs = [group_by_sample, group_by_filenumber,group_by_type])
     df['file_group']
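
For review purposes, a minimal usage sketch of the reworked API above (not part of the patch): it exercises the new string-column branch of group_by_funcs together with the 'top-down' path, assuming hdf5_lib is importable; the toy dataframe and the test.h5 filename are illustrative, loosely mirroring the call in main().

import pandas as pd
import hdf5_lib

# Toy stand-in for the dataframe that main() builds from the measurement files.
input_data = pd.DataFrame({'sample'  : ['MgO powder', 'MgO powder', 'NaCl film'],
                           'filetype': ['ibw', 'ibw', 'dat']})

# Passing column names takes the is_str_list() branch, so no grouping callables are needed;
# 'top-down' lets create_group_hierarchy() build nested /<sample>/<filetype> groups in test.h5.
hdf5_lib.create_hdf5_file('test.h5', input_data, 'top-down',
                          group_by_funcs=['sample', 'filetype'])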