Updated create_hdf5_file.py, enabling either bottom-up or top-down construction of group hierarchies. So far, the implementation works for top-down construction, where group hierarchies can be of any depth and are defined by categorical columns.

This commit is contained in:
2023-10-26 14:44:53 +02:00
parent d95a08caaa
commit 87b256d93e

View File

@ -4,6 +4,8 @@ import os
import sys
import numpy as np
import matplotlib.pyplot as plt
def is_wrapped(value):
"""returns True if value is contained in a 1 by 1 array, or False otherwise."""
if not isinstance(value,np.ndarray):
@ -104,36 +106,95 @@ def read_hdf5_as_dataframe_v2(filename):
return output_dataframe
def is_group_hierarchy_valid(df) -> bool:
def is_callable_list(x : list):
    """Return True if every element of ``x`` is callable (True for an empty list)."""
    # all() accepts a generator directly; no intermediate list needed.
    return all(callable(item) for item in x)
def is_str_list(x : list):
    """Return True if every element of ``x`` is a str (True for an empty list)."""
    # all() accepts a generator directly; no intermediate list needed.
    return all(isinstance(item, str) for item in x)
def is_nested_hierarchy(df) -> bool:
    """Check whether the categorical columns of ``df`` define a nested group hierarchy.

    Columns are interpreted left to right as increasingly high hierarchy levels.
    The hierarchy is nested when every group at one level belongs to exactly one
    group at the next level up.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe containing only the categorical grouping columns, ordered
        from the lowest hierarchy level (first column) to the highest (last).

    Returns
    -------
    bool
        True if every pair of consecutive columns forms nested groups.
    """
    def are_nested(df, col, col_nxt):
        """Return True if each group in ``col`` maps to a single group in ``col_nxt``.

        After integer relabeling, a low-level group whose members span more than
        one high-level group has a fractional mean membership, which cannot equal
        any of the integer high-level labels.
        """
        memberships = df[col_nxt].unique().tolist()
        # Average high-level label per low-level group; a non-integer average
        # reveals membership in more than one high-level group.
        col_avg_memberships = df.groupby(col).mean()[col_nxt].unique()
        return all(avg in memberships for avg in col_avg_memberships)

    df_tmp = df.copy()
    # Relabel each column's categories as consecutive integers so that group
    # membership can be tested numerically via group means.
    for column_name in df_tmp.columns:
        category_index = pd.Series(np.arange(len(df_tmp[column_name].unique())),
                                   index=df_tmp[column_name].unique())
        df_tmp[column_name] = category_index[df_tmp[column_name].tolist()].tolist()
    # Use positional column access instead of hard-coded 'level_i_groups' names,
    # so the check also works for user-supplied grouping columns.
    return all(are_nested(df_tmp, df_tmp.columns[i], df_tmp.columns[i + 1])
               for i in range(len(df_tmp.columns) - 1))
def create_hdf5_file(input_data, group_by_funcs : list, extract_attrs_func = None):
def get_attr_names(input_data):
    """Return the attribute (column) names of ``input_data``.

    Raises
    ------
    ValueError
        If ``input_data`` is not a pandas DataFrame.
    """
    # TODO: extend this to file-system paths
    if isinstance(input_data, pd.DataFrame):
        return input_data.columns
    raise ValueError("input_data must be a pd.DataFrame")
from itertools import product
def set_group_hierarchy(file: h5py.File, df):
    """Build all candidate HDF5 group paths from the categorical columns of ``df``.

    Paths are the cartesian product of each column's unique values, joined with
    '/'. Note: ``file`` is currently unused; paths are derived from ``df`` alone.
    """
    unique_per_column = [df[column].unique().tolist() for column in df.columns]
    return ['/' + '/'.join(combo) for combo in product(*unique_per_column)]
def create_group_hierarchy(obj, columns, df):
    """Recursively create nested HDF5 groups under ``obj``.

    Parameters
    ----------
    obj : h5py.File or h5py.Group
        Parent object under which the groups are created.
    columns : list of str
        Categorical columns of ``df`` defining the hierarchy, from the topmost
        level (first element) downwards.
    df : pd.DataFrame
        Data whose rows are partitioned into the groups.
    """
    # Base case: no more hierarchy levels to create.
    if not columns:
        return
    # One subgroup per category of the current (topmost remaining) column.
    # Assumes the column's values are strings usable as group names — TODO confirm.
    unique_values = df[columns[0]].unique()
    for group_name in unique_values:
        # require_group is idempotent: it reuses the group if it already exists.
        group = obj.require_group(group_name)
        # Restrict to the rows belonging to this group before recursing.
        sub_df = df[df[columns[0]] == group_name]
        create_group_hierarchy(group, columns[1:], sub_df)
def create_hdf5_file(filename, input_data, approach : str, group_by_funcs : list, extract_attrs_func = None):
""" Creates an hdf5 file with at most three group levels, bottom, middle, and top level groups, where the top level group is the root '/' group.
input_data (pd.DataFrame | file-system path) :
group_by_funcs (list of callables) : returns a pd.Series, from input_data elements to group labels. input data elements with same label belong to the same group.
group_by_funcs (list of callables or strs) : returns a pd.Series, from input_data elements to group labels. input data elements with same label belong to the same group.
"""
if not all([callable(func) for func in group_by_funcs]):
raise ValueError("'group_by_funcs' must be a list of callables (or functions) that takes input_data as input an returns valid categorical output.")
# Check whether input_data is a valid file system path or a dataframe
check_possible_path = lambda x : os.path.exists(input_data) if isinstance(input_data,str) else False
if check_possible_path(input_data):
@ -144,24 +205,51 @@ def create_hdf5_file(input_data, group_by_funcs : list, extract_attrs_func = Non
else:
raise ValueError("input_data must be either a valid file-system path or a dataframe.")
list_of_group_cols = []
for i, func in enumerate(group_by_funcs):
list_of_group_cols.append('level_'+str(i)+'_groups')
df['level_'+str(i)+'_groups'] = func(df)
# Check the length of group_by_funcs list is at most 2
if len(group_by_funcs) > 2:
# TODO: extend to more than 2 callable elements.
raise ValueError("group_by_funcs can only contain at most two callable elements.")
if not is_group_hierarchy_valid(df.loc[:,list_of_group_cols]):
raise ValueError("group_by_funcs do not define a valid group hierarchy. Please reprocess the input_data or choose different grouping functions.")
#
if is_callable_list(group_by_funcs):
grouping_cols = []
for i, func in enumerate(group_by_funcs):
grouping_cols.append('level_'+str(i)+'_groups')
df['level_'+str(i)+'_groups'] = func(df)
elif is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
grouping_cols = group_by_funcs
else:
raise ValueError("'group_by_funcs' must be a list of callables (or str) that takes input_data as input an returns a valid categorical output.")
if approach == 'botton-up':
# TODO: implement botton-up approach
if is_nested_hierarchy(df.loc[:,grouping_cols]):
print('Do something')
else:
raise ValueError("group_by_funcs do not define a valid group hierarchy. Please reprocess the input_data or choose different grouping functions.")
elif approach == 'top-down':
# Check the length of group_by_funcs list is at most 2
#if len(group_by_funcs) > 2:
# # TODO: extend to more than 2 callable elements.
# raise ValueError("group_by_funcs can only contain at most two grouping elements.")
with h5py.File(filename, 'w') as f:
create_group_hierarchy(f, grouping_cols, df)
#join_path = lambda x,y: '/' + x + '/' + y
#for group_name in df[grouping_cols[0]].unique():
# group_filter = df[grouping_cols[0]]==group_name
# for subgroup_name in df.loc[group_filter,grouping_cols[1]].unique():
# # Create group subgroup folder structure implicitly.
# # Explicitly, grp = f.create_group(group_name), subgrp = grp.create_group(subgroup_name)
# print(join_path(group_name,subgroup_name))
# f.create_group(join_path(group_name,subgroup_name))
print(':)')
else:
raise ValueError("'approach' must take values in ['top-down','bottom-up']")
#for i, value in enumerate(df['level_'+str(0)+'_groups'].unique().tolist()):
# 2. Validate group hierarchy, lower level groups must be embedded in higher level groups
# 3. Create hdf5 file with groups defined by the 'file_group' column
@ -199,16 +287,37 @@ def main():
print(input_data['filetype'].unique())
# Reduce input_data to files of ibw type
input_data = input_data.loc[input_data['filetype']=='ibw', : ]
input_data = input_data.loc[input_data['sample']!='' , : ]
#input_data = input_data.loc[input_data['sample']!='' , : ]
sample_name = []
sample_quality = []
for item in input_data['sample']:
if item.find('(')!=-1:
print(item)
sample_name.append(item[0:item.find('(')])
sample_quality.append(item[item.find('(')+1:len(item)-1])
else:
if item=='':
sample_name.append('Not yet annotated')
sample_quality.append('unevaluated')
else:
sample_name.append(item)
sample_quality.append('good data')
input_data['sample'] = sample_name
input_data['data_quality'] = sample_quality
#input_data = input_data.loc[input_data['sample']!='' , : ]
#group_by_func = lambda df: [item[0:item.find('_')] for item in df['name']]
#group_by_func
group_by_sample = lambda x : group_by_df_column(x,'sample')
df = create_hdf5_file(input_data,[group_by_filenumber,group_by_sample])
group_by_type = lambda x : group_by_df_column(x,'filetype')
df = create_hdf5_file('test.h5',input_data, 'top-down', group_by_funcs = [group_by_sample, group_by_filenumber,group_by_type])
df['file_group']