def is_group_hierarchy_valid(df) -> bool:
    """Check that the group-label columns of ``df`` form a nested hierarchy.

    Level columns are read in order ``'level_0_groups'``,
    ``'level_1_groups'``, ... and the scan stops at the first missing name.
    The hierarchy is valid when, for every pair of adjacent levels, all rows
    sharing a ``level_i`` label also share the same ``level_(i+1)`` label,
    i.e. each ``level_i`` group sits inside exactly one ``level_(i+1)`` group.

    Args:
        df: DataFrame holding one group-label column per level. It is not
            modified.

    Returns:
        True if every level is nested in the next one. Also True when fewer
        than two level columns are present, since there is nothing to nest.
        False as soon as one group is found to span several parent groups.
    """
    # Collect the consecutive level columns that are actually present.
    level_columns = []
    name = 'level_0_groups'
    while name in df.columns:
        level_columns.append(name)
        name = 'level_' + str(len(level_columns)) + '_groups'

    for child_col, parent_col in zip(level_columns, level_columns[1:]):
        # Each child label may be paired with only one parent label. Averaging
        # relabelled integer codes cannot detect this reliably: codes 0 and 2
        # average to the valid code 1. Compare the distinct pairs instead.
        distinct_pairs = df[[child_col, parent_col]].drop_duplicates()
        if distinct_pairs[child_col].duplicated().any():
            return False

    return True
""" + if not all([callable(func) for func in group_by_funcs]): raise ValueError("'group_by_funcs' must be a list of callables (or functions) that takes input_data as input an returns valid categorical output.") # Check whether input_data is a valid file system path or a dataframe @@ -123,9 +144,24 @@ def create_hdf5_file(input_data, group_by_funcs, extract_attrs_func = None): else: raise ValueError("input_data must be either a valid file-system path or a dataframe.") + list_of_group_cols = [] for i, func in enumerate(group_by_funcs): + list_of_group_cols.append('level_'+str(i)+'_groups') df['level_'+str(i)+'_groups'] = func(df) + # Check the length of group_by_funcs list is at most 2 + if len(group_by_funcs) > 2: + # TODO: extend to more than 2 callable elements. + raise ValueError("group_by_funcs can only contain at most two callable elements.") + + if not is_group_hierarchy_valid(df.loc[:,list_of_group_cols]): + raise ValueError("group_by_funcs do not define a valid group hierarchy. Please reprocess the input_data or choose different grouping functions.") + + + + #for i, value in enumerate(df['level_'+str(0)+'_groups'].unique().tolist()): + + # 2. Validate group hierarchy, lower level groups must be embedded in higher level groups # 3. Create hdf5 file with groups defined by the 'file_group' column @@ -162,7 +198,8 @@ def main(): input_data['filetype'] = get_filetype(input_data) print(input_data['filetype'].unique()) # Reduce input_data to files of ibw type - input_data = input_data.loc[input_data['filetype']=='ibw',:] + input_data = input_data.loc[input_data['filetype']=='ibw', : ] + input_data = input_data.loc[input_data['sample']!='' , : ]