From cb7d91490877818958735d154a949bf2cfe773ff Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Mon, 8 Jul 2024 15:24:48 +0200 Subject: [PATCH] Cleaned code and modified def create_hdf5_file_from_dataframe to create group hierarchy implicitly from path rather than recursively. --- src/hdf5_lib.py | 54 +++++++++++++------------------------------------ 1 file changed, 14 insertions(+), 40 deletions(-) diff --git a/src/hdf5_lib.py b/src/hdf5_lib.py index e90c0ee..b3ff6eb 100644 --- a/src/hdf5_lib.py +++ b/src/hdf5_lib.py @@ -367,8 +367,7 @@ def create_hdf5_file_from_filesystem_path(output_filename : str, """ Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure) - of a given filesystem path. When file and directory keywords are non-empty, the keywords enable filtering - of directory paths and file paths that do not contain the specified keywords. + of a given filesystem path. The data integration capabilities are limited by our file reader, which can only access data from a list of admissible file formats. These, however, can be extended. Directories are groups in the resulting HDF5 file. @@ -557,7 +556,7 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group Top level denotes the root group/directory and bottom level denotes measurement level groups. Parameters: - input_data (pd.DataFrame | file-system path) : + input_data (pd.DataFrame) : group_by_funcs (list of callables or strs) : contains a list of callables or dataframe's column names that will be used to partition or group files from top to bottom. 
@@ -589,17 +588,20 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group else: raise ValueError("input_data must be either a valid file-system path or a dataframe.") - # + # Create group columns to form paths if utils.is_callable_list(group_by_funcs): grouping_cols = [] for i, func in enumerate(group_by_funcs): grouping_cols.append('level_'+str(i)+'_groups') - df['level_'+str(i)+'_groups'] = func(df) + df['level_'+str(i)+'_groups'] = func(df) elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]): grouping_cols = group_by_funcs else: raise ValueError("'group_by_funcs' must be a list of callables (or str) that takes input_data as input an returns a valid categorical output.") + # Concatenate group columns to form paths + df['group_path'] = df[grouping_cols].apply(lambda row: '/'.join(row.values.astype(str)), axis=1) + if approach == 'botton-up': # TODO: implement botton-up approach if is_nested_hierarchy(df.loc[:,grouping_cols]): @@ -614,29 +616,18 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group # raise ValueError("group_by_funcs can only contain at most two grouping elements.") with h5py.File(ofilename, 'w') as file: + + # Create groups based on concatenated paths + for path in df['group_path'].unique(): + file.create_group(path) + # TODO: incorporate remaining cols (i.e., excluding the group columns) as either metadata or datasets - create_group_hierarchy(file, df, grouping_cols) + #create_group_hierarchy(file, df, grouping_cols) file.attrs.create(name='depth', data=len(grouping_cols)-1) - #join_path = lambda x,y: '/' + x + '/' + y - #for group_name in df[grouping_cols[0]].unique(): - # group_filter = df[grouping_cols[0]]==group_name - # for subgroup_name in df.loc[group_filter,grouping_cols[1]].unique(): - # # Create group subgroup folder structure implicitly. 
- # # Explicitly, grp = f.create_group(group_name), subgrp = grp.create_group(subgroup_name) - # print(join_path(group_name,subgroup_name)) - # f.create_group(join_path(group_name,subgroup_name)) - - # Get groups at the bottom of the hierarchy - #bottom_level_groups = get_groups_at_a_level(file, file.attrs['depth']) - - #nodes, parents, values = get_parent_child_relationships(file) print(':)') - #fig = px.treemap(values=values,names=nodes, parents= parents) - #fig.update_traces(root_color="lightgrey") - #fig.update_layout(width = 800, height=600, margin = dict(t=50, l=25, r=25, b=25)) - #fig.show() + else: raise ValueError("'approach' must take values in ['top-down','bottom-up']") @@ -651,23 +642,6 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group #return 0 - -#def main(): - -# inputfile_dir = config_file.inputfile_dir #'\\\\fs03\\Iron_Sulphate' -# select_dir_keywords = config_file.select_dir_keywords #['gas','smps\\20220726','htof\\2022.07.26','ptr\\2022.07.26','ams\\2022.07.26'] -# select_file_keywords = config_file.select_file_keywords #['20220726','2022.07.26'] - -# output_filename_path = os.path.join(config_file.outputfile_dir,config_file.output_filename) -# if not os.path.exists(output_filename_path): -# create_hdf5_file_from_filesystem_path(output_filename_path,inputfile_dir,select_dir_keywords,select_file_keywords) - -# # hdf5_vis.display_group_hierarchy_on_a_treemap(output_filename_path) -# output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename_path) - - -# return output_filename_path, output_yml_filename_path - def main_mtable_h5_from_dataframe(): #import os