Cleaned code and modified def create_hdf5_file_from_dataframe to create group hierarchy implicitly from path rather than recursively.

This commit is contained in:
2024-07-08 15:24:48 +02:00
parent 92eca4d79e
commit cb7d914908

View File

@ -367,8 +367,7 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
""" """
Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure) Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure)
of a given filesystem path. When file and directory keywords are non-empty, the keywords enable filtering of a given filesystem path.
of directory paths and file paths that do not contain the specified keywords.
The data integration capabilities are limited by our file reader, which can only access data from a list of The data integration capabilities are limited by our file reader, which can only access data from a list of
admissible file formats. These, however, can be extended. Directories are groups in the resulting HDF5 file. admissible file formats. These, however, can be extended. Directories are groups in the resulting HDF5 file.
@ -557,7 +556,7 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
Top level denotes the root group/directory and bottom level denotes measurement level groups. Top level denotes the root group/directory and bottom level denotes measurement level groups.
Parameters: Parameters:
input_data (pd.DataFrame | file-system path) : input_data (pd.DataFrame) :
group_by_funcs (list of callables or strs) : contains a list of callables or dataframe's column names that will be used group_by_funcs (list of callables or strs) : contains a list of callables or dataframe's column names that will be used
to partition or group files from top to bottom. to partition or group files from top to bottom.
@ -589,17 +588,20 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
else: else:
raise ValueError("input_data must be either a valid file-system path or a dataframe.") raise ValueError("input_data must be either a valid file-system path or a dataframe.")
# # Create group columns to form paths
if utils.is_callable_list(group_by_funcs): if utils.is_callable_list(group_by_funcs):
grouping_cols = [] grouping_cols = []
for i, func in enumerate(group_by_funcs): for i, func in enumerate(group_by_funcs):
grouping_cols.append('level_'+str(i)+'_groups') grouping_cols.append('level_'+str(i)+'_groups')
df['level_'+str(i)+'_groups'] = func(df) df['level_'+str(i)+'_groups'] = func(df)
elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]): elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
grouping_cols = group_by_funcs grouping_cols = group_by_funcs
else: else:
raise ValueError("'group_by_funcs' must be a list of callables (or str) that takes input_data as input an returns a valid categorical output.") raise ValueError("'group_by_funcs' must be a list of callables (or str) that takes input_data as input an returns a valid categorical output.")
# Concatenate group columns to form paths
df['group_path'] = df[grouping_cols].apply(lambda row: '/'.join(row.values.astype(str)), axis=1)
if approach == 'botton-up': if approach == 'botton-up':
# TODO: implement botton-up approach # TODO: implement botton-up approach
if is_nested_hierarchy(df.loc[:,grouping_cols]): if is_nested_hierarchy(df.loc[:,grouping_cols]):
@ -614,29 +616,18 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
# raise ValueError("group_by_funcs can only contain at most two grouping elements.") # raise ValueError("group_by_funcs can only contain at most two grouping elements.")
with h5py.File(ofilename, 'w') as file: with h5py.File(ofilename, 'w') as file:
# Create groups based on concatenated paths
for path in df['group_path'].unique():
file.create_group(path)
# TODO: incorporate remaining cols (i.e., excluding the group columns) as either metadata or datasets
create_group_hierarchy(file, df, grouping_cols) #create_group_hierarchy(file, df, grouping_cols)
file.attrs.create(name='depth', data=len(grouping_cols)-1) file.attrs.create(name='depth', data=len(grouping_cols)-1)
#join_path = lambda x,y: '/' + x + '/' + y
#for group_name in df[grouping_cols[0]].unique():
# group_filter = df[grouping_cols[0]]==group_name
# for subgroup_name in df.loc[group_filter,grouping_cols[1]].unique():
# # Create group subgroup folder structure implicitly.
# # Explicitly, grp = f.create_group(group_name), subgrp = grp.create_group(subgroup_name)
# print(join_path(group_name,subgroup_name))
# f.create_group(join_path(group_name,subgroup_name))
# Get groups at the bottom of the hierarchy
#bottom_level_groups = get_groups_at_a_level(file, file.attrs['depth'])
#nodes, parents, values = get_parent_child_relationships(file)
print(':)') print(':)')
#fig = px.treemap(values=values,names=nodes, parents= parents)
#fig.update_traces(root_color="lightgrey")
#fig.update_layout(width = 800, height=600, margin = dict(t=50, l=25, r=25, b=25))
#fig.show()
else: else:
raise ValueError("'approach' must take values in ['top-down','bottom-up']") raise ValueError("'approach' must take values in ['top-down','bottom-up']")
@ -651,23 +642,6 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
#return 0 #return 0
#def main():
# inputfile_dir = config_file.inputfile_dir #'\\\\fs03\\Iron_Sulphate'
# select_dir_keywords = config_file.select_dir_keywords #['gas','smps\\20220726','htof\\2022.07.26','ptr\\2022.07.26','ams\\2022.07.26']
# select_file_keywords = config_file.select_file_keywords #['20220726','2022.07.26']
# output_filename_path = os.path.join(config_file.outputfile_dir,config_file.output_filename)
# if not os.path.exists(output_filename_path):
# create_hdf5_file_from_filesystem_path(output_filename_path,inputfile_dir,select_dir_keywords,select_file_keywords)
# # hdf5_vis.display_group_hierarchy_on_a_treemap(output_filename_path)
# output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename_path)
# return output_filename_path, output_yml_filename_path
def main_mtable_h5_from_dataframe(): def main_mtable_h5_from_dataframe():
#import os #import os