Cleaned code and modified def create_hdf5_file_from_dataframe to create the group hierarchy implicitly from paths rather than recursively.

This commit is contained in:
2024-07-08 15:24:48 +02:00
parent 92eca4d79e
commit cb7d914908
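
The core of the change: instead of building the group tree with a recursive create_group_hierarchy call, the new code passes a slash-separated path to h5py's create_group, which creates every intermediate group implicitly. A minimal sketch of the two styles (file and group names are illustrative only, not taken from the repository):

import h5py

with h5py.File('example.h5', 'w') as f:
    # Implicit: one call creates 'instrument' and 'instrument/2022.07.26' on the way to the leaf group.
    f.create_group('instrument/2022.07.26/run_01')

    # Explicit, recursive-style equivalent (the approach the commit moves away from):
    grp = f.require_group('instrument_b')        # require_group returns the group, creating it if needed
    sub = grp.require_group('2022.07.27')
    sub.require_group('run_01')

    print(list(f.keys()))   # ['instrument', 'instrument_b']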


@@ -367,8 +367,7 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
"""
Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure)
of a given filesystem path. When file and directory keywords are non-empty, the keywords enable filtering
of directory paths and file paths that do not contain the specified keywords.
of a given filesystem path.
The data integration capabilities are limited by our file reader, which can only access data from a list of
admissible file formats. These, however, can be extended. Directories are groups in the resulting HDF5 file.
@@ -557,7 +556,7 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
Top level denotes the root group/directory and bottom level denotes measurement level groups.
Parameters:
input_data (pd.DataFrame | file-system path) :
input_data (pd.DataFrame) :
group_by_funcs (list of callables or strs) : contains a list of callables or dataframe's column names that will be used
to partition or group files from top to bottom.
@@ -589,17 +588,20 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
else:
raise ValueError("input_data must be either a valid file-system path or a dataframe.")
#
# Create group columns to form paths
if utils.is_callable_list(group_by_funcs):
grouping_cols = []
for i, func in enumerate(group_by_funcs):
grouping_cols.append('level_'+str(i)+'_groups')
df['level_'+str(i)+'_groups'] = func(df)
df['level_'+str(i)+'_groups'] = func(df)
elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
grouping_cols = group_by_funcs
else:
raise ValueError("'group_by_funcs' must be a list of callables (or str) that takes input_data as input an returns a valid categorical output.")
# Concatenate group columns to form paths
df['group_path'] = df[grouping_cols].apply(lambda row: '/'.join(row.values.astype(str)), axis=1)
if approach == 'bottom-up':
# TODO: implement bottom-up approach
if is_nested_hierarchy(df.loc[:,grouping_cols]):
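
The new apply/'/'.join line above collapses the grouping columns into a single slash-separated path per row, which later becomes the HDF5 group path. A small, self-contained pandas sketch of that step (column names and values are hypothetical):

import pandas as pd

df = pd.DataFrame({'instrument': ['smps', 'smps', 'htof'],
                   'date': ['2022.07.26', '2022.07.27', '2022.07.26']})
grouping_cols = ['instrument', 'date']

# Same concatenation as in the diff: one 'group_path' string per row.
df['group_path'] = df[grouping_cols].apply(lambda row: '/'.join(row.values.astype(str)), axis=1)
print(df['group_path'].unique())   # ['smps/2022.07.26' 'smps/2022.07.27' 'htof/2022.07.26']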
@@ -614,29 +616,18 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
# raise ValueError("group_by_funcs can only contain at most two grouping elements.")
with h5py.File(ofilename, 'w') as file:
# Create groups based on concatenated paths
for path in df['group_path'].unique():
file.create_group(path)
# TODO: incorporate remaining cols (i.e., excluding the group columns) as either metadata or datasets
create_group_hierarchy(file, df, grouping_cols)
#create_group_hierarchy(file, df, grouping_cols)
file.attrs.create(name='depth', data=len(grouping_cols)-1)
#join_path = lambda x,y: '/' + x + '/' + y
#for group_name in df[grouping_cols[0]].unique():
# group_filter = df[grouping_cols[0]]==group_name
# for subgroup_name in df.loc[group_filter,grouping_cols[1]].unique():
# # Create group subgroup folder structure implicitly.
# # Explicitly, grp = f.create_group(group_name), subgrp = grp.create_group(subgroup_name)
# print(join_path(group_name,subgroup_name))
# f.create_group(join_path(group_name,subgroup_name))
# Get groups at the bottom of the hierarchy
#bottom_level_groups = get_groups_at_a_level(file, file.attrs['depth'])
#nodes, parents, values = get_parent_child_relationships(file)
print(':)')
#fig = px.treemap(values=values,names=nodes, parents= parents)
#fig.update_traces(root_color="lightgrey")
#fig.update_layout(width = 800, height=600, margin = dict(t=50, l=25, r=25, b=25))
#fig.show()
else:
raise ValueError("'approach' must take values in ['top-down','bottom-up']")
@@ -651,23 +642,6 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
#return 0
#def main():
# inputfile_dir = config_file.inputfile_dir #'\\\\fs03\\Iron_Sulphate'
# select_dir_keywords = config_file.select_dir_keywords #['gas','smps\\20220726','htof\\2022.07.26','ptr\\2022.07.26','ams\\2022.07.26']
# select_file_keywords = config_file.select_file_keywords #['20220726','2022.07.26']
# output_filename_path = os.path.join(config_file.outputfile_dir,config_file.output_filename)
# if not os.path.exists(output_filename_path):
# create_hdf5_file_from_filesystem_path(output_filename_path,inputfile_dir,select_dir_keywords,select_file_keywords)
# # hdf5_vis.display_group_hierarchy_on_a_treemap(output_filename_path)
# output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename_path)
# return output_filename_path, output_yml_filename_path
def main_mtable_h5_from_dataframe():
#import os