Cleaned code and modified def create_hdf5_file_from_dataframe to create group hierarchy implicitly from path rather than recursively.

This commit is contained in:
2024-07-08 15:24:48 +02:00
parent 92eca4d79e
commit cb7d914908

View File

@ -367,8 +367,7 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
""" """
Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure) Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure)
of a given filesystem path. When file and directory keywords are non-empty, the keywords enable filtering of a given filesystem path.
of directory paths and file paths that do not contain the specified keywords.
The data integration capabilities are limited by our file reader, which can only access data from a list of The data integration capabilities are limited by our file reader, which can only access data from a list of
admissible file formats. These, however, can be extended. Directories are groups in the resulting HDF5 file. admissible file formats. These, however, can be extended. Directories are groups in the resulting HDF5 file.
@ -557,7 +556,7 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
Top level denotes the root group/directory and bottom level denotes measurement level groups. Top level denotes the root group/directory and bottom level denotes measurement level groups.
Parameters: Parameters:
input_data (pd.DataFrame | file-system path) : input_data (pd.DataFrame) :
group_by_funcs (list of callables or strs) : contains a list of callables or dataframe's column names that will be used group_by_funcs (list of callables or strs) : contains a list of callables or dataframe's column names that will be used
to partition or group files from top to bottom. to partition or group files from top to bottom.
@ -589,17 +588,20 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
else: else:
raise ValueError("input_data must be either a valid file-system path or a dataframe.") raise ValueError("input_data must be either a valid file-system path or a dataframe.")
# # Create group columns to form paths
if utils.is_callable_list(group_by_funcs): if utils.is_callable_list(group_by_funcs):
grouping_cols = [] grouping_cols = []
for i, func in enumerate(group_by_funcs): for i, func in enumerate(group_by_funcs):
grouping_cols.append('level_'+str(i)+'_groups') grouping_cols.append('level_'+str(i)+'_groups')
df['level_'+str(i)+'_groups'] = func(df) df['level_'+str(i)+'_groups'] = func(df)
elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]): elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
grouping_cols = group_by_funcs grouping_cols = group_by_funcs
else: else:
raise ValueError("'group_by_funcs' must be a list of callables (or str) that takes input_data as input an returns a valid categorical output.") raise ValueError("'group_by_funcs' must be a list of callables (or str) that takes input_data as input an returns a valid categorical output.")
# Concatenate group columns to form paths
df['group_path'] = df[grouping_cols].apply(lambda row: '/'.join(row.values.astype(str)), axis=1)
if approach == 'botton-up': if approach == 'botton-up':
# TODO: implement botton-up approach # TODO: implement botton-up approach
if is_nested_hierarchy(df.loc[:,grouping_cols]): if is_nested_hierarchy(df.loc[:,grouping_cols]):
@ -614,29 +616,18 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
# raise ValueError("group_by_funcs can only contain at most two grouping elements.") # raise ValueError("group_by_funcs can only contain at most two grouping elements.")
with h5py.File(ofilename, 'w') as file: with h5py.File(ofilename, 'w') as file:
# Create groups based on concatenated paths
for path in df['group_path'].unique():
file.create_group(path)
# TODO: incorporate remaining cols (i.e., excluding the group columns) as either metadata or datasets
create_group_hierarchy(file, df, grouping_cols) #create_group_hierarchy(file, df, grouping_cols)
file.attrs.create(name='depth', data=len(grouping_cols)-1) file.attrs.create(name='depth', data=len(grouping_cols)-1)
#join_path = lambda x,y: '/' + x + '/' + y
#for group_name in df[grouping_cols[0]].unique():
# group_filter = df[grouping_cols[0]]==group_name
# for subgroup_name in df.loc[group_filter,grouping_cols[1]].unique():
# # Create group subgroup folder structure implicitly.
# # Explicitly, grp = f.create_group(group_name), subgrp = grp.create_group(subgroup_name)
# print(join_path(group_name,subgroup_name))
# f.create_group(join_path(group_name,subgroup_name))
# Get groups at the bottom of the hierarchy
#bottom_level_groups = get_groups_at_a_level(file, file.attrs['depth'])
#nodes, parents, values = get_parent_child_relationships(file)
print(':)') print(':)')
#fig = px.treemap(values=values,names=nodes, parents= parents)
#fig.update_traces(root_color="lightgrey")
#fig.update_layout(width = 800, height=600, margin = dict(t=50, l=25, r=25, b=25))
#fig.show()
else: else:
raise ValueError("'approach' must take values in ['top-down','bottom-up']") raise ValueError("'approach' must take values in ['top-down','bottom-up']")
@ -651,23 +642,6 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
#return 0 #return 0
#def main():
# inputfile_dir = config_file.inputfile_dir #'\\\\fs03\\Iron_Sulphate'
# select_dir_keywords = config_file.select_dir_keywords #['gas','smps\\20220726','htof\\2022.07.26','ptr\\2022.07.26','ams\\2022.07.26']
# select_file_keywords = config_file.select_file_keywords #['20220726','2022.07.26']
# output_filename_path = os.path.join(config_file.outputfile_dir,config_file.output_filename)
# if not os.path.exists(output_filename_path):
# create_hdf5_file_from_filesystem_path(output_filename_path,inputfile_dir,select_dir_keywords,select_file_keywords)
# # hdf5_vis.display_group_hierarchy_on_a_treemap(output_filename_path)
# output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename_path)
# return output_filename_path, output_yml_filename_path
def main_mtable_h5_from_dataframe(): def main_mtable_h5_from_dataframe():
#import os #import os