Cleaned up code and modified `create_hdf5_file_from_dataframe` to create the group hierarchy implicitly from paths rather than recursively.
This commit is contained in:
@ -367,8 +367,7 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
|
||||
"""
|
||||
|
||||
Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure)
|
||||
of a given filesystem path. When file and directory keywords are non-empty, the keywords enable filtering
|
||||
of directory paths and file paths that do not contain the specified keywords.
|
||||
of a given filesystem path.
|
||||
|
||||
The data integration capabilities are limited by our file reader, which can only access data from a list of
|
||||
admissible file formats. These, however, can be extended. Directories are groups in the resulting HDF5 file.
|
||||
@ -557,7 +556,7 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
|
||||
Top level denotes the root group/directory and bottom level denotes measurement level groups.
|
||||
|
||||
Parameters:
|
||||
input_data (pd.DataFrame | file-system path) :
|
||||
input_data (pd.DataFrame) :
|
||||
group_by_funcs (list of callables or strs) : contains a list of callables or dataframe's column names that will be used
|
||||
to partition or group files from top to bottom.
|
||||
|
||||
@ -589,17 +588,20 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
|
||||
else:
|
||||
raise ValueError("input_data must be either a valid file-system path or a dataframe.")
|
||||
|
||||
#
|
||||
# Create group columns to form paths
|
||||
if utils.is_callable_list(group_by_funcs):
|
||||
grouping_cols = []
|
||||
for i, func in enumerate(group_by_funcs):
|
||||
grouping_cols.append('level_'+str(i)+'_groups')
|
||||
df['level_'+str(i)+'_groups'] = func(df)
|
||||
df['level_'+str(i)+'_groups'] = func(df)
|
||||
elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
|
||||
grouping_cols = group_by_funcs
|
||||
else:
|
||||
raise ValueError("'group_by_funcs' must be a list of callables (or str) that takes input_data as input an returns a valid categorical output.")
|
||||
|
||||
# Concatenate group columns to form paths
|
||||
df['group_path'] = df[grouping_cols].apply(lambda row: '/'.join(row.values.astype(str)), axis=1)
|
||||
|
||||
if approach == 'botton-up':
|
||||
# TODO: implement botton-up approach
|
||||
if is_nested_hierarchy(df.loc[:,grouping_cols]):
|
||||
@ -614,29 +616,18 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
|
||||
# raise ValueError("group_by_funcs can only contain at most two grouping elements.")
|
||||
|
||||
with h5py.File(ofilename, 'w') as file:
|
||||
|
||||
# Create groups based on concatenated paths
|
||||
for path in df['group_path'].unique():
|
||||
file.create_group(path)
|
||||
# TODO: incorporate remaining cols (i.e., excluding the group columns) as either metadata or datasets
|
||||
|
||||
create_group_hierarchy(file, df, grouping_cols)
|
||||
#create_group_hierarchy(file, df, grouping_cols)
|
||||
|
||||
file.attrs.create(name='depth', data=len(grouping_cols)-1)
|
||||
|
||||
#join_path = lambda x,y: '/' + x + '/' + y
|
||||
#for group_name in df[grouping_cols[0]].unique():
|
||||
# group_filter = df[grouping_cols[0]]==group_name
|
||||
# for subgroup_name in df.loc[group_filter,grouping_cols[1]].unique():
|
||||
# # Create group subgroup folder structure implicitly.
|
||||
# # Explicitly, grp = f.create_group(group_name), subgrp = grp.create_group(subgroup_name)
|
||||
# print(join_path(group_name,subgroup_name))
|
||||
# f.create_group(join_path(group_name,subgroup_name))
|
||||
|
||||
# Get groups at the bottom of the hierarchy
|
||||
#bottom_level_groups = get_groups_at_a_level(file, file.attrs['depth'])
|
||||
|
||||
#nodes, parents, values = get_parent_child_relationships(file)
|
||||
print(':)')
|
||||
#fig = px.treemap(values=values,names=nodes, parents= parents)
|
||||
#fig.update_traces(root_color="lightgrey")
|
||||
#fig.update_layout(width = 800, height=600, margin = dict(t=50, l=25, r=25, b=25))
|
||||
#fig.show()
|
||||
|
||||
else:
|
||||
raise ValueError("'approach' must take values in ['top-down','bottom-up']")
|
||||
|
||||
@ -651,23 +642,6 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
|
||||
|
||||
#return 0
|
||||
|
||||
|
||||
#def main():
|
||||
|
||||
# inputfile_dir = config_file.inputfile_dir #'\\\\fs03\\Iron_Sulphate'
|
||||
# select_dir_keywords = config_file.select_dir_keywords #['gas','smps\\20220726','htof\\2022.07.26','ptr\\2022.07.26','ams\\2022.07.26']
|
||||
# select_file_keywords = config_file.select_file_keywords #['20220726','2022.07.26']
|
||||
|
||||
# output_filename_path = os.path.join(config_file.outputfile_dir,config_file.output_filename)
|
||||
# if not os.path.exists(output_filename_path):
|
||||
# create_hdf5_file_from_filesystem_path(output_filename_path,inputfile_dir,select_dir_keywords,select_file_keywords)
|
||||
|
||||
# # hdf5_vis.display_group_hierarchy_on_a_treemap(output_filename_path)
|
||||
# output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename_path)
|
||||
|
||||
|
||||
# return output_filename_path, output_yml_filename_path
|
||||
|
||||
def main_mtable_h5_from_dataframe():
|
||||
|
||||
#import os
|
||||
|
Reference in New Issue
Block a user