Cleaned up code and modified create_hdf5_file_from_dataframe to create the group hierarchy implicitly from paths rather than recursively.
This commit is contained in:
@ -367,8 +367,7 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure)
|
Creates an .h5 file with name "output_filename" that preserves the directory tree (or folder structure)
|
||||||
of a given filesystem path. When file and directory keywords are non-empty, the keywords enable filtering
|
of a given filesystem path.
|
||||||
of directory paths and file paths that do not contain the specified keywords.
|
|
||||||
|
|
||||||
The data integration capabilities are limited by our file reader, which can only access data from a list of
|
The data integration capabilities are limited by our file reader, which can only access data from a list of
|
||||||
admissible file formats. These, however, can be extended. Directories are groups in the resulting HDF5 file.
|
admissible file formats. These, however, can be extended. Directories are groups in the resulting HDF5 file.
|
||||||
@ -557,7 +556,7 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
|
|||||||
Top level denotes the root group/directory and bottom level denotes measurement level groups.
|
Top level denotes the root group/directory and bottom level denotes measurement level groups.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
input_data (pd.DataFrame | file-system path) :
|
input_data (pd.DataFrame) :
|
||||||
group_by_funcs (list of callables or strs) : contains a list of callables or dataframe's column names that will be used
|
group_by_funcs (list of callables or strs) : contains a list of callables or dataframe's column names that will be used
|
||||||
to partition or group files from top to bottom.
|
to partition or group files from top to bottom.
|
||||||
|
|
||||||
@ -589,17 +588,20 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
|
|||||||
else:
|
else:
|
||||||
raise ValueError("input_data must be either a valid file-system path or a dataframe.")
|
raise ValueError("input_data must be either a valid file-system path or a dataframe.")
|
||||||
|
|
||||||
#
|
# Create group columns to form paths
|
||||||
if utils.is_callable_list(group_by_funcs):
|
if utils.is_callable_list(group_by_funcs):
|
||||||
grouping_cols = []
|
grouping_cols = []
|
||||||
for i, func in enumerate(group_by_funcs):
|
for i, func in enumerate(group_by_funcs):
|
||||||
grouping_cols.append('level_'+str(i)+'_groups')
|
grouping_cols.append('level_'+str(i)+'_groups')
|
||||||
df['level_'+str(i)+'_groups'] = func(df)
|
df['level_'+str(i)+'_groups'] = func(df)
|
||||||
elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
|
elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
|
||||||
grouping_cols = group_by_funcs
|
grouping_cols = group_by_funcs
|
||||||
else:
|
else:
|
||||||
raise ValueError("'group_by_funcs' must be a list of callables (or str) that takes input_data as input an returns a valid categorical output.")
|
raise ValueError("'group_by_funcs' must be a list of callables (or str) that takes input_data as input an returns a valid categorical output.")
|
||||||
|
|
||||||
|
# Concatenate group columns to form paths
|
||||||
|
df['group_path'] = df[grouping_cols].apply(lambda row: '/'.join(row.values.astype(str)), axis=1)
|
||||||
|
|
||||||
if approach == 'botton-up':
|
if approach == 'botton-up':
|
||||||
# TODO: implement botton-up approach
|
# TODO: implement botton-up approach
|
||||||
if is_nested_hierarchy(df.loc[:,grouping_cols]):
|
if is_nested_hierarchy(df.loc[:,grouping_cols]):
|
||||||
@ -614,29 +616,18 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
|
|||||||
# raise ValueError("group_by_funcs can only contain at most two grouping elements.")
|
# raise ValueError("group_by_funcs can only contain at most two grouping elements.")
|
||||||
|
|
||||||
with h5py.File(ofilename, 'w') as file:
|
with h5py.File(ofilename, 'w') as file:
|
||||||
|
|
||||||
|
# Create groups based on concatenated paths
|
||||||
|
for path in df['group_path'].unique():
|
||||||
|
file.create_group(path)
|
||||||
|
# TODO: incorporate remaining cols (i.e., excluding the group columns) as either metadata or datasets
|
||||||
|
|
||||||
create_group_hierarchy(file, df, grouping_cols)
|
#create_group_hierarchy(file, df, grouping_cols)
|
||||||
|
|
||||||
file.attrs.create(name='depth', data=len(grouping_cols)-1)
|
file.attrs.create(name='depth', data=len(grouping_cols)-1)
|
||||||
|
|
||||||
#join_path = lambda x,y: '/' + x + '/' + y
|
|
||||||
#for group_name in df[grouping_cols[0]].unique():
|
|
||||||
# group_filter = df[grouping_cols[0]]==group_name
|
|
||||||
# for subgroup_name in df.loc[group_filter,grouping_cols[1]].unique():
|
|
||||||
# # Create group subgroup folder structure implicitly.
|
|
||||||
# # Explicitly, grp = f.create_group(group_name), subgrp = grp.create_group(subgroup_name)
|
|
||||||
# print(join_path(group_name,subgroup_name))
|
|
||||||
# f.create_group(join_path(group_name,subgroup_name))
|
|
||||||
|
|
||||||
# Get groups at the bottom of the hierarchy
|
|
||||||
#bottom_level_groups = get_groups_at_a_level(file, file.attrs['depth'])
|
|
||||||
|
|
||||||
#nodes, parents, values = get_parent_child_relationships(file)
|
|
||||||
print(':)')
|
print(':)')
|
||||||
#fig = px.treemap(values=values,names=nodes, parents= parents)
|
|
||||||
#fig.update_traces(root_color="lightgrey")
|
|
||||||
#fig.update_layout(width = 800, height=600, margin = dict(t=50, l=25, r=25, b=25))
|
|
||||||
#fig.show()
|
|
||||||
else:
|
else:
|
||||||
raise ValueError("'approach' must take values in ['top-down','bottom-up']")
|
raise ValueError("'approach' must take values in ['top-down','bottom-up']")
|
||||||
|
|
||||||
@ -651,23 +642,6 @@ def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group
|
|||||||
|
|
||||||
#return 0
|
#return 0
|
||||||
|
|
||||||
|
|
||||||
#def main():
|
|
||||||
|
|
||||||
# inputfile_dir = config_file.inputfile_dir #'\\\\fs03\\Iron_Sulphate'
|
|
||||||
# select_dir_keywords = config_file.select_dir_keywords #['gas','smps\\20220726','htof\\2022.07.26','ptr\\2022.07.26','ams\\2022.07.26']
|
|
||||||
# select_file_keywords = config_file.select_file_keywords #['20220726','2022.07.26']
|
|
||||||
|
|
||||||
# output_filename_path = os.path.join(config_file.outputfile_dir,config_file.output_filename)
|
|
||||||
# if not os.path.exists(output_filename_path):
|
|
||||||
# create_hdf5_file_from_filesystem_path(output_filename_path,inputfile_dir,select_dir_keywords,select_file_keywords)
|
|
||||||
|
|
||||||
# # hdf5_vis.display_group_hierarchy_on_a_treemap(output_filename_path)
|
|
||||||
# output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename_path)
|
|
||||||
|
|
||||||
|
|
||||||
# return output_filename_path, output_yml_filename_path
|
|
||||||
|
|
||||||
def main_mtable_h5_from_dataframe():
|
def main_mtable_h5_from_dataframe():
|
||||||
|
|
||||||
#import os
|
#import os
|
||||||
|
Reference in New Issue
Block a user