Moved function create_hdf5_file_from_dataframe() from hdf5_lib_part2.py into hdf5_write.py
This commit is contained in:
@ -349,99 +349,6 @@ def save_processed_dataframe_to_hdf5(df, annotator, output_filename): # src_hdf5
|
|||||||
h5file.attrs.update(project_level_attributes)
|
h5file.attrs.update(project_level_attributes)
|
||||||
transfer_file_dict_to_hdf5(h5file, '/', file_dict)
|
transfer_file_dict_to_hdf5(h5file, '/', file_dict)
|
||||||
|
|
||||||
|
|
||||||
def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group_by_funcs : list, extract_attrs_func = None):
    """Create an HDF5 file with as many group levels as indicated by len(group_by_funcs).

    The top level denotes the root group/directory and the bottom level denotes
    measurement-level groups.

    Parameters:
        ofilename (str): path of the output HDF5 file.
        input_data (pd.DataFrame or str): a DataFrame of file records, or a valid
            file-system path whose directory listing is turned into a DataFrame.
        approach (str): either 'top-down' or 'bottom-up'. Only 'top-down' is
            implemented; 'bottom-up' is a stub (TODO).
        group_by_funcs (list of callables or strs): used to partition or group files
            from top to bottom. Callables must assign a categorical value to each
            file in the file list (the DataFrame), i.e. return a pd.Series of
            categorical values. Strings must name categorical columns of input_data.
        extract_attrs_func (callable, optional): currently unused placeholder.

    Returns:
        None. The HDF5 file is written to `ofilename` as a side effect.

    Raises:
        ValueError: if input_data is neither a DataFrame nor an existing path,
            if group_by_funcs is not a valid list of callables/column names,
            or if approach is not in ['top-down', 'bottom-up'].
    """

    # Check whether input_data is a valid file-system path or a DataFrame.
    # BUG FIX: the lambda previously tested `input_data` instead of its own
    # argument `x`; it now tests the value it is given.
    is_valid_path = lambda x : os.path.exists(x) if isinstance(x,str) else False

    if is_valid_path(input_data):
        # Build the file table from the directory listing.
        file_list = os.listdir(input_data)
        # BUG FIX: this construction was commented out, so `df` was referenced
        # before assignment below (UnboundLocalError at runtime).
        df = pd.DataFrame(file_list, columns=['filename'])
        df = utils.augment_with_filetype(df)
    elif isinstance(input_data,pd.DataFrame):
        # Work on a copy so the caller's DataFrame is not mutated.
        df = input_data.copy()
    else:
        raise ValueError("input_data must be either a valid file-system path or a dataframe.")

    # Create group columns to form paths.
    if utils.is_callable_list(group_by_funcs):
        grouping_cols = []
        for i, func in enumerate(group_by_funcs):
            grouping_cols.append('level_'+str(i)+'_groups')
            df['level_'+str(i)+'_groups'] = func(df)
    elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
        grouping_cols = group_by_funcs
    else:
        raise ValueError("'group_by_funcs' must be a list of callables (or str) that takes input_data as input an returns a valid categorical output.")

    # Concatenate group columns to form one '/'-separated path per row.
    df['group_path'] = df[grouping_cols].apply(lambda row: '/'.join(row.values.astype(str)), axis=1)

    # BUG FIX: the comparison previously used the misspelling 'botton-up', so the
    # documented value 'bottom-up' fell through to the error branch.
    if approach == 'bottom-up':
        # TODO: implement bottom-up approach.
        # NOTE(review): is_nested_hierarchy is not defined in this module —
        # confirm where it is expected to come from before using this branch.
        if is_nested_hierarchy(df.loc[:,grouping_cols]):
            print('Do something')
        else:
            raise ValueError("group_by_funcs do not define a valid group hierarchy. Please reprocess the input_data or choose different grouping functions.")

    elif approach == 'top-down':
        # Create one HDF5 group per unique concatenated path.
        with h5py.File(ofilename, 'w') as file:
            for path in df['group_path'].unique():
                file.create_group(path)
                # TODO: incorporate remaining cols (i.e., excluding the group
                # columns) as either metadata or datasets.

            # Depth of the hierarchy below the root group.
            file.attrs.create(name='depth', data=len(grouping_cols)-1)

        print(':)')

    else:
        raise ValueError("'approach' must take values in ['top-down','bottom-up']")
|
|
||||||
|
|
||||||
def main_mtable_h5_from_dataframe():
|
def main_mtable_h5_from_dataframe():
|
||||||
|
|
||||||
#import os
|
#import os
|
||||||
|
@ -249,6 +249,82 @@ def create_hdf5_file_from_filesystem_path(path_to_input_directory: str,
|
|||||||
|
|
||||||
return path_to_output_file #, output_yml_filename_path
|
return path_to_output_file #, output_yml_filename_path
|
||||||
|
|
||||||
|
def create_hdf5_file_from_dataframe(ofilename, input_data, group_by_funcs: list, approach: str = None, extract_attrs_func=None):
    """
    Creates an HDF5 file with hierarchical groups based on the specified grouping functions or columns.

    Parameters:
    -----------
    ofilename (str): Path for the output HDF5 file.
    input_data (pd.DataFrame or str): Input data as a DataFrame or a valid file system path.
    group_by_funcs (list): List of callables or column names to define hierarchical grouping.
        Callables must accept the DataFrame and return one categorical value per row
        (e.g. a pd.Series); strings must name existing columns of the DataFrame.
    approach (str): Specifies the approach ('top-down' or 'bottom-up') for creating the HDF5 file.
        NOTE(review): this parameter is currently never read by the function body —
        confirm whether validation/branching on it was intended.
    extract_attrs_func (callable, optional): Function to extract additional attributes for HDF5 groups.
        Called with each group's sub-DataFrame; must return a dict of attribute name -> value.

    Returns:
    --------
    str: the path of the created HDF5 file (`ofilename`).

    Raises:
    -------
    ValueError: if input_data is neither a DataFrame nor an existing path, or if
        group_by_funcs is not a list of callables / valid column names.
    """
    # Check whether input_data is a valid file-system path or a DataFrame
    is_valid_path = lambda x: os.path.exists(x) if isinstance(x, str) else False

    if is_valid_path(input_data):
        # If input_data is a file-system path, create a DataFrame with file info
        # (one row per directory entry; non-recursive listing).
        file_list = os.listdir(input_data)
        df = pd.DataFrame(file_list, columns=['filename'])
        df = utils.augment_with_filetype(df)  # Add filetype information if needed
    elif isinstance(input_data, pd.DataFrame):
        # If input_data is a DataFrame, make a copy so the caller's frame is not mutated
        df = input_data.copy()
    else:
        raise ValueError("input_data must be either a valid file-system path or a DataFrame.")

    # Generate grouping columns based on group_by_funcs
    if utils.is_callable_list(group_by_funcs):
        # Each callable produces one hierarchy level; columns are named level_<i>_groups
        grouping_cols = []
        for i, func in enumerate(group_by_funcs):
            col_name = f'level_{i}_groups'
            grouping_cols.append(col_name)
            df[col_name] = func(df)
    elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]):
        # Existing columns are used directly as hierarchy levels
        grouping_cols = group_by_funcs
    else:
        raise ValueError("'group_by_funcs' must be a list of callables or valid column names in the DataFrame.")

    # Generate group paths: one '/'-separated absolute path per row,
    # ordered from the top hierarchy level to the bottom
    df['group_path'] = ['/' + '/'.join(row) for row in df[grouping_cols].values.astype(str)]

    # Open the HDF5 file in write mode (truncates any existing file at ofilename)
    with h5py.File(ofilename, 'w') as file:
        for group_path in df['group_path'].unique():
            # Create groups in HDF5
            group = file.create_group(group_path)

            # Filter the DataFrame for the current group
            datatable = df[df['group_path'] == group_path].copy()

            # Drop grouping columns and the generated 'group_path';
            # only the remaining payload columns are stored in the dataset
            datatable = datatable.drop(columns=grouping_cols + ['group_path'])

            # Add datasets to groups if data exists
            if not datatable.empty:
                dataset = utils.convert_dataframe_to_np_structured_array(datatable)
                group.create_dataset(name='data_table', data=dataset)

            # Add attributes if extract_attrs_func is provided
            # (called even when datatable is empty — confirm that is intended)
            if extract_attrs_func:
                attrs = extract_attrs_func(datatable)
                for key, value in attrs.items():
                    group.attrs[key] = value

        # Save metadata about depth of hierarchy (levels below the root group)
        file.attrs.create(name='depth', data=len(grouping_cols) - 1)

    print(f"HDF5 file created successfully at {ofilename}")

    return ofilename
|
||||||
|
|
||||||
|
|
||||||
def save_processed_dataframe_to_hdf5(df, annotator, output_filename): # src_hdf5_path, script_date, script_name):
|
def save_processed_dataframe_to_hdf5(df, annotator, output_filename): # src_hdf5_path, script_date, script_name):
|
||||||
"""
|
"""
|
||||||
Save processed dataframe columns with annotations to an HDF5 file.
|
Save processed dataframe columns with annotations to an HDF5 file.
|
||||||
|
Reference in New Issue
Block a user