From 967be876d1da83fdc77574215b3b8b1807da2286 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Sun, 24 Nov 2024 11:30:08 +0100 Subject: [PATCH] Moved func create_hdf5_file_from_dataframe() from hdf5_lib_part2 into hdf5_write.py --- src/hdf5_lib_part2.py | 93 ------------------------------------------- src/hdf5_writer.py | 76 +++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 93 deletions(-) diff --git a/src/hdf5_lib_part2.py b/src/hdf5_lib_part2.py index 29c528c..5682e79 100644 --- a/src/hdf5_lib_part2.py +++ b/src/hdf5_lib_part2.py @@ -349,99 +349,6 @@ def save_processed_dataframe_to_hdf5(df, annotator, output_filename): # src_hdf5 h5file.attrs.update(project_level_attributes) transfer_file_dict_to_hdf5(h5file, '/', file_dict) - -def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group_by_funcs : list, extract_attrs_func = None): - - """ Creates an hdf5 file with as many levels as indicated by len(group_by_funcs). - Top level denotes the root group/directory and bottom level denotes measurement level groups. - - Parameters: - input_data (pd.DataFrame) : - group_by_funcs (list of callables or strs) : contains a list of callables or dataframe's column names that will be used - to partition or group files from top to bottom. - - Callables in the list must assign a categorical value to each file in a file list, internally represented as a DataFrame, - and they thus return a pd.Series of categorical values. - - On the other hand, strings in the list refer to the name of categorical columns in the input_data (when this is a DataFrame) - - Returns: - - """ - - # Check whether input_data is a valid file-system path or a DataFrame - is_valid_path = lambda x : os.path.exists(input_data) if isinstance(input_data,str) else False - - if is_valid_path(input_data): - - file_list = os.listdir(input_data) - - # Navigates file-system folders/directories from top to bottom. 
- #for dirpath, dirnames, filenames in os.walk(input_data,topdown=True): - - - #df = pd.DataFrame(file_list,columns=['filename']) - df = utils.augment_with_filetype(df) - - elif isinstance(input_data,pd.DataFrame): - df = input_data.copy() - else: - raise ValueError("input_data must be either a valid file-system path or a dataframe.") - - # Create group columns to form paths - if utils.is_callable_list(group_by_funcs): - grouping_cols = [] - for i, func in enumerate(group_by_funcs): - grouping_cols.append('level_'+str(i)+'_groups') - df['level_'+str(i)+'_groups'] = func(df) - elif utils.is_str_list(group_by_funcs) and all([item in df.columns for item in group_by_funcs]): - grouping_cols = group_by_funcs - else: - raise ValueError("'group_by_funcs' must be a list of callables (or str) that takes input_data as input an returns a valid categorical output.") - - # Concatenate group columns to form paths - df['group_path'] = df[grouping_cols].apply(lambda row: '/'.join(row.values.astype(str)), axis=1) - - if approach == 'botton-up': - # TODO: implement botton-up approach - if is_nested_hierarchy(df.loc[:,grouping_cols]): - print('Do something') - else: - raise ValueError("group_by_funcs do not define a valid group hierarchy. Please reprocess the input_data or choose different grouping functions.") - - elif approach == 'top-down': - # Check the length of group_by_funcs list is at most 2 - #if len(group_by_funcs) > 2: - # # TODO: extend to more than 2 callable elements. 
def create_hdf5_file_from_dataframe(ofilename, input_data, group_by_funcs: list, approach: str = None, extract_attrs_func=None):
    """
    Create an HDF5 file whose group hierarchy is defined by grouping columns.

    Parameters
    ----------
    ofilename : str
        Path of the HDF5 file to create (opened in 'w' mode, so an existing
        file at this path is overwritten).
    input_data : pd.DataFrame or str
        Either a DataFrame describing the items to organize, or a valid
        file-system path whose directory listing is turned into a DataFrame
        with a 'filename' column.
    group_by_funcs : list
        List of callables (each mapping the DataFrame to a categorical
        pd.Series) or of existing column names; element i defines level i of
        the group hierarchy, top to bottom.
    approach : str, optional
        Reserved for selecting a 'top-down'/'bottom-up' construction
        strategy. NOTE(review): currently accepted for backward
        compatibility but NOT used by the implementation.
    extract_attrs_func : callable, optional
        Given the per-group sub-DataFrame, returns a dict of attributes to
        attach to the corresponding HDF5 group.

    Returns
    -------
    str
        The path of the created HDF5 file (``ofilename``).

    Raises
    ------
    ValueError
        If ``input_data`` is neither a DataFrame nor an existing path, or if
        ``group_by_funcs`` is neither a list of callables nor a list of
        column names present in the DataFrame.
    """
    # Normalize input_data into a DataFrame.
    if isinstance(input_data, str) and os.path.exists(input_data):
        # Path input: one row per directory entry.
        df = pd.DataFrame(os.listdir(input_data), columns=['filename'])
        df = utils.augment_with_filetype(df)  # add filetype information
    elif isinstance(input_data, pd.DataFrame):
        df = input_data.copy()  # avoid mutating the caller's frame
    else:
        raise ValueError("input_data must be either a valid file-system path or a DataFrame.")

    # Build one categorical column per hierarchy level.
    if utils.is_callable_list(group_by_funcs):
        grouping_cols = []
        for level, func in enumerate(group_by_funcs):
            col_name = f'level_{level}_groups'
            grouping_cols.append(col_name)
            df[col_name] = func(df)
    elif utils.is_str_list(group_by_funcs) and all(col in df.columns for col in group_by_funcs):
        # Copy so later list operations never alias the caller's list.
        grouping_cols = list(group_by_funcs)
    else:
        raise ValueError("'group_by_funcs' must be a list of callables or valid column names in the DataFrame.")

    # Concatenate the level values into an absolute HDF5 group path per row.
    df['group_path'] = ['/' + '/'.join(row) for row in df[grouping_cols].values.astype(str)]

    with h5py.File(ofilename, 'w') as h5file:
        for group_path in df['group_path'].unique():
            group = h5file.create_group(group_path)

            # Rows belonging to this group, minus the bookkeeping columns.
            datatable = df[df['group_path'] == group_path].copy()
            datatable = datatable.drop(columns=grouping_cols + ['group_path'])

            # Store the remaining columns (if any) as a structured-array dataset.
            if not datatable.empty:
                dataset = utils.convert_dataframe_to_np_structured_array(datatable)
                group.create_dataset(name='data_table', data=dataset)

            # Optionally attach caller-derived attributes to the group.
            if extract_attrs_func is not None:
                for key, value in extract_attrs_func(datatable).items():
                    group.attrs[key] = value

        # Depth of the hierarchy = number of levels below the root group.
        h5file.attrs.create(name='depth', data=len(grouping_cols) - 1)

    print(f"HDF5 file created successfully at {ofilename}")

    return ofilename