Created function to save dataframes with annotations in hdf5 format

This commit is contained in:
2024-06-17 13:36:05 +02:00
parent 0eba80db41
commit ed1641af55

View File

@ -477,6 +477,62 @@ def create_hdf5_file_from_filesystem_path(output_filename : str,
return output_filename #, output_yml_filename_path
import os
#import src.hdf5_lib as h5lib
import src.g5505_utils as utils
import h5py
def save_processed_dataframe_to_hdf5(df, annotator, src_hdf5_path, script_date, script_name):
    """
    Save processed dataframe columns with annotations to an HDF5 file.

    The file is written to
    'data_products/processed/fig_<script_date>_<basename of src_hdf5_path>';
    the directory is created if it does not exist yet.

    Parameters:
        df (pd.DataFrame): DataFrame containing processed time series. It is
            not modified; datetime columns are stringified on a copy.
        annotator (): Annotator object with a get_metadata method. The returned
            dict must provide 'parent_files' and a 'metadata' dict with the
            keys 'sample', 'environment', 'instruments' and 'datasets'.
        src_hdf5_path (str): Path to the source HDF5 file. Only its base name
            is used, to build the output filename.
        script_date (str): Date of the data generation script.
        script_name (str): Name of the data generation script.

    Returns:
        str: Path of the HDF5 file that was written.
    """
    # Convert datetime columns to string. Work on a copy so the caller's
    # dataframe keeps its datetime dtype.
    datetime_cols = df.select_dtypes(include=['datetime64']).columns
    if len(datetime_cols) > 0:
        df = df.copy()
        for col in datetime_cols:
            # Element-wise str() keeps the exact text the former applymap(str)
            # produced, without relying on the deprecated DataFrame.applymap.
            df[col] = df[col].map(str)

    # Convert dataframe to structured array
    icad_data_table = utils.dataframe_to_np_structured_array(df)

    # Get metadata
    metadata_dict = annotator.get_metadata()
    metadata = metadata_dict['metadata']

    # Prepare high-level attributes. On a key collision the later mapping wins
    # (sample < environment < instruments).
    high_level_attributes = {
        'parent_files': metadata_dict['parent_files'],
        **metadata['sample'],
        **metadata['environment'],
        **metadata['instruments'],
    }

    # Prepare data level attributes
    data_level_attributes = metadata['datasets']

    # Generate output filename
    parent_file_name = os.path.basename(src_hdf5_path)
    output_filename = f'data_products/processed/fig_{script_date}_{parent_file_name}'

    # h5py does not create missing parent directories, so do it here.
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)

    # Prepare file dictionary
    file_dict = {
        'name': script_name,
        'attributes_dict': high_level_attributes,
        'datasets': [{
            'name': "data_table",
            'data': icad_data_table,
            'shape': icad_data_table.shape,
            'attributes': data_level_attributes,
        }],
    }

    # Write to HDF5
    with h5py.File(output_filename, 'w') as h5file:
        transfer_file_dict_to_hdf5(h5file, '/', file_dict)

    return output_filename
def create_hdf5_file_from_dataframe(ofilename, input_data, approach : str, group_by_funcs : list, extract_attrs_func = None):