From e7ed6145f0286ff0ecd72ece1228b5a67ee4fc21 Mon Sep 17 00:00:00 2001
From: Florez Ospina Juan Felipe
Date: Tue, 11 Jun 2024 10:38:04 +0200
Subject: [PATCH] Implemented a data extraction module to access data from an
 hdf5 file in the form of dataframes.

---
 src/hdf5_data_extraction.py | 55 +++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/src/hdf5_data_extraction.py b/src/hdf5_data_extraction.py
index 16d76fc..eef6d9e 100644
--- a/src/hdf5_data_extraction.py
+++ b/src/hdf5_data_extraction.py
@@ -2,45 +2,46 @@ import h5py
 import pandas as pd
 import numpy as np
 import os
+import src.hdf5_vis as hdf5_vis
 
-def read_dataset_as_dataframe(hdf, dataset_name):
-    dataset = hdf[dataset_name]
-    data = np.empty(dataset.shape, dtype=dataset.dtype)
-    dataset.read_direct(data)
-    df = pd.DataFrame(data)
-
-    if 'categorical' in dataset_name:
-        # Assuming all entries are byte strings encoded with utf-8
-        for column_name in df.columns:
-            df[column_name] = df[column_name].str.decode('utf-8')
-        # Assuming there's a 'timestamps' column that needs to be converted to datetime
-        if 'timestamps' in df.columns:
-            df['timestamps'] = pd.to_datetime(df['timestamps'],yearfirst=True)
-
-    elif 'numerical' in dataset_name:
-        df = df.apply(pd.to_numeric)
-
+def read_dataset_from_hdf5file(hdf5_file_path, dataset_path):
+    # Open the HDF5 file
+    with h5py.File(hdf5_file_path, 'r') as hdf:
+        # Load the dataset
+        dataset = hdf[dataset_path]
+        data = np.empty(dataset.shape, dtype=dataset.dtype)
+        dataset.read_direct(data)
+        df = pd.DataFrame(data)
+
+        for col_name in df.select_dtypes(exclude='number'):
+            df[col_name] = df[col_name].str.decode('utf-8') #apply(lambda x: x.decode('utf-8') if isinstance(x,bytes) else x)
+    ## Extract metadata (attributes) and convert to a dictionary
+    #metadata = hdf5_vis.construct_attributes_dict(hdf[dataset_name].attrs)
+    ## Create a one-row DataFrame with the metadata
+    #metadata_df = pd.DataFrame.from_dict(data, orient='columns')
     return df
 
-def extract_filename(dataset_name_path):
+def read_metadata_from_hdf5obj(hdf5_file_path, obj_path):
+    # TODO: Complete this function
+    metadata_df = pd.DataFrame.empty()
+    return metadata_df
 
-    tmp = dataset_name_path.split('/')
-
-    return tmp[len(tmp)-2]
-
-def list_datasets(hdf5_filename_path):
+def list_datasets_in_hdf5file(hdf5_file_path):
 
     def get_datasets(name, obj, list_of_datasets):
         if isinstance(obj,h5py.Dataset):
             list_of_datasets.append(name)
-            head, tail = os.path.split(name)
-            print(f'Adding dataset: tail: {head} head: {tail}')
+            #print(f'Adding dataset: {name}') #tail: {head} head: {tail}')
 
-    with h5py.File(hdf5_filename_path,'r') as file:
+    with h5py.File(hdf5_file_path,'r') as file:
         list_of_datasets = []
         file.visititems(lambda name, obj: get_datasets(name, obj, list_of_datasets))
+        dataset_df = pd.DataFrame({'dataset_name':list_of_datasets})
 
-    return list_of_datasets
+    dataset_df['parent_instrument'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-3])
+    dataset_df['parent_file'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-2])
+
+    return dataset_df
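
Example usage of the new module (a minimal sketch, not part of the patch: the
HDF5 file path below is a placeholder, and dataset paths are assumed to be
nested at least three levels deep, e.g. <instrument>/<file>/<dataset>, so that
the parent_instrument and parent_file columns can be derived):

    from src.hdf5_data_extraction import (
        list_datasets_in_hdf5file,
        read_dataset_from_hdf5file,
    )

    hdf5_file_path = 'output_files/example.h5'  # placeholder path, replace with a real file

    # Enumerate all datasets together with the group hierarchy they belong to
    dataset_df = list_datasets_in_hdf5file(hdf5_file_path)
    print(dataset_df[['parent_instrument', 'parent_file', 'dataset_name']].head())

    # Load the first dataset as a DataFrame; non-numeric (byte-string) columns
    # are decoded to UTF-8 inside read_dataset_from_hdf5file
    first_dataset = dataset_df['dataset_name'].iloc[0]
    df = read_dataset_from_hdf5file(hdf5_file_path, first_dataset)
    print(df.dtypes)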