class HDF5DataOpsManager():
    """Hold an open ``h5py.File`` handle and provide dataset-level read helpers.

    The file is opened on construction and stays open until ``close_file`` is
    called (or the instance is used as a context manager and the ``with``
    block exits). Methods that need the handle reopen it when it has been
    closed.

    Parameters
    ----------
    file_path :
        Path of the HDF5 file, passed unchanged to ``h5py.File``.
    mode :
        File access mode. Only ``'r'`` and ``'r+'`` are accepted.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the accepted modes.
    """

    # Modes accepted by the constructor; anything else is rejected up front.
    _SUPPORTED_MODES = ('r', 'r+')

    def __init__(self, file_path, mode='r+') -> None:
        # Fail early: previously an unsupported mode left the instance without
        # its attributes and the error only surfaced on first use.
        if mode not in self._SUPPORTED_MODES:
            raise ValueError(
                f"Unsupported mode '{mode}'. Expected one of {list(self._SUPPORTED_MODES)}."
            )

        self.mode = mode
        self.file_path = file_path
        self.file_obj = None
        self._open_file()

    # Context-manager support so callers can write
    # ``with HDF5DataOpsManager(path) as manager: ...``

    def __enter__(self):
        """Return the manager itself, reopening the file if it was closed."""
        self._open_file()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file on leaving the ``with`` block; exceptions propagate."""
        self.close_file()
        return False

    # Define private methods

    def _open_file(self):
        """Open the HDF5 file unless a handle is already held."""
        if self.file_obj is None:
            self.file_obj = h5py.File(self.file_path, self.mode)

    def _collect_dataset_names(self, name, obj, list_of_datasets):
        """Append ``name`` to ``list_of_datasets`` when ``obj`` is a dataset.

        Used as the ``visititems`` callback. It implicitly returns ``None``.
        """
        if isinstance(obj, h5py.Dataset):
            list_of_datasets.append(name)

    @staticmethod
    def _path_component(dataset_name, index):
        """Return the ``index``-th '/'-separated part of ``dataset_name``.

        Returns ``None`` instead of raising when the path has too few parts,
        so shallow datasets do not abort the whole listing.
        """
        parts = dataset_name.split('/')
        try:
            return parts[index]
        except IndexError:
            return None

    # Define public methods

    def close_file(self):
        """Flush and close the file handle, if one is open."""
        if self.file_obj:
            self.file_obj.flush()  # Ensure all data is written to disk
            self.file_obj.close()
            self.file_obj = None

    def retrieve_dataframe_of_dataset_names(self):
        """Return a DataFrame listing every dataset found in the file.

        Columns
        -------
        dataset_name :
            Name of the dataset as reported by ``visititems``.
        parent_instrument :
            Third-from-last '/'-separated component of ``dataset_name``,
            or ``None`` when the path is too short.
        parent_file :
            Second-from-last '/'-separated component of ``dataset_name``,
            or ``None`` when the path is too short.
        """
        # Reopen on demand, consistent with read_dataset_as_dataframe.
        if self.file_obj is None:
            self._open_file()

        list_of_datasets = []
        self.file_obj.visititems(
            lambda name, obj: self._collect_dataset_names(name, obj, list_of_datasets)
        )

        dataset_df = pd.DataFrame({'dataset_name': list_of_datasets})
        dataset_df['parent_instrument'] = dataset_df['dataset_name'].apply(
            lambda x: self._path_component(x, -3)
        )
        dataset_df['parent_file'] = dataset_df['dataset_name'].apply(
            lambda x: self._path_component(x, -2)
        )

        return dataset_df

    def read_dataset_as_dataframe(self, dataset_name):
        """Return a copy of a dataset's content as a DataFrame when possible.

        Parameters
        ----------
        dataset_name :
            Key used to look the dataset up in the open file.

        Returns
        -------
        pandas.DataFrame or numpy.ndarray
            A DataFrame built from the dataset content. If pandas rejects the
            data with ``ValueError``, the error is logged and the raw array
            is returned instead.
        """
        if self.file_obj is None:
            # Fixed: the method is named `_open_file`; `open_file` never existed.
            self._open_file()

        # Fixed: the file handle is indexed, not called.
        dataset_obj = self.file_obj[dataset_name]

        # Read dataset content from dataset obj.
        # The statement below can be understood as follows:
        #   data = np.empty(shape=dataset_obj.shape, dtype=dataset_obj.dtype)
        #   dataset_obj.read_direct(data)
        data = dataset_obj[...]

        try:
            return pd.DataFrame(data)
        except ValueError as exp:
            logging.error(f"Failed to convert dataset '{dataset_name}' to DataFrame: {exp}")
            return data  # 'data' is a NumPy array here