Developed a class to manage data operations on a given hdf5 file

This commit is contained in:
2024-08-09 13:23:54 +02:00
parent 8f7f14ab68
commit 5fe7fc4b70

View File

@ -3,6 +3,66 @@ import pandas as pd
import numpy as np
import os
import src.hdf5_vis as hdf5_vis
import logging
class HDF5DataOpsManager():
    """Manage data operations on one HDF5 file through a single h5py handle.

    The file is opened when the manager is constructed and the handle is kept
    in ``file_obj`` until ``close_file`` is called. The public read methods
    reopen the file on demand if it has been closed in the meantime.
    """

    # Only modes that operate on an already existing file are accepted.
    _SUPPORTED_MODES = ('r', 'r+')

    def __init__(self, file_path, mode = 'r+') -> None:
        """Store the file location and open the file.

        Parameters
        ----------
        file_path : path of the HDF5 file to operate on.
        mode : 'r' or 'r+', forwarded to ``h5py.File``.

        Raises
        ------
        ValueError
            If ``mode`` is not one of the supported modes.
        """
        # Fail early and explicitly: silently skipping the assignment would
        # only surface later as a confusing AttributeError on ``self.mode``.
        if mode not in self._SUPPORTED_MODES:
            raise ValueError(
                f"Unsupported mode '{mode}'. Expected one of {list(self._SUPPORTED_MODES)}."
            )
        self.mode = mode
        self.file_path = file_path
        self.file_obj = None
        self._open_file()

    # Define private methods

    def _open_file(self):
        """Open the HDF5 file unless a handle is already held."""
        if self.file_obj is None:
            self.file_obj = h5py.File(self.file_path, self.mode)

    def _collect_dataset_names(self, name, obj, list_of_datasets):
        """Append ``name`` to ``list_of_datasets`` when ``obj`` is a dataset."""
        if isinstance(obj, h5py.Dataset):
            list_of_datasets.append(name)

    @staticmethod
    def _ancestor_name(dataset_name, levels_up):
        """Return the path component ``levels_up`` above the dataset, or None.

        ``levels_up=1`` is the direct parent group, ``levels_up=2`` its parent.
        None is returned when the path is too shallow to have that ancestor.
        """
        parts = dataset_name.split('/')
        index = levels_up + 1
        return parts[-index] if len(parts) >= index else None

    # Define public methods

    def close_file(self):
        """Flush and close the file handle, then forget it.

        Calling this method when no file is open is a no-op.
        """
        if self.file_obj is None:
            return
        try:
            # A handle that was already closed elsewhere evaluates to False;
            # flushing or closing it again is skipped.
            if self.file_obj:
                self.file_obj.flush()  # Ensure all data is written to disk
                self.file_obj.close()
        finally:
            # Always drop the reference so that _open_file can reopen later.
            self.file_obj = None

    def retrieve_dataframe_of_dataset_names(self):
        """Return a dataframe listing every dataset found in the file.

        Returns
        -------
        pandas.DataFrame
            Columns ``dataset_name`` (path reported by ``visititems``),
            ``parent_file`` (direct parent group name) and
            ``parent_instrument`` (grandparent group name). An ancestor that
            does not exist for a shallow dataset path is reported as None.
        """
        self._open_file()

        list_of_datasets = []

        def _visit(name, obj):
            # Must return None so that visititems keeps traversing.
            self._collect_dataset_names(name, obj, list_of_datasets)

        self.file_obj.visititems(_visit)

        dataset_df = pd.DataFrame({'dataset_name': list_of_datasets})
        dataset_df['parent_instrument'] = dataset_df['dataset_name'].apply(
            lambda x: self._ancestor_name(x, 2))
        dataset_df['parent_file'] = dataset_df['dataset_name'].apply(
            lambda x: self._ancestor_name(x, 1))
        return dataset_df

    def read_dataset_as_dataframe(self,dataset_name):
        """
        returns a copy of the dataset content in the form of dataframe when possible or numpy array

        Parameters
        ----------
        dataset_name : path of the dataset inside the file.

        Returns
        -------
        pandas.DataFrame, or the raw array read from the dataset when pandas
        rejects it with a ValueError (the failure is logged).
        """
        # Reopen the file if it was closed; no-op when a handle is held.
        self._open_file()

        dataset_obj = self.file_obj[dataset_name]
        # Read dataset content from dataset obj
        data = dataset_obj[...]
        # The above statement can be understood as follows:
        # data = np.empty(shape=dataset_obj.shape,
        #                 dtype=dataset_obj.dtype)
        # dataset_obj.read_direct(data)

        try:
            return pd.DataFrame(data)
        except ValueError as exp:
            logging.error(f"Failed to convert dataset '{dataset_name}' to DataFrame: {exp}")
            return data  # 'data' is a NumPy array here
def read_dataset_from_hdf5file(hdf5_file_path, dataset_path):