Developed a class to manage data operations on a given hdf5 file
This commit is contained in:
@ -3,6 +3,66 @@ import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import src.hdf5_vis as hdf5_vis
|
||||
import logging
|
||||
|
||||
class HDF5DataOpsManager():
    """Manage data operations on a single HDF5 file.

    The file is opened when the manager is constructed and the handle is kept
    in ``self.file_obj`` until :meth:`close_file` is called. Methods that read
    from the file reopen it on demand if it has been closed in the meantime.

    The manager can also be used as a context manager, in which case the file
    is closed when the ``with`` block exits.
    """

    # Access modes accepted by the constructor.
    _SUPPORTED_MODES = ('r', 'r+')

    def __init__(self, file_path, mode='r+') -> None:
        """Store the file location and access mode, then open the file.

        Parameters
        ----------
        file_path
            Path of the HDF5 file to operate on.
        mode
            Access mode, either ``'r'`` or ``'r+'``.

        Raises
        ------
        ValueError
            If ``mode`` is not one of the supported modes.
        """
        # Fail early with a clear message instead of leaving ``self.mode``
        # undefined and crashing later inside ``_open_file``.
        if mode not in self._SUPPORTED_MODES:
            raise ValueError(
                f"Unsupported mode '{mode}'. Expected one of {list(self._SUPPORTED_MODES)}."
            )

        self.mode = mode
        self.file_path = file_path
        self.file_obj = None
        self._open_file()

    def __enter__(self):
        """Return the manager itself, making sure the file is open."""
        self._open_file()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file on leaving the ``with`` block; exceptions propagate."""
        self.close_file()

    # Define private methods

    def _open_file(self):
        """Open the HDF5 file unless a handle is already being held."""
        if self.file_obj is None:
            self.file_obj = h5py.File(self.file_path, self.mode)

    def _collect_dataset_names(self, name, obj, list_of_datasets):
        """Append ``name`` to ``list_of_datasets`` when ``obj`` is a dataset.

        Used as the visitor callback while walking the file hierarchy. It
        always returns None; NOTE(review): h5py's ``visititems`` is documented
        to stop early on a non-None return value - confirm before changing.
        """
        if isinstance(obj, h5py.Dataset):
            list_of_datasets.append(name)

    @staticmethod
    def _path_component(dataset_name, position_from_end):
        """Return the path component ``position_from_end`` places from the end.

        Returns None when the '/'-separated path has fewer components than
        requested, so shallow datasets do not raise IndexError.
        """
        parts = dataset_name.split('/')
        if len(parts) < position_from_end:
            return None
        return parts[-position_from_end]

    # Define public methods

    def close_file(self):
        """Flush and close the file, and forget the handle.

        Safe to call more than once. The handle is always reset to None so
        that a later ``_open_file`` call can reopen the file.
        """
        if self.file_obj:
            self.file_obj.flush()  # Ensure all data is written to disk
            self.file_obj.close()
        # Reset even when the handle was falsy (presumably an already-closed
        # h5py file); otherwise ``_open_file`` would never reopen it.
        self.file_obj = None

    def retrieve_dataframe_of_dataset_names(self):
        """Return a DataFrame listing every dataset found in the file.

        Columns
        -------
        dataset_name
            Path of the dataset as reported while visiting the file.
        parent_instrument
            Third-from-last component of ``dataset_name`` split on '/'.
        parent_file
            Second-from-last component of ``dataset_name`` split on '/'.

        For datasets nested less deeply than that, the missing components are
        reported as None.
        """
        # Reopen on demand, consistent with ``read_dataset_as_dataframe``.
        self._open_file()

        list_of_datasets = []
        self.file_obj.visititems(
            lambda name, obj: self._collect_dataset_names(name, obj, list_of_datasets)
        )

        dataset_df = pd.DataFrame({'dataset_name': list_of_datasets})
        dataset_df['parent_instrument'] = dataset_df['dataset_name'].apply(
            lambda x: self._path_component(x, 3)
        )
        dataset_df['parent_file'] = dataset_df['dataset_name'].apply(
            lambda x: self._path_component(x, 2)
        )

        return dataset_df

    def read_dataset_as_dataframe(self, dataset_name):
        """Return a copy of a dataset's content.

        The content is returned as a DataFrame when pandas can build one from
        it; otherwise the error is logged and the NumPy array is returned.

        Parameters
        ----------
        dataset_name
            Key used to look the dataset up in the open file.
        """
        # No-op when the file is already open.
        self._open_file()

        dataset_obj = self.file_obj[dataset_name]

        # Read the whole dataset content from the dataset object. This can be
        # understood as allocating an array with the dataset's shape and dtype
        # and filling it via ``dataset_obj.read_direct``.
        data = dataset_obj[...]

        try:
            return pd.DataFrame(data)
        except ValueError as exp:
            logging.error(f"Failed to convert dataset '{dataset_name}' to DataFrame: {exp}")
            return data  # 'data' is a NumPy array here
|
||||
|
||||
|
||||
|
||||
def read_dataset_from_hdf5file(hdf5_file_path, dataset_path):
|
||||
|
Reference in New Issue
Block a user