Made the metadata and dataset extraction methods more robust by requiring the file object to be loaded explicitly before use. Renamed a few functions and fixed typos in print statements.

This commit is contained in:
2024-10-10 11:28:23 +02:00
parent 7653e982a4
commit 2a9d69c757

View File

@ -54,29 +54,44 @@ class HDF5DataOpsManager():
self.file_obj.close()
self.file_obj = None
def load_dataset_metadata(self):
def extract_and_load_dataset_metadata(self):
def __get_datasets(name, obj, list_of_datasets):
if isinstance(obj,h5py.Dataset):
list_of_datasets.append(name)
#print(f'Adding dataset: {name}') #tail: {head} head: {tail}')
list_of_datasets = []
with h5py.File(self.file_path,'r') as file:
if self.file_obj is None:
raise RuntimeError("File object is not loaded. Please load the HDF5 file using the 'load_file_obj' method before attempting to extract datasets.")
try:
list_of_datasets = []
file.visititems(lambda name, obj: __get_datasets(name, obj, list_of_datasets))
dataset_metadata_df = pd.DataFrame({'dataset_name': list_of_datasets})
dataset_metadata_df['parent_instrument'] = dataset_metadata_df['dataset_name'].apply(lambda x: x.split('/')[-3])
dataset_metadata_df['parent_file'] = dataset_metadata_df['dataset_name'].apply(lambda x: x.split('/')[-2])
self.file_obj.visititems(lambda name, obj: __get_datasets(name, obj, list_of_datasets))
dataset_metadata_df = pd.DataFrame({'dataset_name': list_of_datasets})
dataset_metadata_df['parent_instrument'] = dataset_metadata_df['dataset_name'].apply(lambda x: x.split('/')[-3])
dataset_metadata_df['parent_file'] = dataset_metadata_df['dataset_name'].apply(lambda x: x.split('/')[-2])
self.dataset_metadata_df = dataset_metadata_df
except Exception as e:
self.unload_file_obj()
print(f"An unexpected error occurred: {e}. File object will be unloaded.")
self.dataset_metadata_df = dataset_metadata_df
def read_dataset_as_dataframe(self,dataset_name):
def extract_dataset_as_dataframe(self,dataset_name):
"""
returns a copy of the dataset content in the form of dataframe when possible or numpy array
"""
if self.file_obj is None:
self.load_file_obj()
raise RuntimeError("File object is not loaded. Please load the HDF5 file using the 'load_file_obj' method before attempting to extract datasets.")
dataset_obj = self.file_obj[dataset_name]
# Read dataset content from dataset obj
@ -88,9 +103,13 @@ class HDF5DataOpsManager():
try:
return pd.DataFrame(data)
except ValueError as exp:
logging.error(f"Failed to convert dataset '{dataset_name}' to DataFrame: {exp}. Instead, dataset will be returned as Numpy array.")
except ValueError as e:
logging.error(f"Failed to convert dataset '{dataset_name}' to DataFrame: {e}. Instead, dataset will be returned as Numpy array.")
return data # 'data' is a NumPy array here
except Exception as e:
self.unload_file_obj()
print(f"An unexpected error occurred: {e}. Returning None and unloading file object")
return None
# Define metadata revision methods: append(), update(), delete(), and rename().
@ -126,7 +145,7 @@ class HDF5DataOpsManager():
"""
if self.file_obj is None:
raise RuntimeError("File object is not loaded. Please load the HDF5 file using the 'load_file' method before attempting to modify it.")
raise RuntimeError("File object is not loaded. Please load the HDF5 file using the 'load_file_obj' method before attempting to modify it.")
# Create a copy of annotation_dict to avoid modifying the original
annotation_dict_copy = copy.deepcopy(annotation_dict)
@ -184,7 +203,7 @@ class HDF5DataOpsManager():
"""
if self.file_obj is None:
raise RuntimeError("File object is not loaded. Please load the HDF5 file using the 'load_file' method before attempting to modify it.")
raise RuntimeError("File object is not loaded. Please load the HDF5 file using the 'load_file_obj' method before attempting to modify it.")
update_dict = {}
@ -231,7 +250,7 @@ class HDF5DataOpsManager():
"""
if self.file_obj is None:
raise RuntimeError("File object is not loaded. Please load the HDF5 file using the 'load_file' method before attempting to modify it.")
raise RuntimeError("File object is not loaded. Please load the HDF5 file using the 'load_file_obj' method before attempting to modify it.")
try:
obj = self.file_obj[obj_name]
@ -273,7 +292,7 @@ class HDF5DataOpsManager():
"""
if self.file_obj is None:
raise RuntimeError("File object is not loaded. Please load the HDF5 file using the 'load_file' method before attempting to modify it.")
raise RuntimeError("File object is not loaded. Please load the HDF5 file using the 'load_file_obj' method before attempting to modify it.")
try:
obj = self.file_obj[obj_name]