Removed construct_attributes_dict(attrs_obj) and replaced it with the inline dict comprehension {key: utils.to_serializable_dtype(val) for key, val in obj.attrs.items()}
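The replacement touches three call sites below (group-, dataset-, and file-level attributes). For readers outside the codebase, a minimal sketch of the pattern with a stand-in serializer; the real utils.to_serializable_dtype lives elsewhere in the repository and is not part of this diff, so its body here is an assumption:

import h5py
import numpy as np

def to_serializable_dtype(value):
    # Stand-in for utils.to_serializable_dtype (not shown in this commit).
    # Presumably it maps h5py/numpy values to plain Python types for YAML/JSON.
    if isinstance(value, bytes):
        return value.decode('utf-8')
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    return value

with h5py.File('example.h5', 'w') as f:     # hypothetical file
    f.attrs['instrument'] = 'ACSM'          # made-up attributes
    f.attrs['n_channels'] = np.int64(4)
    # Inline replacement for the removed construct_attributes_dict(f.attrs):
    attrs_dict = {key: to_serializable_dtype(val) for key, val in f.attrs.items()}
    print(attrs_dict)   # -> {'instrument': 'ACSM', 'n_channels': 4}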
@@ -31,20 +31,16 @@ class HDF5DataOpsManager():
     """
 
     def __init__(self, file_path, mode = 'r+') -> None:
 
+        # Class attributes
         if mode in ['r','r+']:
             self.mode = mode
         self.file_path = file_path
         self.file_obj = None
         #self._open_file()
-        self.list_of_datasets = []
+        self.dataset_metadata_df = None
 
     # Define private methods
 
-    def _collect_dataset_names(self, name, obj, list_of_datasets):
-        if isinstance(obj, h5py.Dataset):
-            list_of_datasets.append(name)
-
     # Define public methods
 
     def open_file(self):
@@ -56,16 +52,23 @@ class HDF5DataOpsManager():
         self.file_obj.flush() # Ensure all data is written to disk
         self.file_obj.close()
         self.file_obj = None
 
-    def retrieve_dataframe_of_dataset_names(self):
+    def load_dataset_metadata(self):
+
+        def __get_datasets(name, obj, list_of_datasets):
+            if isinstance(obj,h5py.Dataset):
+                list_of_datasets.append(name)
+                #print(f'Adding dataset: {name}') #tail: {head} head: {tail}')
 
-        list_of_datasets = []
-        self.file_obj.visititems(lambda name, obj: self._collect_dataset_names(name, obj, list_of_datasets))
+        with h5py.File(self.file_path,'r') as file:
+            list_of_datasets = []
+            file.visititems(lambda name, obj: __get_datasets(name, obj, list_of_datasets))
 
-        dataset_df = pd.DataFrame({'dataset_name': list_of_datasets})
-        dataset_df['parent_instrument'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-3])
-        dataset_df['parent_file'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-2])
+        dataset_metadata_df = pd.DataFrame({'dataset_name': list_of_datasets})
+        dataset_metadata_df['parent_instrument'] = dataset_metadata_df['dataset_name'].apply(lambda x: x.split('/')[-3])
+        dataset_metadata_df['parent_file'] = dataset_metadata_df['dataset_name'].apply(lambda x: x.split('/')[-2])
 
-        return dataset_df
+        self.dataset_metadata_df = dataset_metadata_df
 
     def read_dataset_as_dataframe(self,dataset_name):
         """
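Net effect of this hunk on callers: the method no longer returns the DataFrame, it caches it on the instance, and it opens the file itself rather than relying on a prior open_file() call. A hedged usage sketch (file path made up):

# Hypothetical caller of the reworked API.
manager = HDF5DataOpsManager('collection.h5', mode='r')
manager.load_dataset_metadata()   # populates the cache as a side effect
df = manager.dataset_metadata_df  # was: df = manager.retrieve_dataframe_of_dataset_names()
print(df[['dataset_name', 'parent_instrument', 'parent_file']].head())

Note that the parent_instrument and parent_file columns index the third- and second-to-last path segments, so dataset paths are assumed to be at least three levels deep (instrument/file/dataset).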
@@ -371,25 +374,6 @@ def read_dataset_from_hdf5file(hdf5_file_path, dataset_path):
     #metadata_df = pd.DataFrame.from_dict(data, orient='columns')
     return df
 
-def list_datasets_in_hdf5file(hdf5_file_path):
-
-    def get_datasets(name, obj, list_of_datasets):
-        if isinstance(obj,h5py.Dataset):
-            list_of_datasets.append(name)
-            #print(f'Adding dataset: {name}') #tail: {head} head: {tail}')
-
-    with h5py.File(hdf5_file_path,'r') as file:
-        list_of_datasets = []
-        file.visititems(lambda name, obj: get_datasets(name, obj, list_of_datasets))
-
-    dataset_df = pd.DataFrame({'dataset_name':list_of_datasets})
-
-    dataset_df['parent_instrument'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-3])
-    dataset_df['parent_file'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-2])
-
-    return dataset_df
-
 def get_parent_child_relationships(file: h5py.File):
 
     nodes = ['/']
@@ -423,29 +407,6 @@ def get_parent_child_relationships(file: h5py.File):
     return nodes, parent, values
 
-
-def construct_attributes_dict(attrs_obj):
-
-    attr_dict = {}
-    for key, value in attrs_obj.items():
-        attr_dict[key] = {}
-        if not key in ['file_list','filtered_file_list']:
-
-            if utils.is_structured_array(value):
-                #for subattr in value.dtype.names:
-                #attr_dict[key][subattr] = make_dtype_yaml_compatible(value[subattr])
-                attr_dict[key] = utils.to_serializable_dtype(value)
-            else:
-                attr_dict[key] = utils.to_serializable_dtype(value) # {"rename_as" : key,
-                #"value" : utils.to_serializable_dtype(value)
-                #}
-
-            #if isinstance(value,str):
-            #    value.replace('\\','\\\\')
-
-    return attr_dict
-
 def __print_metadata__(name, obj, folder_depth, yaml_dict):
 
     # TODO: should we enable deeper folders ?
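One behavioral change hides in this removal: the old helper left 'file_list' and 'filtered_file_list' entries as empty dicts, while the new inline comprehension serializes every attribute, skip list gone. A sketch of the difference under that reading of the removed code (values made up, identity serializer for brevity):

def to_serializable_dtype(value):
    # Identity stand-in; see the sketch near the commit message.
    return value

attrs = {'file_list': ['a.h5', 'b.h5'], 'operator': 'jdoe'}   # made-up attributes

# Removed helper: keys in the skip list stayed as empty dicts.
old_dict = {k: {} if k in ['file_list', 'filtered_file_list'] else to_serializable_dtype(v)
            for k, v in attrs.items()}
# Inline replacement: every attribute is serialized.
new_dict = {k: to_serializable_dtype(v) for k, v in attrs.items()}

print(old_dict)   # {'file_list': {}, 'operator': 'jdoe'}
print(new_dict)   # {'file_list': ['a.h5', 'b.h5'], 'operator': 'jdoe'}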
@@ -459,7 +420,8 @@ def __print_metadata__(name, obj, folder_depth, yaml_dict):
         #attr_dict = {}
         group_dict = {}
 
-        attr_dict = construct_attributes_dict(obj.attrs)
+        # Convert attribute dict to a YAML/JSON serializable dict
+        attr_dict = {key: utils.to_serializable_dtype(val) for key, val in obj.attrs.items()}
 
         #for key, value in obj.attrs.items():
         #print (key, value.dtype)
@@ -482,9 +444,11 @@ def __print_metadata__(name, obj, folder_depth, yaml_dict):
         #print(name)
 
         yaml_dict[obj.name] = group_dict
     elif isinstance(obj, h5py.Dataset):
+        # Convert attribute dict to a YAML/JSON serializable dict
+        attr_dict = {key: utils.to_serializable_dtype(val) for key, val in obj.attrs.items()}
         parent_name = '/'.join(name_to_list[:-1])
-        yaml_dict[parent_name]["datasets"][name_head] = {"rename_as": name_head ,"attributes": construct_attributes_dict(obj.attrs)}
+        yaml_dict[parent_name]["datasets"][name_head] = {"rename_as": name_head ,"attributes": attr_dict}
         #print(yaml.dump(group_dict,sort_keys=False))
 
         #elif len(obj.name.split('/')) == 3:
@@ -522,8 +486,8 @@ def serialize_metadata(input_filename_path, folder_depth: int = 4, output_format
 
     # Open the HDF5 file and extract metadata
     with h5py.File(input_filename_path, 'r') as f:
-        # Construct attributes dictionary and top-level structure
-        attrs_dict = construct_attributes_dict(f.attrs)
+        # Convert attribute dict to a YAML/JSON serializable dict
+        attrs_dict = {key: utils.to_serializable_dtype(val) for key, val in f.attrs.items()}
         yaml_dict[f.name] = {
             "name": f.name,
             "attributes": attrs_dict,
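Why serializability matters here: yaml.safe_dump rejects the numpy scalars that h5py returns for numeric attributes, which is presumably what to_serializable_dtype guards against. A small self-contained demonstration (attribute value made up):

import numpy as np
import yaml

raw = {'n_channels': np.int64(4)}    # what h5py typically hands back
try:
    yaml.safe_dump(raw)
except yaml.representer.RepresenterError as err:
    print('raw attrs fail to dump:', err)

clean = {'n_channels': int(raw['n_channels'])}   # after serializable conversion
print(yaml.safe_dump(clean, sort_keys=False))    # n_channels: 4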
|