diff --git a/src/hdf5_ops.py b/src/hdf5_ops.py
index 3dccd37..4943cde 100644
--- a/src/hdf5_ops.py
+++ b/src/hdf5_ops.py
@@ -31,20 +31,16 @@ class HDF5DataOpsManager():
     """

     def __init__(self, file_path, mode = 'r+') -> None:

+        # Class attributes
         if mode in ['r','r+']:
             self.mode = mode
             self.file_path = file_path
             self.file_obj = None
             #self._open_file()
-            self.list_of_datasets = []
+            self.dataset_metadata_df = None

     # Define private methods
-
-    def _collect_dataset_names(self, name, obj, list_of_datasets):
-        if isinstance(obj, h5py.Dataset):
-            list_of_datasets.append(name)
-
     # Define public methods

     def open_file(self):
@@ -56,16 +52,23 @@ class HDF5DataOpsManager():
             self.file_obj.flush()  # Ensure all data is written to disk
             self.file_obj.close()
             self.file_obj = None
+
+    def load_dataset_metadata(self):

-    def retrieve_dataframe_of_dataset_names(self):
+        def __get_datasets(name, obj, list_of_datasets):
+            if isinstance(obj,h5py.Dataset):
+                list_of_datasets.append(name)
+                #print(f'Adding dataset: {name}') #tail: {head} head: {tail}')

         list_of_datasets = []
-        self.file_obj.visititems(lambda name, obj: self._collect_dataset_names(name, obj, list_of_datasets))
+        with h5py.File(self.file_path,'r') as file:
+            list_of_datasets = []
+            file.visititems(lambda name, obj: __get_datasets(name, obj, list_of_datasets))

-        dataset_df = pd.DataFrame({'dataset_name': list_of_datasets})
-        dataset_df['parent_instrument'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-3])
-        dataset_df['parent_file'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-2])
+            dataset_metadata_df = pd.DataFrame({'dataset_name': list_of_datasets})
+            dataset_metadata_df['parent_instrument'] = dataset_metadata_df['dataset_name'].apply(lambda x: x.split('/')[-3])
+            dataset_metadata_df['parent_file'] = dataset_metadata_df['dataset_name'].apply(lambda x: x.split('/')[-2])

-        return dataset_df
+            self.dataset_metadata_df = dataset_metadata_df

     def read_dataset_as_dataframe(self,dataset_name):
         """
@@ -371,25 +374,6 @@ def read_dataset_from_hdf5file(hdf5_file_path, dataset_path):
     #metadata_df = pd.DataFrame.from_dict(data, orient='columns')
     return df

-def list_datasets_in_hdf5file(hdf5_file_path):
-
-    def get_datasets(name, obj, list_of_datasets):
-        if isinstance(obj,h5py.Dataset):
-            list_of_datasets.append(name)
-            #print(f'Adding dataset: {name}') #tail: {head} head: {tail}')
-
-
-    with h5py.File(hdf5_file_path,'r') as file:
-        list_of_datasets = []
-        file.visititems(lambda name, obj: get_datasets(name, obj, list_of_datasets))
-
-    dataset_df = pd.DataFrame({'dataset_name':list_of_datasets})
-
-    dataset_df['parent_instrument'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-3])
-    dataset_df['parent_file'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-2])
-
-    return dataset_df
-
 def get_parent_child_relationships(file: h5py.File):

     nodes = ['/']
@@ -423,29 +407,6 @@ def get_parent_child_relationships(file: h5py.File):

     return nodes, parent, values
-
-def construct_attributes_dict(attrs_obj):
-
-    attr_dict = {}
-    for key, value in attrs_obj.items():
-        attr_dict[key] = {}
-        if not key in ['file_list','filtered_file_list']:
-
-            if utils.is_structured_array(value):
-                #for subattr in value.dtype.names:
-                    #attr_dict[key][subattr] = make_dtype_yaml_compatible(value[subattr])
-                attr_dict[key] = utils.to_serializable_dtype(value)
-            else:
-                attr_dict[key] = utils.to_serializable_dtype(value)  # {"rename_as" : key,
-                                                                     #"value" : utils.to_serializable_dtype(value)
-                                                                     #}
-
-            #if isinstance(value,str):
-                value.replace('\\','\\\\')
-
-
-    return attr_dict
-
 def __print_metadata__(name, obj, folder_depth, yaml_dict):

    # TODO: should we enable deeper folders ?
@@ -459,7 +420,8 @@ def __print_metadata__(name, obj, folder_depth, yaml_dict):
        #attr_dict = {}
        group_dict = {}

-       attr_dict = construct_attributes_dict(obj.attrs)
+       # Convert attribute dict to a YAML/JSON serializable dict
+       attr_dict = {key: utils.to_serializable_dtype(val) for key, val in obj.attrs.items()}

        #for key, value in obj.attrs.items():
           #print (key, value.dtype)
@@ -482,9 +444,11 @@ def __print_metadata__(name, obj, folder_depth, yaml_dict):
            #print(name)
            yaml_dict[obj.name] = group_dict

-   elif isinstance(obj, h5py.Dataset):
+   elif isinstance(obj, h5py.Dataset):
+       # Convert attribute dict to a YAML/JSON serializable dict
+       attr_dict = {key: utils.to_serializable_dtype(val) for key, val in obj.attrs.items()}
        parent_name = '/'.join(name_to_list[:-1])
-       yaml_dict[parent_name]["datasets"][name_head] = {"rename_as": name_head ,"attributes": construct_attributes_dict(obj.attrs)}
+       yaml_dict[parent_name]["datasets"][name_head] = {"rename_as": name_head ,"attributes": attr_dict}
        #print(yaml.dump(group_dict,sort_keys=False))

   #elif len(obj.name.split('/')) == 3:
@@ -522,8 +486,8 @@ def serialize_metadata(input_filename_path, folder_depth: int = 4, output_format

    # Open the HDF5 file and extract metadata
    with h5py.File(input_filename_path, 'r') as f:
-       # Construct attributes dictionary and top-level structure
-       attrs_dict = construct_attributes_dict(f.attrs)
+       # Convert attribute dict to a YAML/JSON serializable dict
+       attrs_dict = {key: utils.to_serializable_dtype(val) for key, val in f.attrs.items()}
        yaml_dict[f.name] = {
            "name": f.name,
            "attributes": attrs_dict,
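
Usage sketch (not part of the patch): a minimal example of the refactored API introduced above, replacing the removed retrieve_dataframe_of_dataset_names()/list_datasets_in_hdf5file(). The import path and the file name 'data/collection.h5' are assumptions for illustration; the parent_instrument/parent_file columns rely on dataset paths being at least three levels deep (instrument/file/dataset), as implied by the split('/')[-3] and split('/')[-2] calls in the diff.

    from src.hdf5_ops import HDF5DataOpsManager  # assumed import path, based on the patched file location

    # Hypothetical HDF5 file; any file whose datasets follow the
    # /<...>/<instrument>/<file>/<dataset> nesting will populate both columns.
    manager = HDF5DataOpsManager('data/collection.h5', mode='r')

    # load_dataset_metadata() opens the file itself in read-only mode via a
    # context manager, so the persistent handle from open_file() is not needed
    # for this step.
    manager.load_dataset_metadata()

    # Dataset names and their parent instrument/file are now cached on the
    # instance as a DataFrame instead of being returned by the method.
    print(manager.dataset_metadata_df[['dataset_name', 'parent_instrument', 'parent_file']].head())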