From 6f5d4adceedc015fce5d849c6b5e719d97194279 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Mon, 30 Sep 2024 16:32:39 +0200 Subject: [PATCH] Implemented metadata append, rename, delete, and update operations on the hdf5 manager object and refactored metadata update script based on yaml file to use said operations. --- pipelines/metadata_revision.py | 83 +++++++++---- src/hdf5_ops.py | 219 +++++++++++++++++++++++++++++---- 2 files changed, 259 insertions(+), 43 deletions(-) diff --git a/pipelines/metadata_revision.py b/pipelines/metadata_revision.py index 578c7bf..d0f68c6 100644 --- a/pipelines/metadata_revision.py +++ b/pipelines/metadata_revision.py @@ -190,32 +190,71 @@ def load_yaml(yaml_review_file): print(exc) return None -def update_hdf5_attributes(input_hdf5_file, yaml_dict): +#def update_hdf5_attributes(input_hdf5_file, yaml_dict): - def update_attributes(hdf5_obj, yaml_obj): - for attr_name, attr_value in yaml_obj['attributes'].items(): - - if not isinstance(attr_value, dict): - attr_value = {'rename_as': attr_name, 'value': attr_value} - - if (attr_name in hdf5_obj.attrs.keys()): # delete or update - if attr_value.get('delete'): # delete when True - hdf5_obj.attrs.__delitem__(attr_name) - elif not (attr_value.get('rename_as') == attr_name): # update when true - hdf5_obj.attrs[attr_value.get('rename_as')] = hdf5_obj.attrs[attr_name] # convert_attrdict_to_np_structured_array(attr_value) - hdf5_obj.attrs.__delitem__(attr_name) - else: # add a new attribute - hdf5_obj.attrs.update({attr_name : utils.convert_attrdict_to_np_structured_array(attr_value)}) - - with h5py.File(input_hdf5_file, 'r+') as f: - for key in yaml_dict.keys(): - hdf5_obj = f[key] - yaml_obj = yaml_dict[key] - update_attributes(hdf5_obj, yaml_obj) def update_hdf5_file_with_review(input_hdf5_file, yaml_review_file): + + """ + Updates, appends, or deletes metadata attributes in an HDF5 file based on a provided YAML dictionary. 
+ + Parameters: + ----------- + input_hdf5_file : str + Path to the HDF5 file. + + yaml_dict : dict + Dictionary specifying objects and their attributes with operations. Example format: + { + "object_name": { + "attr_name": { + "value": attr_value, + "delete": True/False + } + } + } + """ yaml_dict = load_yaml(yaml_review_file) - update_hdf5_attributes(input_hdf5_file, yaml_dict) + + # Initialize HDF5 operations manager + DataOpsAPI = hdf5_ops.HDF5DataOpsManager(input_hdf5_file) + DataOpsAPI.open_file() + + # Iterate over each object in the YAML dictionary + for obj_name, attr_dict in yaml_dict.items(): + # Prepare dictionaries for append, update, and delete actions + append_dict = {} + update_dict = {} + delete_dict = {} + + if not obj_name in DataOpsAPI.file_obj: + continue # Skip if the object does not exist + + # Iterate over each attribute in the current object + for attr_name, attr_props in attr_dict['attributes'].items(): + if not isinstance(attr_props, dict): + #attr_props = {'value': attr_props} + # Check if the attribute exists (for updating) + if attr_name in DataOpsAPI.file_obj[obj_name].attrs: + update_dict[attr_name] = attr_props + # Otherwise, it's a new attribute to append + else: + append_dict[attr_name] = attr_props + else: + # Check if the attribute is marked for deletion + if attr_props.get('delete', False): + delete_dict[attr_name] = attr_props + + # Perform a single pass for all three operations + if append_dict: + DataOpsAPI.append_metadata(obj_name, append_dict) + if update_dict: + DataOpsAPI.update_metadata(obj_name, update_dict) + if delete_dict: + DataOpsAPI.delete_metadata(obj_name, delete_dict) + + # Close hdf5 file + DataOpsAPI.close_file() # Regenerate yaml snapshot of updated HDF5 file output_yml_filename_path = hdf5_ops.serialize_metadata(input_hdf5_file) print(f'{output_yml_filename_path} was successfully regenerated from the updated version of{input_hdf5_file}') diff --git a/src/hdf5_ops.py b/src/hdf5_ops.py index eff636d..57aed5f 
100644 --- a/src/hdf5_ops.py +++ b/src/hdf5_ops.py @@ -16,6 +16,7 @@ import h5py import yaml import json +import copy class HDF5DataOpsManager(): def __init__(self, file_path, mode = 'r+') -> None: @@ -24,14 +25,11 @@ class HDF5DataOpsManager(): self.mode = mode self.file_path = file_path self.file_obj = None - self._open_file() + #self._open_file() self.list_of_datasets = [] - # Define private methods - - def _open_file(self): - if self.file_obj is None: - self.file_obj = h5py.File(self.file_path, self.mode) + # Define private methods + def _collect_dataset_names(self, name, obj, list_of_datasets): if isinstance(obj, h5py.Dataset): @@ -39,6 +37,10 @@ class HDF5DataOpsManager(): # Define public methods + def open_file(self): + if self.file_obj is None: + self.file_obj = h5py.File(self.file_path, self.mode) + def close_file(self): if self.file_obj: self.file_obj.flush() # Ensure all data is written to disk @@ -73,7 +75,7 @@ class HDF5DataOpsManager(): try: return pd.DataFrame(data) except ValueError as exp: - logging.error(f"Failed to convert dataset '{dataset_name}' to DataFrame: {exp}") + logging.error(f"Failed to convert dataset '{dataset_name}' to DataFrame: {exp}. Instead, dataset will be returned as Numpy array.") return data # 'data' is a NumPy array here def append_dataset(self,dataset_dict, group_name): @@ -90,21 +92,196 @@ class HDF5DataOpsManager(): self.file_obj[group_name].create_dataset(dataset_dict['name'], data=dataset_dict['data']) self.file_obj[group_name][dataset_dict['name']].attrs.update(dataset_dict['attributes']) - def append_annotations(self, obj_name, annotation_dict): - """ appends annotations in the form of a dictionary to the obj (group or dataset) spefified by obj_name""" + def append_metadata(self, obj_name, annotation_dict): + """ + Appends metadata attributes to the specified object (obj_name) based on the provided annotation_dict. + + This method ensures that the provided metadata attributes do not overwrite any existing ones. 
If an attribute already exists, + a ValueError is raised. The function supports storing scalar values (int, float, str) and compound values such as dictionaries + that are converted into NumPy structured arrays before being added to the metadata. + + Parameters: + ----------- + obj_name: str + Path to the target object (dataset or group) within the HDF5 file. + + annotation_dict: dict + A dictionary where the keys represent new attribute names (strings), and the values can be: + - Scalars: int, float, or str. + - Compound values (dictionaries) for more complex metadata, which are converted to NumPy structured arrays. + Example of a compound value: + + Example: + ---------- + annotation_dict = { + "relative_humidity": { + "value": 65, + "units": "percentage", + "range": "[0,100]", + "definition": "amount of water vapor present ..." + } + } + """ + + if self.file_obj is None: + self.open_file() + + # Create a copy of annotation_dict to avoid modifying the original + annotation_dict_copy = copy.deepcopy(annotation_dict) + + #with h5py.File(self.file_path, mode='r+') as file_obj: + obj = self.file_obj[obj_name] + + # Check if any attribute already exists + if any(key in obj.attrs for key in annotation_dict_copy.keys()): + raise ValueError("Make sure the provided (key, value) pairs are not existing metadata elements or attributes. 
To modify or delete existing attributes use .update_metadata() or .delete_metadata()")
+ print(f"Warning: Key '{key}' does not exist and will be ignored.") + + obj.attrs.update(update_dict) + + def delete_metadata(self, obj_name, annotation_dict): + """ + Deletes metadata attributes of the specified object (obj_name) based on the provided annotation_dict. + + Parameters: + ----------- + obj_name: str + Path to the target object (dataset or group) within the HDF5 file. + + annotation_dict: dict + Dictionary where keys represent attribute names, and values should be dictionaries containing + {"delete": True} to mark them for deletion. + + Example: + -------- + annotation_dict = {"attr_to_be_deleted": {"delete": True}} + + Behavior: + --------- + - Deletes the specified attributes from the object's metadata if marked for deletion. + - Issues a warning if the attribute is not found or not marked for deletion. + """ + + if self.file_obj is None: + self.open_file() + + #with h5py.File(self.file_path, mode='r+') as file_obj: + obj = self.file_obj[obj_name] + + for attr_key, value in annotation_dict.items(): + if attr_key in obj.attrs: + if isinstance(value, dict) and value.get('delete', False): + obj.attrs.__delitem__(attr_key) + else: + msg = f"Warning: Value for key '{attr_key}' is not marked for deletion or is invalid." + print(msg) + else: + msg = f"Warning: Key '{attr_key}' does not exist in metadata." + print(msg) + + + def rename_metadata(self, obj_name, renaming_map): + """ + Renames metadata attributes of the specified object (obj_name) based on the provided renaming_map. + + Parameters: + ----------- + obj_name: str + Path to the target object (dataset or group) within the HDF5 file. + + renaming_map: dict + A dictionary where keys are current attribute names (strings), and values are the new attribute names (strings or byte strings) to rename to. 
+ + Example: + -------- + renaming_map = { + "old_attr_name": "new_attr_name", + "old_attr_2": "new_attr_2" + } + + """ + + #with h5py.File(self.file_path, mode='r+') as file_obj: + if self.file_obj is None: + self.open_file() obj = self.file_obj[obj_name] - # Verify if attributes to append are all new - if any(new_attr_key in obj.attrs.keys() for new_attr_key in annotation_dict.keys()): - self.close_file() - raise ValueError("Make sure the provided key, value pairs are not existing metadata elements or attributes. To modify or delete existing attributes use .modify_annotation() or .delete_annotation()") + # Iterate over the renaming_map to process renaming + for old_attr, new_attr in renaming_map.items(): + if old_attr in obj.attrs: + # Get the old attribute's value + attr_value = obj.attrs[old_attr] - for new_attr_key in annotation_dict.keys(): - value = annotation_dict[new_attr_key] - if isinstance(value, dict): - annotation_dict[new_attr_key] = utils.convert_attrdict_to_np_structured_array(annotation_dict[new_attr_key]) - obj.attrs.update(annotation_dict) + # Create a new attribute with the new name + obj.attrs.create(new_attr, data=attr_value) + + # Delete the old attribute + obj.attrs.__delitem__(old_attr) + else: + # Skip if the old attribute doesn't exist + msg = f"Skipping: Attribute '{old_attr}' does not exist." + print(msg) # Optionally, replace with warnings.warn(msg) + + self.close_file() def get_metadata(self, obj_path): """ Get file attributes from object at path = obj_path. For example, @@ -249,9 +426,9 @@ def construct_attributes_dict(attrs_obj): #attr_dict[key][subattr] = make_dtype_yaml_compatible(value[subattr]) attr_dict[key] = utils.to_serializable_dtype(value) else: - attr_dict[key] = {"rename_as" : key, - "value" : utils.to_serializable_dtype(value) - } + attr_dict[key] = utils.to_serializable_dtype(value) # {"rename_as" : key, + #"value" : utils.to_serializable_dtype(value) + #} #if isinstance(value,str): # value.replace('\\','\\\\')