Implemented metadata append, rename, delete, and update operations on the HDF5 manager object, and refactored the YAML-driven metadata update script to use these operations.

This commit is contained in:
2024-09-30 16:32:39 +02:00
parent afe31288a0
commit 6f5d4adcee
2 changed files with 259 additions and 43 deletions

View File

@@ -190,32 +190,71 @@ def load_yaml(yaml_review_file):
        print(exc)
        return None

def update_hdf5_attributes(input_hdf5_file, yaml_dict):
#def update_hdf5_attributes(input_hdf5_file, yaml_dict):
    def update_attributes(hdf5_obj, yaml_obj):
        for attr_name, attr_value in yaml_obj['attributes'].items():
            if not isinstance(attr_value, dict):
                attr_value = {'rename_as': attr_name, 'value': attr_value}
            if attr_name in hdf5_obj.attrs.keys():  # delete or rename
                if attr_value.get('delete'):  # delete when True
                    del hdf5_obj.attrs[attr_name]
                elif not (attr_value.get('rename_as') == attr_name):  # rename when the new name differs
                    hdf5_obj.attrs[attr_value.get('rename_as')] = hdf5_obj.attrs[attr_name]  # convert_attrdict_to_np_structured_array(attr_value)
                    del hdf5_obj.attrs[attr_name]
            else:  # add a new attribute
                hdf5_obj.attrs.update({attr_name: utils.convert_attrdict_to_np_structured_array(attr_value)})
    with h5py.File(input_hdf5_file, 'r+') as f:
        for key in yaml_dict.keys():
            hdf5_obj = f[key]
            yaml_obj = yaml_dict[key]
            update_attributes(hdf5_obj, yaml_obj)
def update_hdf5_file_with_review(input_hdf5_file, yaml_review_file):
    """
    Updates, appends, or deletes metadata attributes in an HDF5 file based on the provided YAML review file.

    Parameters:
    -----------
    input_hdf5_file : str
        Path to the HDF5 file.
    yaml_review_file : str
        Path to the YAML review file specifying objects and their attributes with operations. Example format:
        {
            "object_name": {
                "attr_name": {
                    "value": attr_value,
                    "delete": True/False
                }
            }
        }
    """
    yaml_dict = load_yaml(yaml_review_file)

    # Initialize HDF5 operations manager
    DataOpsAPI = hdf5_ops.HDF5DataOpsManager(input_hdf5_file)
    DataOpsAPI.open_file()

    # Iterate over each object in the YAML dictionary
    for obj_name, attr_dict in yaml_dict.items():
        # Prepare dictionaries for append, update, and delete actions
        append_dict = {}
        update_dict = {}
        delete_dict = {}

        if obj_name not in DataOpsAPI.file_obj:
            continue  # Skip if the object does not exist

        # Iterate over each attribute in the current object
        for attr_name, attr_props in attr_dict['attributes'].items():
            if not isinstance(attr_props, dict):
                # Scalar value: check whether the attribute already exists (for updating)
                if attr_name in DataOpsAPI.file_obj[obj_name].attrs:
                    update_dict[attr_name] = attr_props
                # Otherwise, it's a new attribute to append
                else:
                    append_dict[attr_name] = attr_props
            else:
                # Check if the attribute is marked for deletion
                if attr_props.get('delete', False):
                    delete_dict[attr_name] = attr_props

        # Perform a single pass for all three operations
        if append_dict:
            DataOpsAPI.append_metadata(obj_name, append_dict)
        if update_dict:
            DataOpsAPI.update_metadata(obj_name, update_dict)
        if delete_dict:
            DataOpsAPI.delete_metadata(obj_name, delete_dict)

    # Close hdf5 file
    DataOpsAPI.close_file()

    # Regenerate yaml snapshot of updated HDF5 file
    output_yml_filename_path = hdf5_ops.serialize_metadata(input_hdf5_file)
    print(f'{output_yml_filename_path} was successfully regenerated from the updated version of {input_hdf5_file}')
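
For illustration, a minimal sketch of how a review file drives the three operations above; the file names, object path, and attribute names are hypothetical:

# Hypothetical YAML review file ('review.yaml') for an HDF5 file ('experiment.h5'):
#
# /experiment_1:
#   attributes:
#     operator: "jdoe"          # already in the file  -> routed to update_metadata
#     campaign: "autumn-2024"   # not yet in the file  -> routed to append_metadata
#     obsolete_flag:
#       delete: true            # dict with delete key -> routed to delete_metadata

update_hdf5_file_with_review('experiment.h5', 'review.yaml')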

View File

@@ -16,6 +16,7 @@ import h5py
import yaml
import json
import copy

class HDF5DataOpsManager():
    def __init__(self, file_path, mode = 'r+') -> None:
@@ -24,14 +25,11 @@ class HDF5DataOpsManager():
        self.mode = mode
        self.file_path = file_path
        self.file_obj = None
        self._open_file()
        #self._open_file()
        self.list_of_datasets = []

    # Define private methods
    def _open_file(self):
        if self.file_obj is None:
            self.file_obj = h5py.File(self.file_path, self.mode)

    # Define private methods
    def _collect_dataset_names(self, name, obj, list_of_datasets):
        if isinstance(obj, h5py.Dataset):
@@ -39,6 +37,10 @@ class HDF5DataOpsManager():
    # Define public methods
    def open_file(self):
        if self.file_obj is None:
            self.file_obj = h5py.File(self.file_path, self.mode)

    def close_file(self):
        if self.file_obj:
            self.file_obj.flush()  # Ensure all data is written to disk
@@ -73,7 +75,7 @@ class HDF5DataOpsManager():
        try:
            return pd.DataFrame(data)
        except ValueError as exp:
            logging.error(f"Failed to convert dataset '{dataset_name}' to DataFrame: {exp}")
            logging.error(f"Failed to convert dataset '{dataset_name}' to DataFrame: {exp}. Instead, the dataset will be returned as a NumPy array.")
            return data  # 'data' is a NumPy array here
    def append_dataset(self, dataset_dict, group_name):
@@ -90,21 +92,196 @@ class HDF5DataOpsManager():
        self.file_obj[group_name].create_dataset(dataset_dict['name'], data=dataset_dict['data'])
        self.file_obj[group_name][dataset_dict['name']].attrs.update(dataset_dict['attributes'])
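
From the two lines above, append_dataset evidently reads the keys 'name', 'data', and 'attributes' from its dataset_dict argument. A minimal usage sketch, assuming 'experiment.h5' already contains a group '/measurements' (file, group, and attribute names are hypothetical):

import numpy as np
from hdf5_ops import HDF5DataOpsManager

manager = HDF5DataOpsManager('experiment.h5')   # hypothetical file path
manager.open_file()
manager.append_dataset(
    {'name': 'temperature',                     # dataset name inside the group
     'data': np.array([21.5, 22.0, 21.8]),      # array written to the file
     'attributes': {'units': 'degC'}},          # metadata attached to the dataset
    group_name='/measurements')                 # group assumed to exist
manager.close_file()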
    def append_annotations(self, obj_name, annotation_dict):
        """ appends annotations in the form of a dictionary to the obj (group or dataset) specified by obj_name"""
    def append_metadata(self, obj_name, annotation_dict):
"""
Appends metadata attributes to the specified object (obj_name) based on the provided annotation_dict.
This method ensures that the provided metadata attributes do not overwrite any existing ones. If an attribute already exists,
a ValueError is raised. The function supports storing scalar values (int, float, str) and compound values such as dictionaries
that are converted into NumPy structured arrays before being added to the metadata.
Parameters:
-----------
obj_name: str
Path to the target object (dataset or group) within the HDF5 file.
annotation_dict: dict
A dictionary where the keys represent new attribute names (strings), and the values can be:
- Scalars: int, float, or str.
- Compound values (dictionaries) for more complex metadata, which are converted to NumPy structured arrays.
Example of a compound value:
Example:
----------
annotation_dict = {
"relative_humidity": {
"value": 65,
"units": "percentage",
"range": "[0,100]",
"definition": "amount of water vapor present ..."
}
}
"""
        if self.file_obj is None:
            self.open_file()

        # Create a copy of annotation_dict to avoid modifying the original
        annotation_dict_copy = copy.deepcopy(annotation_dict)

        obj = self.file_obj[obj_name]

        # Check if any attribute already exists
        if any(key in obj.attrs for key in annotation_dict_copy.keys()):
            raise ValueError("Make sure the provided (key, value) pairs are not existing metadata elements or attributes. To modify or delete existing attributes use .update_metadata() or .delete_metadata().")

        # Process the dictionary values and convert them to structured arrays if needed
        for key, value in annotation_dict_copy.items():
            if isinstance(value, dict):
                # Convert dictionaries to NumPy structured arrays for complex attributes
                annotation_dict_copy[key] = utils.convert_attrdict_to_np_structured_array(value)

        # Update the object's attributes with the new metadata
        obj.attrs.update(annotation_dict_copy)
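
A short usage sketch, assuming an object '/experiment_1' exists and has none of these attributes yet (path and attribute names are hypothetical):

from hdf5_ops import HDF5DataOpsManager

manager = HDF5DataOpsManager('experiment.h5')
manager.open_file()
manager.append_metadata('/experiment_1', {
    'operator': 'jdoe',                           # scalar attribute
    'relative_humidity': {'value': 65,            # compound attribute, stored
                          'units': 'percentage'}  # as a NumPy structured array
})
manager.close_file()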
    def update_metadata(self, obj_name, annotation_dict):
        """
        Updates the value of existing metadata attributes of the specified object (obj_name) based on the provided annotation_dict.

        The function disregards non-existing attributes and suggests using the append_metadata() method to include those in the metadata.

        Parameters:
        -----------
        obj_name : str
            Path to the target object (dataset or group) within the HDF5 file.
        annotation_dict: dict
            A dictionary where the keys represent existing attribute names (strings), and the values can be:
            - Scalars: int, float, or str.
            - Compound values (dictionaries) for more complex metadata, which are converted to NumPy structured arrays.

        Example of a compound value:
        ----------------------------
        annotation_dict = {
            "relative_humidity": {
                "value": 65,
                "units": "percentage",
                "range": "[0,100]",
                "definition": "amount of water vapor present ..."
            }
        }
        """
        if self.file_obj is None:
            self.open_file()

        update_dict = {}

        obj = self.file_obj[obj_name]
        for key, value in annotation_dict.items():
            if key in obj.attrs:
                if isinstance(value, dict):
                    update_dict[key] = utils.convert_attrdict_to_np_structured_array(value)
                else:
                    update_dict[key] = value
            else:
                # Optionally, log or warn about non-existing keys being ignored.
                print(f"Warning: Key '{key}' does not exist and will be ignored.")

        obj.attrs.update(update_dict)
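
Correspondingly, a sketch of updating an existing attribute (path and names hypothetical):

from hdf5_ops import HDF5DataOpsManager

manager = HDF5DataOpsManager('experiment.h5')
manager.open_file()
# Only keys already present on '/experiment_1' are updated;
# unknown keys produce the warning above and are skipped.
manager.update_metadata('/experiment_1', {'operator': 'asmith'})
manager.close_file()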
    def delete_metadata(self, obj_name, annotation_dict):
        """
        Deletes metadata attributes of the specified object (obj_name) based on the provided annotation_dict.

        Parameters:
        -----------
        obj_name: str
            Path to the target object (dataset or group) within the HDF5 file.
        annotation_dict: dict
            Dictionary where keys represent attribute names, and values should be dictionaries containing
            {"delete": True} to mark them for deletion.

        Example:
        --------
        annotation_dict = {"attr_to_be_deleted": {"delete": True}}

        Behavior:
        ---------
        - Deletes the specified attributes from the object's metadata if marked for deletion.
        - Issues a warning if the attribute is not found or not marked for deletion.
        """
        if self.file_obj is None:
            self.open_file()

        obj = self.file_obj[obj_name]
        for attr_key, value in annotation_dict.items():
            if attr_key in obj.attrs:
                if isinstance(value, dict) and value.get('delete', False):
                    del obj.attrs[attr_key]
                else:
                    msg = f"Warning: Value for key '{attr_key}' is not marked for deletion or is invalid."
                    print(msg)
            else:
                msg = f"Warning: Key '{attr_key}' does not exist in metadata."
                print(msg)
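
And a sketch of deleting an attribute via the expected {'delete': True} marker (path and names hypothetical):

from hdf5_ops import HDF5DataOpsManager

manager = HDF5DataOpsManager('experiment.h5')
manager.open_file()
# Each value must be a dict with {'delete': True}; anything else is only warned about.
manager.delete_metadata('/experiment_1', {'obsolete_flag': {'delete': True}})
manager.close_file()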
    def rename_metadata(self, obj_name, renaming_map):
        """
        Renames metadata attributes of the specified object (obj_name) based on the provided renaming_map.

        Parameters:
        -----------
        obj_name: str
            Path to the target object (dataset or group) within the HDF5 file.
        renaming_map: dict
            A dictionary where keys are current attribute names (strings), and values are the new attribute names (strings or byte strings) to rename to.

        Example:
        --------
        renaming_map = {
            "old_attr_name": "new_attr_name",
            "old_attr_2": "new_attr_2"
        }
        """
        if self.file_obj is None:
            self.open_file()

        obj = self.file_obj[obj_name]

        # Verify that none of the new attribute names already exist
        if any(new_attr_name in obj.attrs.keys() for new_attr_name in renaming_map.values()):
            self.close_file()
            raise ValueError("Make sure the new attribute names are not existing metadata elements or attributes. To modify or delete existing attributes use .update_metadata() or .delete_metadata().")

        # Iterate over the renaming_map to process renaming
        for old_attr, new_attr in renaming_map.items():
            if old_attr in obj.attrs:
                # Get the old attribute's value
                attr_value = obj.attrs[old_attr]
                # Create a new attribute with the new name
                obj.attrs.create(new_attr, data=attr_value)
                # Delete the old attribute
                del obj.attrs[old_attr]
            else:
                # Skip if the old attribute doesn't exist
                msg = f"Skipping: Attribute '{old_attr}' does not exist."
                print(msg)  # Optionally, replace with warnings.warn(msg)

        self.close_file()
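
Finally, a rename sketch; note that rename_metadata closes the file itself, unlike the other mutators (path and names hypothetical):

from hdf5_ops import HDF5DataOpsManager

manager = HDF5DataOpsManager('experiment.h5')
manager.open_file()
# Copies each value to its new name, then removes the old attribute.
manager.rename_metadata('/experiment_1', {'temp': 'temperature'})
# No close_file() call needed here: rename_metadata closes the file on exit.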
    def get_metadata(self, obj_path):
        """ Get file attributes from object at path = obj_path. For example,
@@ -249,9 +426,9 @@ def construct_attributes_dict(attrs_obj):
            #attr_dict[key][subattr] = make_dtype_yaml_compatible(value[subattr])
            attr_dict[key] = utils.to_serializable_dtype(value)
        else:
            attr_dict[key] = {"rename_as" : key,
                              "value" : utils.to_serializable_dtype(value)
                             }
            attr_dict[key] = utils.to_serializable_dtype(value)  # {"rename_as" : key,
                                                                 # "value" : utils.to_serializable_dtype(value)
                                                                 # }
        #if isinstance(value,str):
        #    value.replace('\\','\\\\')