Made third_update_hdf5_file_with_review more modular by separating the data update from the git operations, resulting in new functions that can be reused in less restrictive metadata annotation contexts.
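In particular, the data update can now be invoked without any git side effects. A hedged sketch of the intended reuse, assuming the function names introduced in the diff below (paths are placeholders):

    # Data-only update: rewrite HDF5 attributes from a reviewed yaml file,
    # with no version-control side effects (hypothetical paths).
    update_hdf5_file_with_review('data/experiment.h5', 'review/experiment.yaml')

    # Full review step: the same update, followed by the git bookkeeping
    # now isolated in src.git_ops.
    third_update_hdf5_file_with_review('data/experiment.h5',
                                       'review/experiment.yaml',
                                       hdf5_upload=True)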

2024-05-29 15:26:48 +02:00
parent ef7c6c9efb
commit 3a9aede909


@@ -8,6 +8,15 @@ import h5py
 import yaml
 import src.g5505_utils as utils
 import src.hdf5_vis as hdf5_vis
 import src.hdf5_lib as hdf5_lib
+import src.git_ops as git_ops
+# TODO: incorporate lines 14-18 in git_ops module and refactor code where needed
+current_branch_command = ['git','branch','--show-current']
+status_command = ['git','status']
+add_command = lambda add_list: ['git','add'] + add_list
+rm_command = lambda rm_list: ['git','rm'] + rm_list
+commit_command = lambda message: ['git','commit','-m', message]
+
+
 #import input_files.config_file as config_file
 import numpy as np
@@ -30,11 +39,35 @@ def get_review_status(filename_path):
             workflow_steps.append(line)
     return workflow_steps[-1]
 
-current_branch_command = ['git','branch','--show-current']
-status_command = ['git','status']
-add_command = lambda add_list: ['git','add'] + add_list
-rm_command = lambda add_list: ['git','add'] + add_list
-commit_command = lambda message: ['git','commit','-m', message]
+def parse_attribute(attr_value):
+    dtype = []
+    values_list = []
+    max_length = max(len(item) for item in attr_value.keys())
+    for key in attr_value.keys():
+        if not key == 'rename_as':
+            dtype.append((key, f'S{max_length}'))
+            values_list.append(attr_value[key])
+    if len(values_list) > 1:
+        new_attr_value = np.array([tuple(values_list)], dtype=dtype)
+    elif values_list:
+        new_attr_value = values_list[0]
+    else:
+        new_attr_value = 'missing'
+    return new_attr_value
+
+def convert_string_to_bytes(input_list: list):
+    utf8_type = lambda max_length: h5py.string_dtype('utf-8', max_length)
+    if input_list:
+        max_length = max(len(item) for item in input_list)
+        # Convert the strings to bytes with utf-8 encoding; errors='ignore' skips characters that cannot be encoded
+        input_list_bytes = [item.encode('utf-8', errors='ignore') for item in input_list]
+        input_array_bytes = np.array(input_list_bytes, dtype=utf8_type(max_length))
+    else:
+        input_array_bytes = np.array([], dtype=utf8_type(0))
+    return input_array_bytes
 
 def first_initialize_metadata_review(hdf5_file_path, reviewer_attrs, restart = False):
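Note (editorial, not part of the diff): parse_attribute flattens a yaml attribute dict into a one-row numpy structured array, skipping the 'rename_as' key. A small illustration under that assumption, with a made-up attribute entry:

    # Hypothetical review-file entry for one attribute
    attr_value = {'rename_as': 'temperature', 'value': '25.0', 'units': 'celsius'}

    # parse_attribute(attr_value) builds one field per non-'rename_as' key,
    # each sized by the longest key name ('rename_as', 9 chars), returning
    # np.array([('25.0', 'celsius')], dtype=[('value', 'S9'), ('units', 'S9')])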
@@ -185,151 +218,61 @@ def second_save_metadata_review(review_yaml_file_path, reviewer_attrs):
     else:
         print('Nothing to commit.')
 
-def parse_attribute(attr_value):
-    dtype = []
-    values_list = []
-    max_length = 100
-    for key in attr_value.keys():
-        if not key == 'rename_as':
-            dtype.append((key, f'S{max_length}'))
-            values_list.append(attr_value[key])
-    if len(values_list) > 1:
-        new_attr_value = np.array([tuple(values_list)], dtype=dtype)
-    elif values_list:
-        new_attr_value = values_list[0]
-    else:
-        new_attr_value = 'missing'
-    return new_attr_value
-
-def convert_string_to_bytes(input_list: list):
-    utf8_type = lambda max_length: h5py.string_dtype('utf-8', max_length)
-    if input_list:
-        max_length = max(len(item) for item in input_list)
-        # Convert the strings to bytes with utf-8 encoding; errors='ignore' skips characters that cannot be encoded
-        input_list_bytes = [item.encode('utf-8', errors='ignore') for item in input_list]
-        input_array_bytes = np.array(input_list_bytes, dtype=utf8_type(max_length))
-    else:
-        input_array_bytes = np.array([], dtype=utf8_type(0))
-    return input_array_bytes
-
-def third_update_hdf5_file_with_review(input_hdf5_file, yaml_review_file, reviewer_attrs = {}, hdf5_upload : bool = False):
-    """Third"""
-    # Compare the review file with the current yaml file and, based on the changes, open the hdf5 file
-    # and access only the groups that changed. The approach below is suboptimal.
-    # TODO: only enable update if your branch is data owner
-    if not 'submitted' in get_review_status(input_hdf5_file):
-        raise ValueError('Review yaml file must be submitted before trying to perform an update. Run second_submit_metadata_review() first.')
-
-    def count(name, obj, yml_dict):
-        print(obj.name)
-        if isinstance(obj, h5py.Group) and len(obj.name.split('/')) <= 4:
-            obj_review = yml_dict[obj.name]
-            additions = [not (item in obj.attrs.keys()) for item in obj_review['attributes'].keys()]
-            count_additions = sum(additions)
-            deletions = [not (item in obj_review['attributes'].keys()) for item in obj.attrs.keys()]
-            count_deletions = sum(deletions)
-            print('additions', count_additions, 'deletions', count_deletions)
-
-    with open(yaml_review_file,'r') as stream:
+def load_yaml(yaml_review_file):
+    with open(yaml_review_file, 'r') as stream:
         try:
-            yaml_dict = yaml.load(stream, Loader=yaml.FullLoader)
+            return yaml.load(stream, Loader=yaml.FullLoader)
         except yaml.YAMLError as exc:
             print(exc)
+            return None
 
-    with h5py.File(input_hdf5_file,'r+') as f:
-        #f.visititems(lambda name, obj: count(name,obj,yaml_dict))
-        for key in yaml_dict.keys(): # keys should coincide with group names
-            print(key)
-            # Select hdf5 and yaml objects at key
-            hdf5_obj = f[key]
-            yaml_obj = yaml_dict[key]
-            count(hdf5_obj.name, hdf5_obj, yaml_dict)
-            for attr_name, attr_value in yaml_obj['attributes'].items():
-                #attr_value = yaml_obj['attributes'][attr_name]
-                if not isinstance(attr_value, dict):
-                    attr_value = {'rename_as': attr_name, 'value': attr_value, 'delete': False}
-                if attr_value.get('delete', False) and (attr_name in hdf5_obj.attrs.keys()):
-                    hdf5_obj.attrs.__delitem__(attr_name)
-                    continue
-                # Check whether attr_name belongs to the existing attributes of hdf5_obj
-                if attr_name in hdf5_obj.attrs.keys():
-                    # Retrieve the attribute's possibly new name; if 'rename_as' is absent, keep the existing name
-                    new_attr_name = attr_value.get('rename_as', attr_name)
-                    hdf5_obj.attrs[new_attr_name] = parse_attribute(attr_value)
-                    # Remove the attribute named attr_name if the yaml indicates a renaming
-                    if not (new_attr_name == attr_name):
-                        hdf5_obj.attrs.__delitem__(attr_name)
-                elif not attr_value.get('delete', False): # true inclusion; otherwise take no action
-                    hdf5_obj.attrs[attr_name] = parse_attribute(attr_value)
-        print(input_hdf5_file + ' was successfully updated\n')
-
-    # Recreate or update the yaml representation of the updated input_hdf5_file
-    output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(input_hdf5_file)
-    print(output_yml_filename_path + ' was successfully regenerated from the updated version of ', input_hdf5_file)
-
-    status_command = ['git','status']
-    add_command = lambda add_list: ['git','add'] + add_list
-    commit_command = lambda message: ['git','commit','-m', message]
-    push_command = lambda repository, refspec: ['git','push', repository, refspec]
-
-    status = subprocess.run(status_command, capture_output=True, check=True)
-
-    if hdf5_upload:
-        upload_ext = ['.h5', '.yaml']
-    else:
-        upload_ext = ['.yaml']
-
-    files_to_add_list = []
-    for line in status.stdout.splitlines():
-        # convert line from bytes to str
-        tmp = line.decode("utf-8")
-        if 'modified' in tmp:
-            if any([ext in tmp for ext in upload_ext]):
-                files_to_add_list.append(tmp.split()[1])
-
-    if files_to_add_list:
-        output = subprocess.run(add_command(files_to_add_list), capture_output=True, check=True)
-        # TODO: verify that the files were correctly staged
-        #status = subprocess.run(status_command, capture_output=True, check=True)
-        message = 'Updated hdf5 file with yaml review file.'
-        commit_output = subprocess.run(commit_command(message), capture_output=True, check=True)
-        print(commit_output.stdout)
-    else:
-        print("No h5 or yaml files were found that need to be saved. This action will not affect the review process' commit history.")
+def update_hdf5_attributes(input_hdf5_file, yaml_dict):
+
+    def update_attributes(hdf5_obj, yaml_obj):
+        for attr_name, attr_value in yaml_obj['attributes'].items():
+
+            if not isinstance(attr_value, dict):
+                attr_value = {'rename_as': attr_name, 'value': attr_value}
+
+            if attr_name in hdf5_obj.attrs.keys(): # delete or update
+                if attr_value.get('delete'): # delete when True
+                    hdf5_obj.attrs.__delitem__(attr_name)
+                elif not (attr_value.get('rename_as') == attr_name): # rename when the names differ
+                    hdf5_obj.attrs[attr_value.get('rename_as')] = hdf5_obj.attrs[attr_name] # parse_attribute(attr_value)
+                    hdf5_obj.attrs.__delitem__(attr_name)
+            else: # add a new attribute
+                hdf5_obj.attrs.update({attr_name: parse_attribute(attr_value)})
+
+    with h5py.File(input_hdf5_file, 'r+') as f:
+        for key in yaml_dict.keys(): # keys should coincide with group names
+            hdf5_obj = f[key]
+            yaml_obj = yaml_dict[key]
+            update_attributes(hdf5_obj, yaml_obj)
+
+def update_hdf5_file_with_review(input_hdf5_file, yaml_review_file):
+    yaml_dict = load_yaml(yaml_review_file)
+    update_hdf5_attributes(input_hdf5_file, yaml_dict)
+    # Regenerate the yaml snapshot of the updated HDF5 file
+    output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(input_hdf5_file)
+    print(f'{output_yml_filename_path} was successfully regenerated from the updated version of {input_hdf5_file}')
+
+def third_update_hdf5_file_with_review(input_hdf5_file, yaml_review_file, reviewer_attrs={}, hdf5_upload=False):
+    if 'submitted' not in get_review_status(input_hdf5_file):
+        raise ValueError('Review yaml file must be submitted before trying to perform an update. Run second_submit_metadata_review() first.')
+
+    update_hdf5_file_with_review(input_hdf5_file, yaml_review_file)
+    git_ops.perform_git_operations(hdf5_upload)
+
+def count(hdf5_obj, yml_dict):
+    print(hdf5_obj.name)
+    if isinstance(hdf5_obj, h5py.Group) and len(hdf5_obj.name.split('/')) <= 4:
+        obj_review = yml_dict[hdf5_obj.name]
+        additions = [not (item in hdf5_obj.attrs.keys()) for item in obj_review['attributes'].keys()]
+        count_additions = sum(additions)
+        deletions = [not (item in obj_review['attributes'].keys()) for item in hdf5_obj.attrs.keys()]
+        count_deletions = sum(deletions)
+        print('additions', count_additions, 'deletions', count_deletions)
 
 def last_submit_metadata_review(reviewer_attrs):
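Editorial note: the lower-level pieces introduced here are also usable on their own, which is the point of the refactor. A minimal sketch with placeholder paths, assuming the functions defined in this diff:

    # Parse the review file; load_yaml returns None on a YAML error.
    yaml_dict = load_yaml('review/experiment.yaml')
    if yaml_dict is not None:
        # Apply only the metadata changes: no yaml snapshot, no git operations.
        update_hdf5_attributes('data/experiment.h5', yaml_dict)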