From 3a9aede9093b3769e152154918acb3b1987b89de Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Wed, 29 May 2024 15:26:48 +0200 Subject: [PATCH] Made def third_update_hdf5_file_with_review more modular by separating data update and git operations, resulting new functions that can be reused in less restrictive matadata annotation contexts. --- src/metadata_review_lib.py | 221 ++++++++++++++----------------------- 1 file changed, 82 insertions(+), 139 deletions(-) diff --git a/src/metadata_review_lib.py b/src/metadata_review_lib.py index 7cc9e65..44f72f3 100644 --- a/src/metadata_review_lib.py +++ b/src/metadata_review_lib.py @@ -8,6 +8,15 @@ import h5py import yaml import src.g5505_utils as utils import src.hdf5_vis as hdf5_vis +import src.hdf5_lib as hdf5_lib +import src.git_ops as git_ops +# TODO: incorporate lines 14-18 in git_ops module and refactor code where needed +current_branch_command = ['git','branch','--show-current'] +status_command = ['git','status'] +add_command = lambda add_list: ['git','add'] + add_list +rm_command = lambda add_list: ['git','add'] + add_list +commit_command = lambda message: ['git','commit','-m', message] + #import input_files.config_file as config_file import numpy as np @@ -30,11 +39,35 @@ def get_review_status(filename_path): workflow_steps.append(line) return workflow_steps[-1] -current_branch_command = ['git','branch','--show-current'] -status_command = ['git','status'] -add_command = lambda add_list: ['git','add'] + add_list -rm_command = lambda add_list: ['git','add'] + add_list -commit_command = lambda message: ['git','commit','-m', message] +def parse_attribute(attr_value): + dtype = [] + values_list = [] + max_length = max(len(item) for item in attr_value.keys()) + for key in attr_value.keys(): + if (not key=='rename_as'): + dtype.append((key,f'S{max_length}')) + values_list.append(attr_value[key]) + + if len(values_list)>1: + new_attr_value = np.array([tuple(values_list)],dtype=dtype) + elif values_list: + new_attr_value = values_list[0] + else: + new_attr_value = 'missing' + + return new_attr_value + +def convert_string_to_bytes(input_list: list): + utf8_type = lambda max_length: h5py.string_dtype('utf-8', max_length) + if input_list: + max_length = max(len(item) for item in input_list) + # Convert the strings to bytes with utf-8 encoding, specifying errors='ignore' to skip characters that cannot be encoded + input_list_bytes = [item.encode('utf-8', errors='ignore') for item in input_list] + input_array_bytes = np.array(input_list_bytes,dtype=utf8_type(max_length)) + else: + input_array_bytes = np.array([],dtype=utf8_type(0)) + + return input_array_bytes def first_initialize_metadata_review(hdf5_file_path, reviewer_attrs, restart = False): @@ -185,151 +218,61 @@ def second_save_metadata_review(review_yaml_file_path, reviewer_attrs): else: print('Nothing to commit.') - -def parse_attribute(attr_value): - dtype = [] - values_list = [] - max_length = 100 - for key in attr_value.keys(): - if (not key=='rename_as'): - dtype.append((key,f'S{max_length}')) - values_list.append(attr_value[key]) - - if len(values_list)>1: - new_attr_value = np.array([tuple(values_list)],dtype=dtype) - elif values_list: - new_attr_value = values_list[0] - else: - new_attr_value = 'missing' - - return new_attr_value - -def convert_string_to_bytes(input_list: list): - utf8_type = lambda max_length: h5py.string_dtype('utf-8', max_length) - if input_list: - max_length = max(len(item) for item in input_list) - # Convert the strings to bytes with utf-8 encoding, specifying errors='ignore' to skip characters that cannot be encoded - input_list_bytes = [item.encode('utf-8', errors='ignore') for item in input_list] - input_array_bytes = np.array(input_list_bytes,dtype=utf8_type(max_length)) - else: - input_array_bytes = np.array([],dtype=utf8_type(0)) - - return input_array_bytes - -def third_update_hdf5_file_with_review(input_hdf5_file, yaml_review_file, reviewer_attrs = {}, hdf5_upload : bool = False): - - """Third""" - # compare review file with current yalm file and then based on the changes open hdf5 file and access only - # groups that changed :). the below approach is suboptimal - - # TODO: only enable update if your branch is data owner :) - - if not 'submitted' in get_review_status(input_hdf5_file): - raise ValueError('Review yaml file must be submitted before trying to perform an update. Run first second_submit_metadata_review().') - - def count(name,obj,yml_dict): - print(obj.name) - if isinstance(obj,h5py.Group) and len(obj.name.split('/')) <= 4: - obj_review = yml_dict[obj.name] - - additions = [not (item in obj.attrs.keys()) for item in obj_review['attributes'].keys()] - - count_additions = sum(additions) - - deletions = [not (item in obj_review['attributes'].keys()) for item in obj.attrs.keys()] - - count_delections = sum(deletions) - - print('additions',count_additions, 'deletions', count_delections) - - with open(yaml_review_file,'r') as stream: +# +def load_yaml(yaml_review_file): + with open(yaml_review_file, 'r') as stream: try: - yaml_dict = yaml.load(stream, Loader=yaml.FullLoader) + return yaml.load(stream, Loader=yaml.FullLoader) except yaml.YAMLError as exc: print(exc) + return None - with h5py.File(input_hdf5_file,'r+') as f: - #f.visititems(lambda name, obj: count(name,obj,yaml_dict)) - for key in yaml_dict.keys(): # keys should coincide with group names - print(key) - # Select hdf5 and yaml objects at key - hdf5_obj = f[key] - yaml_obj = yaml_dict[key] +def update_hdf5_attributes(input_hdf5_file, yaml_dict): - count(hdf5_obj.name, hdf5_obj, yaml_dict) + def update_attributes(hdf5_obj, yaml_obj): + for attr_name, attr_value in yaml_obj['attributes'].items(): - for attr_name, attr_value in yaml_obj['attributes'].items(): - #attr_value = yaml_obj['attributes'][attr_name] + if not isinstance(attr_value, dict): + attr_value = {'rename_as': attr_name, 'value': attr_value} + + if (attr_name in hdf5_obj.attrs.keys()): # delete or update + if attr_value.get('delete'): # delete when True + hdf5_obj.attrs.__delitem__(attr_name) + elif not (attr_value.get('rename_as') == attr_name): # update when true + hdf5_obj.attrs[attr_value.get('rename_as')] = hdf5_obj.attrs[attr_name] # parse_attribute(attr_value) + hdf5_obj.attrs.__delitem__(attr_name) + else: # add a new attribute + hdf5_obj.attrs.update({attr_name : parse_attribute(attr_value)}) - if not isinstance(attr_value,dict): - attr_value = {'rename_as':attr_name, 'value':attr_value, 'delete': False} + with h5py.File(input_hdf5_file, 'r+') as f: + for key in yaml_dict.keys(): + hdf5_obj = f[key] + yaml_obj = yaml_dict[key] + update_attributes(hdf5_obj, yaml_obj) - if attr_value.get('delete',False) and (attr_name in hdf5_obj.attrs.keys()): - hdf5_obj.attrs.__delitem__(attr_name) - continue - - # Check whether attr_name belongs to the existing attributes of hdf5_obj - if attr_name in hdf5_obj.attrs.keys(): - #else: # renaming attribute and possibly change of value assigment - - #if isinstance(attr_value,dict): - # # Retreive possibly new attribute's name and value - new_attr_name = attr_value.get('rename_as',attr_name) # if 'rename_as' is a key in attr_value returns the value, otherwise it return the existing value - - hdf5_obj.attrs[new_attr_name] = parse_attribute(attr_value) - - # Remove from hdf5_obj.attrs attribute w/ name: attr_name if - # yaml indicates a renaming of the attribute. - if not (new_attr_name == attr_name): - hdf5_obj.attrs.__delitem__(attr_name) - - #else: - # hdf5_obj.attrs[attr_name] = attr_value - elif not attr_value.get('delete',False): # if true inclusion, else don't take any action - #hdf5_obj.attrs.__delitem__(attr_name): # attribute inclusion - #if isinstance(attr_value,dict): - # Retreive new attribute's name and value - # new_attr_name = attr_value.get('rename_as',attr_name) # if 'rename_as' is a key in attr_value returns the value, otherwise it return the existing value - # new_attr_value = attr_value.get('value',np.nan) # TODO: let the user know why np.nan might have been assigned - hdf5_obj.attrs[attr_name] = parse_attribute(attr_value) - #else: - # hdf5_obj.attrs[attr_name] = attr_value - print(input_hdf5_file + ' was successfully updated\n') - - # Recreate/or update yaml representation of updated input_hdf5_file. +def update_hdf5_file_with_review(input_hdf5_file, yaml_review_file): + yaml_dict = load_yaml(yaml_review_file) + update_hdf5_attributes(input_hdf5_file, yaml_dict) + # Regenerate yaml snapshot of updated HDF5 file output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(input_hdf5_file) - print(output_yml_filename_path + ' was successfully regenerated from the updated version of ', input_hdf5_file) + print(f'{output_yml_filename_path} was successfully regenerated from the updated version of{input_hdf5_file}') + +def third_update_hdf5_file_with_review(input_hdf5_file, yaml_review_file, reviewer_attrs={}, hdf5_upload=False): + if 'submitted' not in get_review_status(input_hdf5_file): + raise ValueError('Review yaml file must be submitted before trying to perform an update. Run first second_submit_metadata_review().') + update_hdf5_file_with_review(input_hdf5_file, yaml_review_file) + git_ops.perform_git_operations(hdf5_upload) - status_command = ['git','status'] - add_command = lambda add_list: ['git','add'] + add_list - commit_command = lambda message: ['git','commit','-m', message] - push_command = lambda repository,refspec: ['git','push',repository,refspec] - - status = subprocess.run(status_command,capture_output=True,check=True) - - if hdf5_upload: - upload_ext = ['.h5','.yaml'] - else: - upload_ext = ['.yaml'] - - files_to_add_list = [] - for line in status.stdout.splitlines(): - # conver line from bytes to str - tmp = line.decode("utf-8") - if 'modified' in tmp: - if any([ext in tmp for ext in upload_ext] ): - files_to_add_list.append(tmp.split()[1]) - if files_to_add_list: - output = subprocess.run(add_command(files_to_add_list),capture_output=True,check=True) - # TODO:verify if files were correctly staged - #status = subprocess.run(status_command,capture_output=True,check=True) - message = 'Updated hdf5 file with yaml review file.' - commit_output = subprocess.run(commit_command(message),capture_output=True,check=True) - print(commit_output.stdout) - else: - print("There were no found h5 and yaml files, needing to be saved. This action will not have effect on the review process' commit history.") - +def count(hdf5_obj,yml_dict): + print(hdf5_obj.name) + if isinstance(hdf5_obj,h5py.Group) and len(hdf5_obj.name.split('/')) <= 4: + obj_review = yml_dict[hdf5_obj.name] + additions = [not (item in hdf5_obj.attrs.keys()) for item in obj_review['attributes'].keys()] + count_additions = sum(additions) + deletions = [not (item in obj_review['attributes'].keys()) for item in hdf5_obj.attrs.keys()] + count_delections = sum(deletions) + print('additions',count_additions, 'deletions', count_delections) def last_submit_metadata_review(reviewer_attrs):