Made third_update_hdf5_file_with_review more modular by separating the data update from the git operations, resulting in new functions that can be reused in less restrictive metadata annotation contexts.
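In particular, the data update can now be invoked without any git side effects. A hedged sketch of the intended reuse, assuming the function names introduced in the diff below (paths are placeholders):

    # Data-only update: rewrite HDF5 attributes from a reviewed yaml file,
    # with no version-control side effects (hypothetical paths).
    update_hdf5_file_with_review('data/experiment.h5', 'review/experiment.yaml')

    # Full review step: the same update, followed by the git bookkeeping
    # now isolated in src.git_ops.
    third_update_hdf5_file_with_review('data/experiment.h5',
                                       'review/experiment.yaml',
                                       hdf5_upload=True)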

2024-05-29 15:26:48 +02:00
parent ef7c6c9efb
commit 3a9aede909


@@ -8,6 +8,15 @@ import h5py
 import yaml
 import src.g5505_utils as utils
 import src.hdf5_vis as hdf5_vis
 import src.hdf5_lib as hdf5_lib
+import src.git_ops as git_ops
+# TODO: incorporate lines 14-18 in git_ops module and refactor code where needed
+current_branch_command = ['git','branch','--show-current']
+status_command = ['git','status']
+add_command = lambda add_list: ['git','add'] + add_list
+rm_command = lambda rm_list: ['git','rm'] + rm_list
+commit_command = lambda message: ['git','commit','-m', message]
+
+
 #import input_files.config_file as config_file
 import numpy as np
@@ -30,11 +39,35 @@ def get_review_status(filename_path):
             workflow_steps.append(line)
     return workflow_steps[-1]
 
-current_branch_command = ['git','branch','--show-current']
-status_command = ['git','status']
-add_command = lambda add_list: ['git','add'] + add_list
-rm_command = lambda add_list: ['git','add'] + add_list
-commit_command = lambda message: ['git','commit','-m', message]
+def parse_attribute(attr_value):
+    dtype = []
+    values_list = []
+    max_length = max(len(item) for item in attr_value.keys())
+    for key in attr_value.keys():
+        if not key == 'rename_as':
+            dtype.append((key, f'S{max_length}'))
+            values_list.append(attr_value[key])
+    if len(values_list) > 1:
+        new_attr_value = np.array([tuple(values_list)], dtype=dtype)
+    elif values_list:
+        new_attr_value = values_list[0]
+    else:
+        new_attr_value = 'missing'
+    return new_attr_value
+
+def convert_string_to_bytes(input_list: list):
+    utf8_type = lambda max_length: h5py.string_dtype('utf-8', max_length)
+    if input_list:
+        max_length = max(len(item) for item in input_list)
+        # Convert the strings to bytes with utf-8 encoding; errors='ignore' skips characters that cannot be encoded
+        input_list_bytes = [item.encode('utf-8', errors='ignore') for item in input_list]
+        input_array_bytes = np.array(input_list_bytes, dtype=utf8_type(max_length))
+    else:
+        input_array_bytes = np.array([], dtype=utf8_type(0))
+    return input_array_bytes
 
 def first_initialize_metadata_review(hdf5_file_path, reviewer_attrs, restart = False):
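Note (editorial, not part of the diff): parse_attribute flattens a yaml attribute dict into a one-row numpy structured array, skipping the 'rename_as' key. A small illustration under that assumption, with a made-up attribute entry:

    # Hypothetical review-file entry for one attribute
    attr_value = {'rename_as': 'temperature', 'value': '25.0', 'units': 'celsius'}

    # parse_attribute(attr_value) builds one field per non-'rename_as' key,
    # each sized by the longest key name ('rename_as', 9 chars), returning
    # np.array([('25.0', 'celsius')], dtype=[('value', 'S9'), ('units', 'S9')])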
@@ -185,151 +218,61 @@ def second_save_metadata_review(review_yaml_file_path, reviewer_attrs):
     else:
         print('Nothing to commit.')
 
-def parse_attribute(attr_value):
-    dtype = []
-    values_list = []
-    max_length = 100
-    for key in attr_value.keys():
-        if not key == 'rename_as':
-            dtype.append((key, f'S{max_length}'))
-            values_list.append(attr_value[key])
-    if len(values_list) > 1:
-        new_attr_value = np.array([tuple(values_list)], dtype=dtype)
-    elif values_list:
-        new_attr_value = values_list[0]
-    else:
-        new_attr_value = 'missing'
-    return new_attr_value
-
-def convert_string_to_bytes(input_list: list):
-    utf8_type = lambda max_length: h5py.string_dtype('utf-8', max_length)
-    if input_list:
-        max_length = max(len(item) for item in input_list)
-        # Convert the strings to bytes with utf-8 encoding; errors='ignore' skips characters that cannot be encoded
-        input_list_bytes = [item.encode('utf-8', errors='ignore') for item in input_list]
-        input_array_bytes = np.array(input_list_bytes, dtype=utf8_type(max_length))
-    else:
-        input_array_bytes = np.array([], dtype=utf8_type(0))
-    return input_array_bytes
-
-def third_update_hdf5_file_with_review(input_hdf5_file, yaml_review_file, reviewer_attrs = {}, hdf5_upload : bool = False):
-    """Third"""
-    # Compare the review file with the current yaml file and, based on the changes, open the hdf5 file
-    # and access only the groups that changed. The approach below is suboptimal.
-    # TODO: only enable update if your branch is data owner
-    if not 'submitted' in get_review_status(input_hdf5_file):
-        raise ValueError('Review yaml file must be submitted before trying to perform an update. Run second_submit_metadata_review() first.')
-
-    def count(name, obj, yml_dict):
-        print(obj.name)
-        if isinstance(obj, h5py.Group) and len(obj.name.split('/')) <= 4:
-            obj_review = yml_dict[obj.name]
-            additions = [not (item in obj.attrs.keys()) for item in obj_review['attributes'].keys()]
-            count_additions = sum(additions)
-            deletions = [not (item in obj_review['attributes'].keys()) for item in obj.attrs.keys()]
-            count_deletions = sum(deletions)
-            print('additions', count_additions, 'deletions', count_deletions)
-
-    with open(yaml_review_file,'r') as stream:
+def load_yaml(yaml_review_file):
+    with open(yaml_review_file, 'r') as stream:
         try:
-            yaml_dict = yaml.load(stream, Loader=yaml.FullLoader)
+            return yaml.load(stream, Loader=yaml.FullLoader)
         except yaml.YAMLError as exc:
             print(exc)
+            return None
 
-    with h5py.File(input_hdf5_file,'r+') as f:
-        #f.visititems(lambda name, obj: count(name,obj,yaml_dict))
-        for key in yaml_dict.keys(): # keys should coincide with group names
-            print(key)
-            # Select hdf5 and yaml objects at key
-            hdf5_obj = f[key]
-            yaml_obj = yaml_dict[key]
-            count(hdf5_obj.name, hdf5_obj, yaml_dict)
-            for attr_name, attr_value in yaml_obj['attributes'].items():
-                #attr_value = yaml_obj['attributes'][attr_name]
-                if not isinstance(attr_value, dict):
-                    attr_value = {'rename_as': attr_name, 'value': attr_value, 'delete': False}
-                if attr_value.get('delete', False) and (attr_name in hdf5_obj.attrs.keys()):
-                    hdf5_obj.attrs.__delitem__(attr_name)
-                    continue
-                # Check whether attr_name belongs to the existing attributes of hdf5_obj
-                if attr_name in hdf5_obj.attrs.keys():
-                    # Retrieve the attribute's possibly new name; if 'rename_as' is absent, keep the existing name
-                    new_attr_name = attr_value.get('rename_as', attr_name)
-                    hdf5_obj.attrs[new_attr_name] = parse_attribute(attr_value)
-                    # Remove the attribute named attr_name if the yaml indicates a renaming
-                    if not (new_attr_name == attr_name):
-                        hdf5_obj.attrs.__delitem__(attr_name)
-                elif not attr_value.get('delete', False): # true inclusion; otherwise take no action
-                    hdf5_obj.attrs[attr_name] = parse_attribute(attr_value)
-        print(input_hdf5_file + ' was successfully updated\n')
-
-    # Recreate or update the yaml representation of the updated input_hdf5_file
-    output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(input_hdf5_file)
-    print(output_yml_filename_path + ' was successfully regenerated from the updated version of ', input_hdf5_file)
-
-    status_command = ['git','status']
-    add_command = lambda add_list: ['git','add'] + add_list
-    commit_command = lambda message: ['git','commit','-m', message]
-    push_command = lambda repository, refspec: ['git','push', repository, refspec]
-
-    status = subprocess.run(status_command, capture_output=True, check=True)
-
-    if hdf5_upload:
-        upload_ext = ['.h5', '.yaml']
-    else:
-        upload_ext = ['.yaml']
-
-    files_to_add_list = []
-    for line in status.stdout.splitlines():
-        # convert line from bytes to str
-        tmp = line.decode("utf-8")
-        if 'modified' in tmp:
-            if any([ext in tmp for ext in upload_ext]):
-                files_to_add_list.append(tmp.split()[1])
-
-    if files_to_add_list:
-        output = subprocess.run(add_command(files_to_add_list), capture_output=True, check=True)
-        # TODO: verify that the files were correctly staged
-        #status = subprocess.run(status_command, capture_output=True, check=True)
-        message = 'Updated hdf5 file with yaml review file.'
-        commit_output = subprocess.run(commit_command(message), capture_output=True, check=True)
-        print(commit_output.stdout)
-    else:
-        print("No h5 or yaml files were found that need to be saved. This action will not affect the review process' commit history.")
+def update_hdf5_attributes(input_hdf5_file, yaml_dict):
+
+    def update_attributes(hdf5_obj, yaml_obj):
+        for attr_name, attr_value in yaml_obj['attributes'].items():
+
+            if not isinstance(attr_value, dict):
+                attr_value = {'rename_as': attr_name, 'value': attr_value}
+
+            if attr_name in hdf5_obj.attrs.keys(): # delete or update
+                if attr_value.get('delete'): # delete when True
+                    hdf5_obj.attrs.__delitem__(attr_name)
+                elif not (attr_value.get('rename_as') == attr_name): # rename when the names differ
+                    hdf5_obj.attrs[attr_value.get('rename_as')] = hdf5_obj.attrs[attr_name] # parse_attribute(attr_value)
+                    hdf5_obj.attrs.__delitem__(attr_name)
+            else: # add a new attribute
+                hdf5_obj.attrs.update({attr_name: parse_attribute(attr_value)})
+
+    with h5py.File(input_hdf5_file, 'r+') as f:
+        for key in yaml_dict.keys(): # keys should coincide with group names
+            hdf5_obj = f[key]
+            yaml_obj = yaml_dict[key]
+            update_attributes(hdf5_obj, yaml_obj)
+
+def update_hdf5_file_with_review(input_hdf5_file, yaml_review_file):
+    yaml_dict = load_yaml(yaml_review_file)
+    update_hdf5_attributes(input_hdf5_file, yaml_dict)
+    # Regenerate the yaml snapshot of the updated HDF5 file
+    output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(input_hdf5_file)
+    print(f'{output_yml_filename_path} was successfully regenerated from the updated version of {input_hdf5_file}')
+
+def third_update_hdf5_file_with_review(input_hdf5_file, yaml_review_file, reviewer_attrs={}, hdf5_upload=False):
+    if 'submitted' not in get_review_status(input_hdf5_file):
+        raise ValueError('Review yaml file must be submitted before trying to perform an update. Run second_submit_metadata_review() first.')
+
+    update_hdf5_file_with_review(input_hdf5_file, yaml_review_file)
+    git_ops.perform_git_operations(hdf5_upload)
+
+def count(hdf5_obj, yml_dict):
+    print(hdf5_obj.name)
+    if isinstance(hdf5_obj, h5py.Group) and len(hdf5_obj.name.split('/')) <= 4:
+        obj_review = yml_dict[hdf5_obj.name]
+        additions = [not (item in hdf5_obj.attrs.keys()) for item in obj_review['attributes'].keys()]
+        count_additions = sum(additions)
+        deletions = [not (item in obj_review['attributes'].keys()) for item in hdf5_obj.attrs.keys()]
+        count_deletions = sum(deletions)
+        print('additions', count_additions, 'deletions', count_deletions)
 
 def last_submit_metadata_review(reviewer_attrs):
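Editorial note: the lower-level pieces introduced here are also usable on their own, which is the point of the refactor. A minimal sketch with placeholder paths, assuming the functions defined in this diff:

    # Parse the review file; load_yaml returns None on a YAML error.
    yaml_dict = load_yaml('review/experiment.yaml')
    if yaml_dict is not None:
        # Apply only the metadata changes: no yaml snapshot, no git operations.
        update_hdf5_attributes('data/experiment.h5', yaml_dict)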