Made def third_update_hdf5_file_with_review more modular by separating data update and git operations, resulting in new functions that can be reused in less restrictive metadata annotation contexts.
@@ -8,6 +8,15 @@ import h5py
import yaml
import src.g5505_utils as utils
import src.hdf5_vis as hdf5_vis
import src.hdf5_lib as hdf5_lib
import src.git_ops as git_ops

# TODO: incorporate lines 14-18 in git_ops module and refactor code where needed
current_branch_command = ['git','branch','--show-current']
status_command = ['git','status']
add_command = lambda add_list: ['git','add'] + add_list
rm_command = lambda rm_list: ['git','rm'] + rm_list
commit_command = lambda message: ['git','commit','-m', message]

#import input_files.config_file as config_file

import numpy as np
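For orientation, a minimal usage sketch (not part of the commit) of how these command builders are meant to be consumed; the file name and commit message below are invented:

import subprocess

branch = subprocess.run(current_branch_command, capture_output=True, check=True)
print(branch.stdout.decode('utf-8').strip())  # prints the current branch name
subprocess.run(add_command(['review.yaml']), capture_output=True, check=True)
subprocess.run(commit_command('Save metadata review'), capture_output=True, check=True)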
@@ -30,11 +39,35 @@ def get_review_status(filename_path):
            workflow_steps.append(line)
    return workflow_steps[-1]

current_branch_command = ['git','branch','--show-current']
status_command = ['git','status']
add_command = lambda add_list: ['git','add'] + add_list
rm_command = lambda rm_list: ['git','rm'] + rm_list
commit_command = lambda message: ['git','commit','-m', message]

def parse_attribute(attr_value):
    dtype = []
    values_list = []
    max_length = max(len(item) for item in attr_value.keys())
    for key in attr_value.keys():
        if (not key=='rename_as'):
            dtype.append((key,f'S{max_length}'))
            values_list.append(attr_value[key])

    if len(values_list)>1:
        new_attr_value = np.array([tuple(values_list)],dtype=dtype)
    elif values_list:
        new_attr_value = values_list[0]
    else:
        new_attr_value = 'missing'

    return new_attr_value

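As a hedged illustration of what parse_attribute produces (the attribute dict below is invented): 'rename_as' is skipped, every remaining key becomes a field of a fixed-length byte-string dtype sized by the longest key, and multi-field dicts collapse into a one-element structured array:

attr = {'value': '0.5', 'units': 'm/s', 'rename_as': 'wind_speed'}
parsed = parse_attribute(attr)
# max key length is 9 ('rename_as'), so both fields use dtype 'S9':
# array([(b'0.5', b'm/s')], dtype=[('value', 'S9'), ('units', 'S9')])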
def convert_string_to_bytes(input_list: list):
    utf8_type = lambda max_length: h5py.string_dtype('utf-8', max_length)
    if input_list:
        max_length = max(len(item) for item in input_list)
        # Convert the strings to bytes with utf-8 encoding, specifying errors='ignore' to skip characters that cannot be encoded
        input_list_bytes = [item.encode('utf-8', errors='ignore') for item in input_list]
        input_array_bytes = np.array(input_list_bytes,dtype=utf8_type(max_length))
    else:
        input_array_bytes = np.array([],dtype=utf8_type(0))

    return input_array_bytes

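A short usage sketch for convert_string_to_bytes (the list below is invented); the result is a numpy array with an h5py UTF-8 string dtype sized to the longest entry, ready to be written as an HDF5 dataset or attribute:

names = ['temperature', 'pressure', 'rh']
arr = convert_string_to_bytes(names)
# dtype is h5py.string_dtype('utf-8', 11), since 'temperature' has 11 characters;
# arr holds [b'temperature', b'pressure', b'rh']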
def first_initialize_metadata_review(hdf5_file_path, reviewer_attrs, restart = False):

@@ -185,151 +218,61 @@ def second_save_metadata_review(review_yaml_file_path, reviewer_attrs):
    else:
        print('Nothing to commit.')

def parse_attribute(attr_value):
    dtype = []
    values_list = []
    max_length = 100
    for key in attr_value.keys():
        if (not key=='rename_as'):
            dtype.append((key,f'S{max_length}'))
            values_list.append(attr_value[key])

    if len(values_list)>1:
        new_attr_value = np.array([tuple(values_list)],dtype=dtype)
    elif values_list:
        new_attr_value = values_list[0]
    else:
        new_attr_value = 'missing'

    return new_attr_value

def convert_string_to_bytes(input_list: list):
    utf8_type = lambda max_length: h5py.string_dtype('utf-8', max_length)
    if input_list:
        max_length = max(len(item) for item in input_list)
        # Convert the strings to bytes with utf-8 encoding, specifying errors='ignore' to skip characters that cannot be encoded
        input_list_bytes = [item.encode('utf-8', errors='ignore') for item in input_list]
        input_array_bytes = np.array(input_list_bytes,dtype=utf8_type(max_length))
    else:
        input_array_bytes = np.array([],dtype=utf8_type(0))

    return input_array_bytes

def third_update_hdf5_file_with_review(input_hdf5_file, yaml_review_file, reviewer_attrs = {}, hdf5_upload : bool = False):

    """Third"""
    # compare review file with current yaml file and then, based on the changes, open hdf5 file and access only
    # groups that changed :). the below approach is suboptimal

    # TODO: only enable update if your branch is data owner :)

    if not 'submitted' in get_review_status(input_hdf5_file):
        raise ValueError('Review yaml file must be submitted before trying to perform an update. Run second_submit_metadata_review() first.')

    def count(name,obj,yml_dict):
        print(obj.name)
        if isinstance(obj,h5py.Group) and len(obj.name.split('/')) <= 4:
            obj_review = yml_dict[obj.name]

            additions = [not (item in obj.attrs.keys()) for item in obj_review['attributes'].keys()]
            count_additions = sum(additions)

            deletions = [not (item in obj_review['attributes'].keys()) for item in obj.attrs.keys()]
            count_deletions = sum(deletions)

            print('additions',count_additions, 'deletions', count_deletions)

    with open(yaml_review_file,'r') as stream:
        #
def load_yaml(yaml_review_file):
    with open(yaml_review_file, 'r') as stream:
        try:
            yaml_dict = yaml.load(stream, Loader=yaml.FullLoader)
            return yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            print(exc)
            return None

    with h5py.File(input_hdf5_file,'r+') as f:
        #f.visititems(lambda name, obj: count(name,obj,yaml_dict))
        for key in yaml_dict.keys(): # keys should coincide with group names
            print(key)
            # Select hdf5 and yaml objects at key
            hdf5_obj = f[key]
            yaml_obj = yaml_dict[key]
def update_hdf5_attributes(input_hdf5_file, yaml_dict):

            count(hdf5_obj.name, hdf5_obj, yaml_dict)
def update_attributes(hdf5_obj, yaml_obj):
    for attr_name, attr_value in yaml_obj['attributes'].items():

            for attr_name, attr_value in yaml_obj['attributes'].items():
                #attr_value = yaml_obj['attributes'][attr_name]
        if not isinstance(attr_value, dict):
            attr_value = {'rename_as': attr_name, 'value': attr_value}

        if (attr_name in hdf5_obj.attrs.keys()): # delete or update
            if attr_value.get('delete'): # delete when True
                hdf5_obj.attrs.__delitem__(attr_name)
            elif not (attr_value.get('rename_as') == attr_name): # update when true
                hdf5_obj.attrs[attr_value.get('rename_as')] = hdf5_obj.attrs[attr_name] # parse_attribute(attr_value)
                hdf5_obj.attrs.__delitem__(attr_name)
        else: # add a new attribute
            hdf5_obj.attrs.update({attr_name : parse_attribute(attr_value)})

                if not isinstance(attr_value,dict):
                    attr_value = {'rename_as':attr_name, 'value':attr_value, 'delete': False}
    with h5py.File(input_hdf5_file, 'r+') as f:
        for key in yaml_dict.keys():
            hdf5_obj = f[key]
            yaml_obj = yaml_dict[key]
            update_attributes(hdf5_obj, yaml_obj)

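To make the rename/delete semantics above concrete, a hedged sketch of a review yaml entry that update_attributes could consume (group and attribute names are invented); a plain value is added via parse_attribute, 'delete: true' removes an attribute, and 'rename_as' moves its value under a new name:

/Measurement/Group1:
  attributes:
    temperature:
      rename_as: temp_celsius
    obsolete_flag:
      delete: true
    operator: J. Doe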
                if attr_value.get('delete',False) and (attr_name in hdf5_obj.attrs.keys()):
                    hdf5_obj.attrs.__delitem__(attr_name)
                    continue

                # Check whether attr_name belongs to the existing attributes of hdf5_obj
                if attr_name in hdf5_obj.attrs.keys():
                    #else: # renaming attribute and possibly change of value assignment

                    #if isinstance(attr_value,dict):
                    # # Retrieve possibly new attribute's name and value
                    new_attr_name = attr_value.get('rename_as',attr_name) # if 'rename_as' is a key in attr_value returns the value, otherwise it returns the existing value

                    hdf5_obj.attrs[new_attr_name] = parse_attribute(attr_value)

                    # Remove from hdf5_obj.attrs attribute w/ name: attr_name if
                    # yaml indicates a renaming of the attribute.
                    if not (new_attr_name == attr_name):
                        hdf5_obj.attrs.__delitem__(attr_name)

                    #else:
                    #    hdf5_obj.attrs[attr_name] = attr_value
                elif not attr_value.get('delete',False): # if true inclusion, else don't take any action
                    #hdf5_obj.attrs.__delitem__(attr_name): # attribute inclusion
                    #if isinstance(attr_value,dict):
                    # Retrieve new attribute's name and value
                    # new_attr_name = attr_value.get('rename_as',attr_name) # if 'rename_as' is a key in attr_value returns the value, otherwise it returns the existing value
                    # new_attr_value = attr_value.get('value',np.nan) # TODO: let the user know why np.nan might have been assigned
                    hdf5_obj.attrs[attr_name] = parse_attribute(attr_value)
                    #else:
                    #    hdf5_obj.attrs[attr_name] = attr_value
        print(input_hdf5_file + ' was successfully updated\n')

# Recreate/or update yaml representation of updated input_hdf5_file.
def update_hdf5_file_with_review(input_hdf5_file, yaml_review_file):
    yaml_dict = load_yaml(yaml_review_file)
    update_hdf5_attributes(input_hdf5_file, yaml_dict)
    # Regenerate yaml snapshot of updated HDF5 file
    output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(input_hdf5_file)
    print(output_yml_filename_path + ' was successfully regenerated from the updated version of ', input_hdf5_file)
    print(f'{output_yml_filename_path} was successfully regenerated from the updated version of {input_hdf5_file}')

def third_update_hdf5_file_with_review(input_hdf5_file, yaml_review_file, reviewer_attrs={}, hdf5_upload=False):
    if 'submitted' not in get_review_status(input_hdf5_file):
        raise ValueError('Review yaml file must be submitted before trying to perform an update. Run second_submit_metadata_review() first.')

    update_hdf5_file_with_review(input_hdf5_file, yaml_review_file)
    git_ops.perform_git_operations(hdf5_upload)

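A hedged end-to-end sketch of the intended three-step call order after this refactor (file paths and the reviewer_attrs contents are invented):

reviewer = {'name': 'reviewer'}
first_initialize_metadata_review('data/experiment.h5', reviewer)
# ... the reviewer edits the yaml file generated by the step above ...
second_save_metadata_review('review/experiment_review.yaml', reviewer)
third_update_hdf5_file_with_review('data/experiment.h5', 'review/experiment_review.yaml', hdf5_upload=True)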
    status_command = ['git','status']
    add_command = lambda add_list: ['git','add'] + add_list
    commit_command = lambda message: ['git','commit','-m', message]
    push_command = lambda repository,refspec: ['git','push',repository,refspec]

    status = subprocess.run(status_command,capture_output=True,check=True)

    if hdf5_upload:
        upload_ext = ['.h5','.yaml']
    else:
        upload_ext = ['.yaml']

    files_to_add_list = []
    for line in status.stdout.splitlines():
        # convert line from bytes to str
        tmp = line.decode("utf-8")
        if 'modified' in tmp:
            if any([ext in tmp for ext in upload_ext]):
                files_to_add_list.append(tmp.split()[1])
    if files_to_add_list:
        output = subprocess.run(add_command(files_to_add_list),capture_output=True,check=True)
        # TODO: verify if files were correctly staged
        #status = subprocess.run(status_command,capture_output=True,check=True)
        message = 'Updated hdf5 file with yaml review file.'
        commit_output = subprocess.run(commit_command(message),capture_output=True,check=True)
        print(commit_output.stdout)
    else:
        print("No h5 or yaml files needing to be saved were found. This action will have no effect on the review process' commit history.")

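As an aside on the block above, which scans the human-readable output of 'git status', a hedged alternative sketch using 'git status --porcelain', whose two-letter status codes and fixed column layout are stable for scripting (this reuses upload_ext from the block above and is not what the commit does):

status = subprocess.run(['git', 'status', '--porcelain'], capture_output=True, check=True)
files_to_add_list = []
for line in status.stdout.decode('utf-8').splitlines():
    code, path = line[:2], line[3:]  # porcelain v1: two status letters, a space, then the path
    if 'M' in code and any(path.endswith(ext) for ext in upload_ext):
        files_to_add_list.append(path)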
def count(hdf5_obj,yml_dict):
    print(hdf5_obj.name)
    if isinstance(hdf5_obj,h5py.Group) and len(hdf5_obj.name.split('/')) <= 4:
        obj_review = yml_dict[hdf5_obj.name]
        additions = [not (item in hdf5_obj.attrs.keys()) for item in obj_review['attributes'].keys()]
        count_additions = sum(additions)
        deletions = [not (item in obj_review['attributes'].keys()) for item in hdf5_obj.attrs.keys()]
        count_deletions = sum(deletions)
        print('additions',count_additions, 'deletions', count_deletions)

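A brief hedged sketch of how the now-standalone count helper might be driven (file paths are invented); per reviewed group it reports how many attributes the yaml review would add or delete:

yml_dict = load_yaml('review/experiment_review.yaml')
with h5py.File('data/experiment.h5', 'r') as f:
    for key in yml_dict.keys():
        count(f[key], yml_dict)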
def last_submit_metadata_review(reviewer_attrs):