# Source: dima/src/metadata_review_lib.py (Python; listed as 339 lines, 14 KiB)

import sys
import os
# Put the current working directory on sys.path before the project imports
# below (`src.*`, `input_files.*`) — presumably so they resolve when the code
# is run from the project root; confirm against how the module is launched.
root_dir = os.path.abspath(os.curdir)
sys.path.append(root_dir)
import subprocess
import h5py
import yaml
import shutil
import src.g5505_utils as utils
import src.hdf5_vis as hdf5_vis
import input_files.config_file as config_file
import src.hidden as hidden
import numpy as np
import pygit2 as pygit
# File extensions used when building the names of the review files.
YAML_EXT = ".yaml"
TXT_EXT = ".txt"
# Repository handle created at import time from the current working directory.
# The review functions below use it to create/check out branches, stage files
# and commit.
# NOTE(review): because this runs on import, the module presumably cannot be
# imported when the working directory is not a git repository — confirm.
repo_obj = pygit.Repository(os.path.abspath(os.curdir))
def get_review_status(filename_path):
    """Return the latest status recorded for the review of ``filename_path``.

    The status is read from ``review/<name>-review_status.txt``, where
    ``<name>`` is the base name of ``filename_path`` without its extension.
    Each workflow step writes one line to that file; the last non-blank line
    is the current status.

    Parameters
    ----------
    filename_path : str
        Path of the file under review. Only its base name is used.

    Returns
    -------
    str
        The last non-blank line of the status file, without surrounding
        whitespace (e.g. ``'under review'`` or ``'submitted'``).

    Raises
    ------
    FileNotFoundError
        If the status file does not exist (review never initialized).
    ValueError
        If the status file contains no status entry at all.
    """
    filename, _ = os.path.splitext(os.path.basename(filename_path))
    status_path = os.path.join("review/", filename + "-review_status" + TXT_EXT)

    last_status = None
    with open(status_path, 'r') as f:
        for line in f:
            entry = line.strip()
            # Skip blank lines so that a trailing newline (e.g. added by a
            # text editor) does not hide the real status.
            if entry:
                last_status = entry

    if last_status is None:
        raise ValueError("The review status file " + status_path + " is empty.")
    return last_status
def checkout_review_branch(repo_obj, initials):
    """Check out the reviewer's branch, creating it from HEAD if it is missing.

    The branch is named ``metadata-review-by-<initials>``.

    Parameters
    ----------
    repo_obj : repository object
        Must provide ``head``, ``branches``, ``create_branch`` and ``checkout``.
    initials : str
        Reviewer initials appended to the branch name.
    """
    review_branch_name = 'metadata-review-by-' + initials
    # HEAD is resolved up front, whether or not a new branch is needed.
    tip_commit = repo_obj.head.peel()

    if review_branch_name in repo_obj.branches:
        target_branch = repo_obj.branches[review_branch_name]
    else:
        target_branch = repo_obj.create_branch(review_branch_name, tip_commit)

    repo_obj.checkout(target_branch)
def first_initialize_metadata_review(filename_path, initials):
    """Initialize the metadata review of an hdf5 file.

    Creates (if needed) a copy of the file's yaml representation in the
    ``review`` folder, writes a status file containing ``under review`` next
    to it, checks out the branch ``metadata-review-by-<initials>`` and commits
    the review files on it.

    Parameters
    ----------
    filename_path : str
        Path to the h5 file under review. A yaml snapshot with the same base
        name must exist in the same folder.
    initials : str
        Reviewer initials, used to name the review branch.

    Raises
    ------
    ValueError
        If ``filename_path`` does not have an h5 extension, or if the yaml
        snapshot of the h5 file is missing.
    """
    filename_path_tail, filename_path_head = os.path.split(filename_path)
    filename, ext = os.path.splitext(filename_path_head)

    # Check file_path points to h5 file
    if 'h5' not in ext:
        raise ValueError("filename_path needs to point to a suitable h5 file.")

    # Verify if yaml snapshot of input h5 file exists
    yaml_snapshot_path = os.path.join(filename_path_tail, filename + YAML_EXT)
    if not os.path.exists(yaml_snapshot_path):
        raise ValueError("metadata review cannot be initialized. The associated .yaml file under review was not found. Run take_yml_snapshot_of_hdf5_file(filename_path) ")

    # Initialize metadata review workflow
    print("Create branch metadata-review-by-"+initials+"\n")

    # Reuse an existing review copy; otherwise create it from the snapshot.
    review_filename_path = os.path.join("review/", filename + YAML_EXT)
    if not os.path.exists(review_filename_path):
        review_filename_path = utils.make_file_copy(yaml_snapshot_path, 'review')

    # (Re)start the status log of this review.
    review_filename_tail, _ = os.path.splitext(review_filename_path)
    with open(review_filename_tail + "-review_status" + TXT_EXT, 'w') as f:
        f.write('under review')

    checkout_review_branch(repo_obj, initials)

    # Stage every changed path that belongs to this file's review.
    index = repo_obj.index
    for filepath in repo_obj.status():
        if 'review/' + filename in filepath:
            index.add(filepath)
    # Persist the staged entries; without this the on-disk index would not
    # match the commit created below.
    index.write()

    author = config_file.author  # default_signature
    committer = config_file.committer
    message = "Initialized metadata review process."
    tree = index.write_tree()
    repo_obj.create_commit('HEAD', author, committer, message, tree, [repo_obj.head.peel().id])
def second_submit_metadata_review(filename_path, initials):
    """Submit a finished metadata review.

    Run this once you are done editing the yaml representation of the hdf5
    file in the ``review`` folder. The function appends ``submitted`` to the
    review status file, checks out the branch ``metadata-review-by-<initials>``
    and commits the modified .yaml and .txt review files.

    Nothing is changed (a notice is printed) when the latest review status is
    neither ``under review`` nor ``submitted``.

    Parameters
    ----------
    filename_path : str
        Path to the h5 file under review; only its base name is used to
        locate the review files.
    initials : str
        Reviewer initials, used to name the review branch.
    """
    # 1. Verify that the review was initialized first.
    current_status = get_review_status(filename_path)
    if not any(status in current_status for status in ('under review', 'submitted')):
        print('The review status is "' + current_status + '"; nothing was submitted.')
        return

    filename, _ = os.path.splitext(os.path.basename(filename_path))

    # 2. Record the submission in the status file.
    with open(os.path.join("review/", filename + "-review_status" + TXT_EXT), 'a') as f:
        f.write('\nsubmitted')

    # 3. Commit the modified review files on the reviewer's branch.
    checkout_review_branch(repo_obj, initials)

    index = repo_obj.index
    for filepath, file_status in repo_obj.status().items():
        # The status is a set of bit flags, so test the flag instead of
        # comparing for equality (a file may carry other flags as well).
        if 'review/' + filename in filepath and file_status & pygit.GIT_STATUS_WT_MODIFIED:
            index.add(filepath)
    # Persist the staged entries so the on-disk index matches the commit.
    index.write()

    author = config_file.author  # default_signature
    committer = config_file.committer
    message = "Submitted metadata review."
    tree = index.write_tree()
    repo_obj.create_commit('HEAD', author, committer, message, tree, [repo_obj.head.peel().id])
def third_complete_metadata_review(initials):
    """Push the reviewer's branch ``metadata-review-by-<initials>`` to ``origin``.

    The push only happens when the branch exists locally and is the branch
    currently checked out; otherwise an explanation is printed and nothing
    is pushed.

    Parameters
    ----------
    initials : str
        Reviewer initials, used to name the review branch.

    Returns
    -------
    int or None
        Return code of ``git push`` when the push ran, ``None`` when one of
        the checks failed.

    Raises
    ------
    subprocess.CalledProcessError
        If any of the git commands exits with a non-zero status.
    """
    repository = 'origin'
    branch_name = 'metadata-review-by-' + initials  # refspec

    branches = subprocess.run(['git', 'branch', '--list'], capture_output=True, text=True, check=True)
    # Each line carries a two-character marker column ("* ", "  ", "+ ")
    # before the branch name. Compare whole names: a substring test would
    # also accept e.g. "metadata-review-by-NGX" for initials "NG".
    local_branches = {line[2:].strip() for line in branches.stdout.splitlines()}
    if branch_name not in local_branches:
        print('There is no branch named '+branch_name+'.\n')
        print('Make sure to run metadata reviewer workflow from the beginning without missing any steps.')
        return

    curr_branch = subprocess.run(['git', 'branch', '--show-current'], capture_output=True, text=True, check=True)
    if curr_branch.stdout.strip() != branch_name:
        print('Complete metadata review could not be completed.\n')
        print('Make sure a metadata-reviewer workflow has already been started on branch '+branch_name+'\n')
        print('The step "Complete metadata review" will have no effect.')
        return

    # text=True so the captured output prints as text rather than b'...'.
    result = subprocess.run(['git', 'push', repository, branch_name], capture_output=True, text=True, check=True)
    print(result.stdout)
    return result.returncode
def third_update_hdf5_file_with_review(input_hdf5_file, yaml_file):
    """Apply a submitted yaml review to the attributes of an hdf5 file.

    For every key of the yaml file (keys should coincide with hdf5 object
    names) the entries under ``attributes`` are written to the matching hdf5
    object. An entry is either a plain value, or a dict with the optional
    keys ``rename_as`` (new attribute name) and ``value`` (new value).
    Afterwards the yaml snapshot of the hdf5 file is regenerated and every
    modified .h5/.yaml file reported by ``git status`` is added and committed.

    Parameters
    ----------
    input_hdf5_file : str
        Path to the hdf5 file to update (opened in ``r+`` mode).
    yaml_file : str
        Path to the reviewed yaml file.

    Raises
    ------
    ValueError
        If the review has not been submitted yet, or if ``yaml_file`` cannot
        be parsed.
    subprocess.CalledProcessError
        If one of the git commands exits with a non-zero status.
    """
    # TODO: compare the review file with the current yaml file and only touch
    # the groups that changed; the approach below is suboptimal.
    # TODO: only enable update if your branch is data owner :)
    if 'submitted' not in get_review_status(input_hdf5_file):
        raise ValueError('Review yaml file must be submitted before trying to perform an update. Run first second_submit_metadata_review().')

    with open(yaml_file, 'r') as stream:
        try:
            # NOTE(review): FullLoader is used on a hand-edited file; consider
            # yaml.safe_load if review files can come from untrusted sources.
            yaml_dict = yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            # Stop here: continuing without a parsed review would only fail
            # later with an unrelated NameError.
            raise ValueError('Review yaml file ' + str(yaml_file) + ' could not be parsed.') from exc
    # An empty yaml document loads as None; treat it as "no changes".
    yaml_dict = yaml_dict or {}

    with h5py.File(input_hdf5_file, 'r+') as f:
        for key, yaml_obj in yaml_dict.items():  # keys should coincide with group names
            print(key)
            hdf5_obj = f[key]
            _print_attribute_changes(hdf5_obj, yaml_obj)
            for attr_name, attr_value in yaml_obj['attributes'].items():
                _apply_reviewed_attribute(hdf5_obj.attrs, attr_name, attr_value)

    # Recreate/or update yaml representation of updated input_hdf5_file.
    hdf5_vis.take_yml_snapshot_of_hdf5_file(input_hdf5_file)

    status = subprocess.run(['git', 'status'], capture_output=True, encoding='utf-8', check=True)
    files_to_add_list = _modified_h5_and_yaml_files(status.stdout)

    if files_to_add_list:
        subprocess.run(['git', 'add'] + files_to_add_list, capture_output=True, check=True)
        # TODO: verify if files were correctly staged
        message = 'Updated hdf5 file with yaml review file.'
        subprocess.run(['git', 'commit', '-m', message], capture_output=True, check=True)
    else:
        print("There were no found h5 and yaml files, needing to be saved. This action will not have effect on the review process' commit history.")


def _print_attribute_changes(obj, obj_review):
    """Print the name of ``obj`` and, for shallow groups, how many attributes the review adds or omits."""
    print(obj.name)
    if isinstance(obj, h5py.Group) and len(obj.name.split('/')) <= 4:
        reviewed_attrs = obj_review['attributes']
        count_additions = sum(name not in obj.attrs for name in reviewed_attrs)
        count_deletions = sum(name not in reviewed_attrs for name in obj.attrs)
        print('additions', count_additions, 'deletions', count_deletions)


def _apply_reviewed_attribute(attrs, attr_name, attr_value):
    """Write one reviewed attribute to ``attrs``, handling the ``rename_as``/``value`` dict form."""
    if not isinstance(attr_value, dict):
        attrs[attr_name] = attr_value
        return

    already_present = attr_name in attrs
    # Without 'rename_as' the attribute keeps its current name.
    new_attr_name = attr_value.get('rename_as', attr_name)
    if already_present:
        # Without 'value' an existing attribute keeps its current value.
        new_attr_value = attr_value.get('value', attrs[attr_name])
    else:
        # TODO: let the user know why np.nan might have been assigned
        new_attr_value = attr_value.get('value', np.nan)
    attrs[new_attr_name] = new_attr_value

    # A rename leaves the old attribute behind; remove it.
    if already_present and new_attr_name != attr_name:
        del attrs[attr_name]


def _modified_h5_and_yaml_files(status_text):
    """Return the .h5/.yaml paths listed as ``modified:`` in ``git status`` output."""
    modified_files = []
    for line in status_text.splitlines():
        entry = line.strip()
        if not entry.startswith('modified:'):
            continue
        # Keep everything after the label so paths containing spaces survive.
        path = entry[len('modified:'):].strip()
        if path.endswith(('.h5', '.yaml')):
            modified_files.append(path)
    return modified_files
#with open('review/review_status.txt','r+') as f:
# f.write('hdf5 file updated w/ metadata review')
#try:
# yaml_dict = yaml.load(stream, Loader=yaml.FullLoader)
#except yaml.YAMLError as exc:
# print(exc)
def fourth_complete_metadata_review(initials):
    """Push the data owner's branch ``data-owner-review-by-<initials>`` to ``origin``.

    The push only happens when the branch exists locally and is the branch
    currently checked out; otherwise an explanation is printed and nothing
    is pushed.

    Parameters
    ----------
    initials : str
        Data owner initials, used to name the branch.

    Returns
    -------
    bool or None
        ``True`` when the push ran, ``None`` when one of the checks failed.

    Raises
    ------
    subprocess.CalledProcessError
        If any of the git commands exits with a non-zero status.
    """
    repository = 'origin'
    branch_name = 'data-owner-review-by-' + initials

    branches = subprocess.run(['git', 'branch', '--list'], capture_output=True, text=True, check=True)
    # Each line carries a two-character marker column ("* ", "  ", "+ ")
    # before the branch name. Compare whole names: a substring test would
    # also accept a longer branch name that merely contains this one.
    local_branches = {line[2:].strip() for line in branches.stdout.splitlines()}
    if branch_name not in local_branches:
        print('There is no branch named '+branch_name+'.\n')
        print('Make sure to run data owner review workflow from the beginning without missing any steps.')
        return

    curr_branch = subprocess.run(['git', 'branch', '--show-current'], capture_output=True, text=True, check=True)
    if curr_branch.stdout.strip() != branch_name:
        print('Complete metadata review could not be completed.\n')
        print('Make sure a data-owner workflow has already been started on branch '+branch_name+'\n')
        print('The step "Complete metadata review" will have no effect.')
        return

    # push
    result = subprocess.run(['git', 'push', repository, branch_name], capture_output=True, text=True, check=True)
    print(result.stdout)

    # Remaining clean-up steps, not implemented yet:
    # 1. git add output_files/
    # 2. delete review/
    #shutil.rmtree(os.path.join(os.path.abspath(os.curdir),"review"))
    # 3. git rm review/
    # 4. git commit -m "Completed review process. Current state of hdf5 file and yml should be up to date."
    return True
#import config_file
#import hdf5_vis
def main():
    """Scaffold for running the review workflow on a sample output file.

    Only the input paths are prepared; the workflow steps themselves are kept
    as comments and have to be enabled by hand, one at a time.
    """
    output_filename_path = "output_files/unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.h5"
    # The snapshot uses the ".yaml" extension (the previous ".yalm" spelling
    # could never match a file produced by the workflow).
    output_yml_filename_path = "output_files/unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.yaml"
    output_yml_filename_path_tail, filename = os.path.split(output_yml_filename_path)
    #output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename_path)
    #first_initialize_metadata_review(output_filename_path,initials='NG')
    #second_submit_metadata_review(output_filename_path,initials='NG')
    #if os.path.exists(os.path.join(os.path.join(os.path.abspath(os.curdir),"review"),filename)):
    #    third_update_hdf5_file_with_review(output_filename_path, os.path.join(os.path.join(os.path.abspath(os.curdir),"review"),filename))
    #fourth_complete_metadata_review(initials='NG')
#if __name__ == '__main__':
# main()