# File: dima/src/metadata_review_lib.py
# (repository viewer listing: 379 lines, 16 KiB, Python)

import sys
import os
root_dir = os.path.abspath(os.curdir)
sys.path.append(root_dir)
import subprocess
import h5py
import yaml
import shutil
import src.g5505_utils as utils
import src.hdf5_vis as hdf5_vis
#import input_files.config_file as config_file
import src.hidden as hidden
import numpy as np
import pygit2 as pygit
# File extensions used when building the names of review artefacts
# (YAML copy under review and its plain-text status log).
YAML_EXT = ".yaml"
TXT_EXT = ".txt"
# pygit2 handle on the repository located at the current working directory.
# NOTE(review): this is opened at import time, so importing the module from a
# directory that pygit2 cannot resolve to a repository presumably raises --
# confirm against the pygit2 version in use.
repo_obj = pygit.Repository(os.path.abspath(os.curdir))
def get_review_status(filename_path):
    """Return the most recent entry of the review-status log of a file.

    The log is read from ``review/<stem>-review_status.txt``, where ``<stem>``
    is the base name of ``filename_path`` without its extension. Only the base
    name of ``filename_path`` is used; its directory part is ignored.

    Returns the last line of the log exactly as stored. Raises ``IndexError``
    if the log is empty and ``FileNotFoundError`` if it does not exist.
    """
    stem = os.path.splitext(os.path.basename(filename_path))[0]
    status_log_path = os.path.join("review/", stem + "-review_status" + TXT_EXT)
    # TODO:
    with open(status_log_path, 'r') as status_log:
        entries = status_log.readlines()
    # Entries are appended chronologically, so the last one is the current state.
    return entries[-1]
def checkout_review_branch(branch_name):
    """Switch the working tree to ``branch_name`` through the git command line.

    Runs ``git checkout <branch_name>`` in the current working directory and
    prints whatever git wrote to stdout.

    Parameters
    ----------
    branch_name : str
        Name handed to ``git checkout``. No ``-b`` flag is used, so this
        function does not explicitly create the branch.

    Raises
    ------
    subprocess.CalledProcessError
        If ``git checkout`` exits with a non-zero status.
    """
    # The git CLI does all the work; no pygit2 lookup of HEAD is needed here.
    checkout = subprocess.run(
        ['git', 'checkout', branch_name],
        capture_output=True,
        text=True,
        check=True,
    )
    print(checkout.stdout)
# Argument vectors (for ``subprocess.run``) of the git commands shared by the
# review workflow functions in this module.
current_branch_command = ['git', 'branch', '--show-current']
status_command = ['git', 'status']


def add_command(add_list):
    """Return the ``git add`` argument vector for the paths in ``add_list``.

    ``add_list`` may be any iterable of path strings (list, tuple, ...).
    """
    return ['git', 'add', *add_list]


def commit_command(message):
    """Return the ``git commit -m`` argument vector for ``message``."""
    return ['git', 'commit', '-m', message]
def first_initialize_metadata_review(hdf5_file_path, reviewer_attrs):
    """
    First: Initialize the review branch with a review folder holding a copy of the yaml
    representation of the hdf5 file under review, and create a txt file with the state of
    the review process, e.g., under review.

    Parameters
    ----------
    hdf5_file_path : str
        Path to the hdf5 file under review. A yaml snapshot with the same base
        name must already exist next to it.
    reviewer_attrs : dict
        Must provide the keys 'initials' and 'type'. The review branch is
        expected to be named '<type>-review-by-<initials>' and must be the
        currently checked-out branch.

    Returns
    -------
    The path of the yaml file inside the review folder (either the already
    existing 'review/<name>.yaml' or whatever utils.make_file_copy returned).

    Raises
    ------
    ValueError
        If the path does not have an h5 extension, if the yaml snapshot is
        missing, or if the review branch is not checked out.
    subprocess.CalledProcessError
        If any of the git commands fails.
    """
    initials = reviewer_attrs['initials']
    branch_name = '-'.join([reviewer_attrs['type'], 'review', 'by', initials])

    hdf5_dir, hdf5_basename = os.path.split(hdf5_file_path)
    filename, ext = os.path.splitext(hdf5_basename)

    # Check file_path points to h5 file
    if 'h5' not in ext:
        raise ValueError("filename_path needs to point to an h5 file.")

    # Verify if yaml snapshot of input h5 file exists
    snapshot_path = os.path.join(hdf5_dir, filename + YAML_EXT)
    if not os.path.exists(snapshot_path):
        raise ValueError("metadata review cannot be initialized. The associated .yaml file under review was not found. Run take_yml_snapshot_of_hdf5_file(filename_path) ")

    # Check you are working at the right branch
    curr_branch = subprocess.run(
        ['git', 'branch', '--show-current'],
        capture_output=True, text=True, check=True,
    )
    if branch_name not in curr_branch.stdout:
        raise ValueError("Branch "+branch_name+" was not found. \nPlease open a Git Bash Terminal, and follow the below instructions: \n1. Change directory to your project's directory. \n2. Excecute the command: git checkout "+branch_name)

    # Copy the snapshot into the review folder unless a review copy exists.
    review_yaml_file_path = os.path.join("review/", filename + YAML_EXT)
    if not os.path.exists(review_yaml_file_path):
        review_yaml_file_path = utils.make_file_copy(snapshot_path, 'review')

    # Create the status log only once. Rewriting it on a repeated call would
    # wipe an already recorded 'submitted' entry and leave an uncommitted
    # modification that this function never stages.
    review_stem, _ = os.path.splitext(review_yaml_file_path)
    review_status_file_path = review_stem + "-review_status" + ".txt"
    if not os.path.exists(review_status_file_path):
        with open(review_status_file_path, 'w') as f:
            f.write('under review')

    # List untracked review files with a plumbing command instead of parsing
    # the human-readable `git status` text: that text also contains lines such
    # as "new file:   review/x", which are not valid paths for `git add`.
    # With -z the paths are NUL-separated and never quoted.
    untracked = subprocess.run(
        ['git', 'ls-files', '--others', '--exclude-standard', '-z', '--', 'review/'],
        capture_output=True, text=True, check=True,
    )
    untracked_files = [path for path in untracked.stdout.split('\0') if path]

    # Stage untracked review files and commit them to local repository
    if untracked_files:
        subprocess.run(add_command(untracked_files), capture_output=True, check=True)
        message = 'Initialized metadata review.'
        commit_output = subprocess.run(
            commit_command(message), capture_output=True, text=True, check=True,
        )
        for line in commit_output.stdout.splitlines():
            print(line)
    else:
        print('This action will not have any effect because metadata review process has been already initialized.')

    return review_yaml_file_path
def second_save_metadata_review(review_yaml_file_path, reviewer_attrs):
    """
    Second: Once you're done reviewing the yaml representation of the hdf5 file in the
    review folder, run this function to append 'submitted' to the review status file and
    save (add and commit) the modified .yaml and .txt files in the project.

    Parameters
    ----------
    review_yaml_file_path : str
        Path of the reviewed yaml file. Only its base name is used; the file
        is looked up as 'review/<base name>'.
    reviewer_attrs : dict
        Must provide the keys 'initials' and 'type'. The branch
        '<type>-review-by-<initials>' must be the currently checked-out branch.

    Raises
    ------
    ValueError
        If the review branch is not checked out.
    subprocess.CalledProcessError
        If any of the git commands fails.

    Nothing is committed (and 'Nothing to commit.' is printed) when the review
    yaml file has no modifications with respect to the last commit.
    """
    initials = reviewer_attrs['initials']
    branch_name = '-'.join([reviewer_attrs['type'], 'review', 'by', initials])

    # Check you are working at the right branch
    curr_branch = subprocess.run(
        ['git', 'branch', '--show-current'],
        capture_output=True, text=True, check=True,
    )
    if branch_name not in curr_branch.stdout:
        raise ValueError('Please checkout ' + branch_name + ' via Git Bash before submitting metadata review files. ')

    review_yaml_name = os.path.basename(review_yaml_file_path)
    filename, _ = os.path.splitext(review_yaml_name)
    review_file = os.path.join("review/", review_yaml_name)

    # Ask git directly whether the review file differs from the last commit.
    # --diff-filter=M keeps only modified files, --relative reports paths
    # relative to the current directory and -z separates them with NUL without
    # quoting, so paths containing spaces survive and no human-readable
    # (possibly localized) status text has to be parsed.
    diff = subprocess.run(
        ['git', 'diff', '--name-only', '--diff-filter=M', '--relative', '-z',
         'HEAD', '--', review_file],
        capture_output=True, text=True, check=True,
    )
    modified_files = [path for path in diff.stdout.split('\0') if path]

    # Stage modified files and commit them to local repository
    if modified_files:
        review_status_file_path = os.path.join("review/", filename + "-review_status" + TXT_EXT)
        with open(review_status_file_path, 'a') as f:
            f.write('\nsubmitted')
        modified_files.append(review_status_file_path)

        subprocess.run(add_command(modified_files), capture_output=True, check=True)
        message = 'Submitted metadata review.'
        commit_output = subprocess.run(
            commit_command(message), capture_output=True, text=True, check=True,
        )
        for line in commit_output.stdout.splitlines():
            print(line)
    else:
        print('Nothing to commit.')
def third_update_hdf5_file_with_review(input_hdf5_file, yalm_review_file, reviewer_attrs = None, hdf5_upload : bool = False):
    """Third: apply the attribute edits of a submitted yaml review to the hdf5 file.

    The yaml review file is read as a mapping whose keys are used to index the
    hdf5 file and whose values hold an 'attributes' mapping. For every
    attribute entry:

    * a non-dict value is written to the attribute as is;
    * a dict value may carry 'rename_as' (new attribute name) and 'value'.
      For an attribute that already exists, a dict with more than two keys
      besides 'rename_as' is stored as a one-element structured array with
      one fixed-length (100 bytes) string field per key; otherwise 'value' is
      stored, defaulting to the current attribute value. For a new attribute,
      'value' is stored, defaulting to NaN. If the name changed, the old
      attribute is deleted.

    Afterwards the yaml snapshot of the hdf5 file is regenerated and the
    modified .yaml files (and .h5 files when ``hdf5_upload`` is True) are
    staged and committed.

    Parameters
    ----------
    input_hdf5_file : str
        Path of the hdf5 file to update (opened in 'r+' mode).
    yalm_review_file : str
        Path of the reviewed yaml file.
    reviewer_attrs : dict, optional
        Currently unused.
    hdf5_upload : bool
        Whether modified .h5 files are committed in addition to .yaml files.

    Raises
    ------
    ValueError
        If the review has not been submitted yet, or if the yaml review file
        does not contain a mapping.
    yaml.YAMLError
        If the yaml review file cannot be parsed.
    subprocess.CalledProcessError
        If any of the git commands fails.
    """
    # compare review file with current yaml file and then based on the changes open hdf5 file and access only
    # groups that changed :). the below approach is suboptimal

    # TODO: only enable update if your branch is data owner :)

    if 'submitted' not in get_review_status(input_hdf5_file):
        raise ValueError('Review yaml file must be submitted before trying to perform an update. Run first second_save_metadata_review().')

    def report_attribute_changes(obj, review_dict):
        """Print how many attributes the review adds to / drops from a group."""
        print(obj.name)
        if isinstance(obj, h5py.Group) and len(obj.name.split('/')) <= 4:
            reviewed_attrs = review_dict[obj.name]['attributes']
            count_additions = sum(name not in obj.attrs for name in reviewed_attrs)
            count_deletions = sum(name not in reviewed_attrs for name in obj.attrs)
            print('additions', count_additions, 'deletions', count_deletions)

    with open(yalm_review_file, 'r') as stream:
        try:
            # NOTE(security): FullLoader is not meant for untrusted input;
            # consider yaml.safe_load if review files never need Python tags.
            yaml_dict = yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            # Report and propagate: continuing without a parsed review would
            # only fail later with an unrelated NameError.
            print(exc)
            raise

    if not isinstance(yaml_dict, dict):
        raise ValueError('Review yaml file ' + str(yalm_review_file) + ' does not contain a mapping of hdf5 object names.')

    max_length = 100  # byte length of each string field of a structured attribute

    with h5py.File(input_hdf5_file, 'r+') as f:
        for obj_name, yaml_obj in yaml_dict.items():  # keys should coincide with group names
            print(obj_name)
            # Select hdf5 object at obj_name
            hdf5_obj = f[obj_name]

            report_attribute_changes(hdf5_obj, yaml_dict)

            for attr_name, attr_value in yaml_obj['attributes'].items():
                if not isinstance(attr_value, dict):
                    # Plain value: create or overwrite the attribute.
                    hdf5_obj.attrs[attr_name] = attr_value
                    continue

                # 'rename_as' gives the possibly new name; defaults to the current one.
                new_attr_name = attr_value.get('rename_as', attr_name)

                if attr_name in hdf5_obj.attrs:
                    # Existing attribute: renaming and possibly change of value.
                    fields = [(field, value) for field, value in attr_value.items()
                              if field != 'rename_as']
                    if len(fields) > 2:
                        dtype = [(field, f'S{max_length}') for field, _ in fields]
                        new_attr_value = np.array([tuple(value for _, value in fields)], dtype=dtype)
                    else:
                        new_attr_value = attr_value.get('value', hdf5_obj.attrs[attr_name])

                    hdf5_obj.attrs[new_attr_name] = new_attr_value

                    # Remove the old attribute if yaml indicates a renaming.
                    if new_attr_name != attr_name:
                        del hdf5_obj.attrs[attr_name]
                else:
                    # Attribute inclusion.
                    # TODO: let the user know why np.nan might have been assigned
                    hdf5_obj.attrs[new_attr_name] = attr_value.get('value', np.nan)

    # Recreate/or update yaml representation of updated input_hdf5_file.
    hdf5_vis.take_yml_snapshot_of_hdf5_file(input_hdf5_file)

    upload_ext = ('.h5', '.yaml') if hdf5_upload else ('.yaml',)

    # List files modified with respect to the last commit. --relative reports
    # paths relative to the current directory and -z separates them with NUL
    # without quoting, so no human-readable status text has to be parsed.
    diff = subprocess.run(
        ['git', 'diff', '--name-only', '--diff-filter=M', '--relative', '-z', 'HEAD'],
        capture_output=True, text=True, check=True,
    )
    files_to_add_list = [path for path in diff.stdout.split('\0')
                         if path and path.endswith(upload_ext)]

    if files_to_add_list:
        subprocess.run(['git', 'add', *files_to_add_list], capture_output=True, check=True)
        # TODO:verify if files were correctly staged
        message = 'Updated hdf5 file with yaml review file.'
        subprocess.run(['git', 'commit', '-m', message], capture_output=True, check=True)
    else:
        print("There were no found h5 and yaml files, needing to be saved. This action will not have effect on the review process' commit history.")
def last_submit_metadata_review(reviewer_attrs):
    """Fourth: push the review branch to the 'origin' remote.

    ``reviewer_attrs`` must provide the keys 'initials' and 'type'; the branch
    pushed is '<type>-review-by-<initials>'. The push only happens when that
    name shows up in the local branch list and in the name of the currently
    checked-out branch; otherwise an explanation is printed and None is
    returned. On success the stdout of ``git push`` is printed and its return
    code is returned.
    """
    initials = reviewer_attrs['initials']
    remote = 'origin'
    branch_name = '-'.join([reviewer_attrs['type'], 'review', 'by', initials])

    def git(*arguments):
        # Every git call here captures text output and raises on failure.
        return subprocess.run(['git', *arguments], capture_output=True, text=True, check=True)

    local_branches = git('branch', '--list').stdout
    if branch_name not in local_branches:
        print(f'There is no branch named {branch_name}.\n')
        print('Make sure to run data owner review workflow from the beginning without missing any steps.')
        return None

    active_branch = git('branch', '--show-current').stdout
    if branch_name not in active_branch:
        print('Complete metadata review could not be completed.\n')
        print(f'Make sure a data-owner workflow has already been started on branch {branch_name}\n')
        print('The step "Complete metadata review" will have no effect.')
        return None

    # push
    pushed = git('push', remote, branch_name)
    print(pushed.stdout)

    # 1. git add output_files/
    # 2. delete review/
    # 3. git rm review/
    # 4. git commit -m "Completed review process. Current state of hdf5 file and yml should be up to date."
    return pushed.returncode
#import config_file
#import hdf5_vis
def main():
    """Example driver with hard-coded paths for the metadata review workflow.

    At the moment it only derives the path names; the workflow calls are kept
    as comments and have to be enabled by hand.
    """
    output_filename_path = "output_files/unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.h5"
    # The yaml snapshot sits next to the hdf5 file and uses the '.yaml' extension.
    output_yml_filename_path = os.path.splitext(output_filename_path)[0] + ".yaml"
    output_yml_filename_path_tail, filename = os.path.split(output_yml_filename_path)

    # Workflow steps, in order (reviewer_attrs needs the keys 'initials' and 'type'):
    #output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename_path)
    #review_yaml_file_path = first_initialize_metadata_review(output_filename_path, reviewer_attrs)
    #second_save_metadata_review(review_yaml_file_path, reviewer_attrs)
    #if os.path.exists(os.path.join(os.path.abspath(os.curdir), "review", filename)):
    #    third_update_hdf5_file_with_review(output_filename_path, os.path.join(os.path.abspath(os.curdir), "review", filename))
    #last_submit_metadata_review(reviewer_attrs)
#if __name__ == '__main__':
# main()