Implemented metadata review library

This commit is contained in:
2024-03-26 16:21:02 +01:00
parent 1f2bb419fe
commit 302b7dbfa5

216
src/metadata_review_lib.py Normal file
View File

@ -0,0 +1,216 @@
import sys
import os
# Append the current working directory to the module search path; presumably this is
# what lets the 'src.*' imports below resolve when run from the project root.
root_dir = os.path.abspath(os.curdir)
sys.path.append(root_dir)
import h5py
import yaml
import shutil
import src.g5505_utils as utils
import src.hdf5_vis as hdf5_vis
import numpy as np
import pygit2 as pygit
# File extensions used to build the names of the review files.
YAML_EXT = ".yaml"
TXT_EXT = ".txt"
# Handle to the git repository at the current working directory. It is created when this
# module is imported and is used by first_initialize_metadata_review().
repo_obj = pygit.Repository(os.path.abspath(os.curdir))
def get_review_status(filename_path):
    """
    Return the most recent review status recorded for the file under review.

    The status is read from 'review/<name>-review_status.txt' (relative to the current
    working directory), where <name> is the base name of filename_path without its
    extension. Each line of that file is one workflow step; the last non-blank line is
    the current status.

    Parameters
    ----------
    filename_path : str
        Path to the file under review. Only its base name is used to locate the status file.

    Returns
    -------
    str
        The last non-blank line of the status file, without surrounding whitespace.

    Raises
    ------
    FileNotFoundError
        If the status file does not exist (raised by open()).
    ValueError
        If the status file contains no status line.
    """
    filename_path_head = os.path.split(filename_path)[1]
    filename = os.path.splitext(filename_path_head)[0]
    status_file_path = os.path.join("review/", filename + "-review_status" + TXT_EXT)

    with open(status_file_path, 'r') as f:
        # Blank lines (e.g. a trailing newline added by an editor) are not workflow steps.
        workflow_steps = [line.strip() for line in f if line.strip()]

    if not workflow_steps:
        raise ValueError("the review status file " + status_file_path + " does not contain any review status.")
    return workflow_steps[-1]
def first_initialize_metadata_review(filename_path, initials):
    """
    Initialize metadata review by creating review folder with a copy of yaml representation of
    hdf5 file under review and by creating a txt file with the state of the review process, e.g., under review.

    The review files are then committed on the git branch 'metadata-review-by-<initials>',
    which is created from the current HEAD if it does not exist yet.

    Parameters
    ----------
    filename_path : str
        Path to the hdf5 file under review. A yaml snapshot with the same base name must
        exist next to it.
    initials : str
        Reviewer initials, used to name the review branch.

    Raises
    ------
    ValueError
        If filename_path does not have an h5 extension, or if its yaml snapshot is missing.
    Warning
        If a review copy of the yaml snapshot already exists in the review folder.
    """
    filename_path_tail, filename_path_head = os.path.split(filename_path)
    filename, ext = os.path.splitext(filename_path_head)

    # Check file_path points to h5 file
    if 'h5' not in ext:
        raise ValueError("filename_path needs to point to a suitable h5 file.")

    # Verify if yaml snapshot of input h5 file exists
    yaml_snapshot_path = os.path.join(filename_path_tail, filename + YAML_EXT)
    if not os.path.exists(yaml_snapshot_path):
        raise ValueError("metadata review cannot be initialized. The associated .yaml file under review was not found. Run take_yml_snapshot_of_hdf5_file(filename_path) ")

    # Initialize metadata review workflow
    print("Create branch metadata-review-by-"+initials+"\n")

    review_yaml_path = os.path.join("review/", filename + YAML_EXT)
    if os.path.exists(review_yaml_path):
        raise Warning("the file " + review_yaml_path + " already exists. Delete this file to reinitialize the metadata review process.")
    # NOTE(review): assumes make_file_copy returns the path of the copy inside 'review'
    # and keeps the base name, so that get_review_status() finds the status file — confirm.
    review_filename_path = utils.make_file_copy(yaml_snapshot_path, 'review')

    review_filename_tail, ext = os.path.splitext(review_filename_path)
    with open(review_filename_tail + "-review_status" + TXT_EXT, 'w') as f:
        f.write('under review')

    # Switch to the review branch, reusing it when it already exists instead of failing
    # on an undefined branch object.
    branch_name = 'metadata-review-by-' + initials
    if branch_name in repo_obj.branches:
        review_branch = repo_obj.branches[branch_name]
    else:
        head_commit = repo_obj.head.peel()  # Get the commit associated with HEAD
        review_branch = repo_obj.create_branch(branch_name, head_commit)
    repo_obj.checkout(review_branch)

    # Identify changed paths associated to the review files and stage them
    index = repo_obj.index
    for path in repo_obj.status():
        if review_filename_tail in path:
            index.add(path)
    # Persist the staged entries and build the tree the commit will point to.
    index.write()
    tree = index.write_tree()

    # pygit2 needs the reference, author, committer, tree and parents spelled out.
    # default_signature is taken from the repository's git configuration.
    signature = repo_obj.default_signature
    repo_obj.create_commit(
        repo_obj.head.name,
        signature,
        signature,
        "Initialized metadata review process.",
        tree,
        [repo_obj.head.target],
    )
def second_submit_metadata_review(filename_path):
    """
    Mark the metadata review of an hdf5 file as submitted.

    Run this function once you are done reviewing the yaml representation of the hdf5 file
    in the review folder. If the latest recorded review status contains 'under review' or
    'submitted', a new 'submitted' line is appended to the review status txt file;
    otherwise nothing is written.

    Planned steps that are not performed here yet:
    1. verify that the review initialization was performed first
    2. git add review/ and git commit -m "Submitted metadata review"
    """
    latest_status = get_review_status(filename_path)
    if 'under review' not in latest_status and 'submitted' not in latest_status:
        return

    base_name = os.path.splitext(os.path.basename(filename_path))[0]
    status_file_path = os.path.join("review/", base_name + "-review_status" + TXT_EXT)
    with open(status_file_path, 'a') as status_file:
        status_file.write('\nsubmitted')
def third_update_hdf5_file_with_review(input_hdf5_file, yaml_file):
    """
    Update the attributes of an hdf5 file with the content of a submitted review yaml file.

    The yaml file is read as a mapping whose keys are used to index the hdf5 file (group or
    dataset paths). Each entry must hold an 'attributes' mapping from attribute name to
    either a plain value, which is written as is, or a dict with the optional keys:
      - 'rename_as': new name for the attribute (defaults to the current name),
      - 'value': new value (defaults to the current value for an existing attribute,
        and to np.nan for a new one).
    After the update, the yaml snapshot of the hdf5 file is regenerated.

    Parameters
    ----------
    input_hdf5_file : str
        Path to the hdf5 file to update. Its review status must contain 'submitted'.
    yaml_file : str
        Path to the reviewed yaml file.

    Raises
    ------
    ValueError
        If the review was not submitted, or if the yaml file cannot be parsed or does not
        contain a mapping.
    """
    # TODO: compare the review file with the current yaml file and only access the groups
    # that changed. The approach below visits every entry and is suboptimal.
    if 'submitted' not in get_review_status(input_hdf5_file):
        raise ValueError('Review yaml file must be submitted before trying to perform an update. Run first second_submit_metadata_review().')

    def report_attribute_changes(obj, obj_review):
        # Print how many attributes the review adds to / omits from a shallow hdf5 group.
        print(obj.name)
        if isinstance(obj, h5py.Group) and len(obj.name.split('/')) <= 4:
            reviewed_names = obj_review['attributes'].keys()
            count_additions = sum(name not in obj.attrs for name in reviewed_names)
            count_deletions = sum(name not in reviewed_names for name in obj.attrs)
            print('additions', count_additions, 'deletions', count_deletions)

    with open(yaml_file, 'r') as stream:
        try:
            # NOTE(review): FullLoader can build more than plain data types. If the review
            # file may come from an untrusted source, consider yaml.safe_load instead.
            yaml_dict = yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            # Fail here rather than continuing with an undefined yaml_dict.
            raise ValueError("The review yaml file " + str(yaml_file) + " could not be parsed.") from exc

    if not isinstance(yaml_dict, dict):
        raise ValueError("The review yaml file " + str(yaml_file) + " does not contain a mapping of hdf5 objects.")

    with h5py.File(input_hdf5_file, 'r+') as f:
        for key, yaml_obj in yaml_dict.items():  # keys should coincide with group names
            print(key)
            hdf5_obj = f[key]
            # The review entry is passed in directly, so no second lookup by absolute
            # hdf5 name is needed (it may differ from the yaml key).
            report_attribute_changes(hdf5_obj, yaml_obj)

            for attr_name, attr_value in yaml_obj['attributes'].items():
                if not isinstance(attr_value, dict):
                    # Plain value: create or overwrite the attribute.
                    hdf5_obj.attrs[attr_name] = attr_value
                    continue

                attr_exists = attr_name in hdf5_obj.attrs
                # Retrieve the possibly new attribute's name and value
                new_attr_name = attr_value.get('rename_as', attr_name)
                if 'value' in attr_value:
                    new_attr_value = attr_value['value']
                elif attr_exists:
                    new_attr_value = hdf5_obj.attrs[attr_name]
                else:
                    # TODO: let the user know why np.nan might have been assigned
                    new_attr_value = np.nan
                hdf5_obj.attrs[new_attr_name] = new_attr_value

                # Remove the old attribute if the yaml indicates a renaming.
                if attr_exists and new_attr_name != attr_name:
                    del hdf5_obj.attrs[attr_name]

    # Recreate/or update yaml representation of updated input_hdf5_file.
    hdf5_vis.take_yml_snapshot_of_hdf5_file(input_hdf5_file)
    # TODO: record 'hdf5 file updated w/ metadata review' in the review status file.
def fourth_complete_metadata_review():
    """
    Complete the metadata review by deleting the 'review' folder located in the current
    working directory.

    Steps of the completion workflow that are not automated yet:
    1. git add output_files/
    3. git rm review/
    4. git commit -m "Completed review process. Current state of hdf5 file and yml should be up to date."

    Returns
    -------
    bool
        Always True.
    """
    working_dir = os.path.abspath(os.curdir)
    review_dir = os.path.join(working_dir, "review")
    # 2. delete review/
    shutil.rmtree(review_dir)
    return True
#import config_file
#import hdf5_vis
def main():
    """
    Example driver of the metadata review workflow for a sample output file.

    Only the file paths are set up for now. The workflow steps are listed below, commented
    out, in the order in which they are meant to be run.
    """
    output_filename_path = "output_files/unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.h5"
    # The yaml snapshot shares the base name of the h5 file. The extension was misspelled
    # as ".yalm", which could never match the ".yaml" copy placed in the review folder.
    output_yml_filename_path = "output_files/unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.yaml"
    output_yml_filename_path_tail, filename = os.path.split(output_yml_filename_path)

    # Intended workflow:
    #output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename_path)
    #first_initialize_metadata_review(output_filename_path,initials='NG')
    #second_submit_metadata_review(output_filename_path)
    #if os.path.exists(os.path.join(os.path.join(os.path.abspath(os.curdir),"review"),filename)):
    #    third_update_hdf5_file_with_review(output_filename_path, os.path.join(os.path.join(os.path.abspath(os.curdir),"review"),filename))
    #fourth_complete_metadata_review()
#if __name__ == '__main__':
# main()