"""Metadata review workflow for HDF5 files and their YAML snapshots.

The workflow consists of four steps, each implemented by one function:

1. ``first_initialize_metadata_review``   -- copy the YAML snapshot into
   ``review/``, create the review-status file and commit both on a review
   branch.
2. ``second_submit_metadata_review``      -- mark the review as submitted.
3. ``third_update_hdf5_file_with_review`` -- apply the reviewed attributes to
   the HDF5 file.
4. ``fourth_complete_metadata_review``    -- delete the ``review/`` folder.
"""
import os
import shutil
import sys

# The project root must be importable before the ``src.*`` imports below.
root_dir = os.path.abspath(os.curdir)
sys.path.append(root_dir)

import h5py
import numpy as np
import pygit2 as pygit
import yaml

import src.g5505_utils as utils
import src.hdf5_vis as hdf5_vis

YAML_EXT = ".yaml"
TXT_EXT = ".txt"

# Opened at import time: the current working directory is expected to be a
# git repository.
repo_obj = pygit.Repository(os.path.abspath(os.curdir))


def _review_status_path(filename_path):
    """Return the path of the review-status text file for *filename_path*.

    The status file lives in ``review/`` and is named after the base name of
    *filename_path* (extension dropped) followed by ``-review_status.txt``.
    """
    _, filename_path_head = os.path.split(filename_path)
    filename, _ = os.path.splitext(filename_path_head)
    return os.path.join("review/", filename + "-review_status" + TXT_EXT)


def get_review_status(filename_path):
    """Return the last line of the review-status file of *filename_path*.

    The status file is append-only, so its last line is the current state of
    the review (e.g. ``'under review'`` or ``'submitted'``).

    Raises:
        FileNotFoundError: if the status file does not exist, i.e. the review
            has not been initialized.
        IndexError: if the status file is empty.
    """
    with open(_review_status_path(filename_path), 'r') as f:
        workflow_steps = f.readlines()
    return workflow_steps[-1]


def first_initialize_metadata_review(filename_path, initials):
    """Initialize the metadata review of the HDF5 file at *filename_path*.

    Copies the YAML snapshot of the HDF5 file into the ``review`` folder,
    creates a text file holding the state of the review process
    (``'under review'``), and commits these files on the branch
    ``metadata-review-by-<initials>``.

    Args:
        filename_path: path to the HDF5 file under review; its extension must
            contain ``h5`` and a YAML snapshot with the same base name must
            exist next to it.
        initials: reviewer initials, used to name the review branch.

    Raises:
        ValueError: if *filename_path* is not an h5 file or its YAML snapshot
            is missing.
        Warning: if a review copy of the YAML file already exists.
    """
    filename_path_tail, filename_path_head = os.path.split(filename_path)
    filename, ext = os.path.splitext(filename_path_head)

    # Check file_path points to h5 file
    if 'h5' not in ext:
        raise ValueError("filename_path needs to point to a suitable h5 file.")

    # Verify if yaml snapshot of input h5 file exists
    yaml_snapshot_path = os.path.join(filename_path_tail, filename + YAML_EXT)
    if not os.path.exists(yaml_snapshot_path):
        raise ValueError("metadata review cannot be initialized. The associated .yaml file under review was not found. Run take_yml_snapshot_of_hdf5_file(filename_path) ")

    # Initialize metadata review workflow
    print("Create branch metadata-review-by-" + initials + "\n")

    if os.path.exists(os.path.join("review/", filename + YAML_EXT)):
        raise Warning("the file " + os.path.join("review/", filename + YAML_EXT) + " already exists. Delete this file to reinitialize the metadata review process.")

    # NOTE(review): make_file_copy presumably returns the path of the copy
    # created inside 'review' -- confirm against src.g5505_utils.
    review_filename_path = utils.make_file_copy(yaml_snapshot_path, 'review')
    review_filename_tail, _ = os.path.splitext(review_filename_path)

    with open(review_filename_tail + "-review_status" + ".txt", 'w') as f:
        f.write('under review')

    # Create a new branch from the commit HEAD currently points to.
    branch_name = 'metadata-review-by-' + initials
    head_commit = repo_obj.head.peel()

    if branch_name not in repo_obj.branches:
        new_branch = repo_obj.create_branch(branch_name, head_commit)
        repo_obj.checkout(new_branch)

    # Identify changed paths associated to the review files and stage them.
    for key in repo_obj.status():
        if review_filename_tail in key:
            repo_obj.index.add(key)
    # Persist the staged entries; otherwise they only live in memory.
    repo_obj.index.write()

    # pygit2's create_commit needs the reference to update, author, committer,
    # message, tree and parents; a message alone is not a valid call.
    tree = repo_obj.index.write_tree()
    signature = repo_obj.default_signature
    repo_obj.create_commit(
        'HEAD',
        signature,
        signature,
        "Initialized metadata review process.",
        tree,
        [repo_obj.head.target],
    )


def second_submit_metadata_review(filename_path):
    """Mark the metadata review of *filename_path* as submitted.

    Run this once you are done reviewing the YAML representation of the HDF5
    file in the review folder. If the current review status is
    ``'under review'`` or ``'submitted'``, a ``'submitted'`` line is appended
    to the review-status file; otherwise nothing happens.

    Staging and committing the modified .yaml and .txt files
    (``git add review/`` and ``git commit -m "Submitted metadata review"``)
    is not done here yet.
    """
    current_status = get_review_status(filename_path)
    if any(status in current_status for status in ('under review', 'submitted')):
        with open(_review_status_path(filename_path), 'a') as f:
            f.write('\nsubmitted')


def _report_attribute_changes(obj, yml_dict):
    """Print how many attributes the review adds to / omits from *obj*.

    The counts are only reported for groups whose name splits into at most
    four '/'-separated parts.
    """
    print(obj.name)
    if isinstance(obj, h5py.Group) and len(obj.name.split('/')) <= 4:
        reviewed_attrs = yml_dict[obj.name]['attributes']

        # Attributes present in the review but not in the hdf5 object.
        count_additions = sum(item not in obj.attrs.keys() for item in reviewed_attrs.keys())
        # Attributes present in the hdf5 object but not in the review.
        count_deletions = sum(item not in reviewed_attrs.keys() for item in obj.attrs.keys())

        print('additions', count_additions, 'deletions', count_deletions)


def _apply_attribute_review(hdf5_obj, attr_name, attr_value):
    """Write one reviewed attribute to *hdf5_obj*.

    A plain *attr_value* is assigned to *attr_name* directly. A dict value may
    carry ``'rename_as'`` (new attribute name) and/or ``'value'`` (new
    attribute value).
    """
    if not isinstance(attr_value, dict):
        hdf5_obj.attrs[attr_name] = attr_value
        return

    # 'rename_as' falls back to the current name when it is not given.
    new_attr_name = attr_value.get('rename_as', attr_name)

    if attr_name in hdf5_obj.attrs.keys():
        # Existing attribute: keep its current value unless 'value' is given.
        new_attr_value = attr_value.get('value', hdf5_obj.attrs[attr_name])
        hdf5_obj.attrs[new_attr_name] = new_attr_value
        # Remove the old entry if the yaml indicates a renaming.
        if new_attr_name != attr_name:
            del hdf5_obj.attrs[attr_name]
    else:
        # Attribute inclusion: np.nan is the placeholder when 'value' is missing.
        # TODO: let the user know why np.nan might have been assigned
        hdf5_obj.attrs[new_attr_name] = attr_value.get('value', np.nan)


def third_update_hdf5_file_with_review(input_hdf5_file, yaml_file):
    """Apply the attributes of the reviewed *yaml_file* to *input_hdf5_file*.

    Every top-level key of the YAML file is used as an object name in the HDF5
    file, and the entries under its ``'attributes'`` key are written to that
    object. Afterwards the YAML snapshot of the HDF5 file is regenerated.

    The approach is suboptimal: ideally the review file would be compared with
    the current YAML file and only the groups that changed would be accessed.

    Raises:
        ValueError: if the review has not been submitted yet.
        yaml.YAMLError: if *yaml_file* cannot be parsed.
    """
    if 'submitted' not in get_review_status(input_hdf5_file):
        raise ValueError('Review yaml file must be submitted before trying to perform an update. Run first second_submit_metadata_review().')

    # NOTE(review): FullLoader is not meant for untrusted input; review files
    # are assumed to be edited by project members only.
    with open(yaml_file, 'r') as stream:
        try:
            yaml_dict = yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            print(exc)
            # Without a parsed document there is nothing to apply; propagate
            # the parse error instead of failing later on an unbound name.
            raise

    with h5py.File(input_hdf5_file, 'r+') as f:
        for key in yaml_dict.keys():  # keys should coincide with group names
            print(key)
            # Select hdf5 and yaml objects at key
            hdf5_obj = f[key]
            yaml_obj = yaml_dict[key]

            _report_attribute_changes(hdf5_obj, yaml_dict)

            for attr_name, attr_value in yaml_obj['attributes'].items():
                _apply_attribute_review(hdf5_obj, attr_name, attr_value)

    # Recreate/or update yaml representation of updated input_hdf5_file.
    hdf5_vis.take_yml_snapshot_of_hdf5_file(input_hdf5_file)

    # TODO: record 'hdf5 file updated w/ metadata review' in the review status file.


def fourth_complete_metadata_review():
    """Complete the review by deleting the ``review`` folder.

    Only the folder removal is implemented. The remaining steps are still to
    be done by hand:

    1. git add output_files/
    2. git rm review/
    3. git commit -m "Completed review process. Current state of hdf5 file and
       yml should be up to date."

    Returns:
        True once the folder has been removed.
    """
    shutil.rmtree(os.path.join(os.path.abspath(os.curdir), "review"))
    return True


#import config_file
#import hdf5_vis

def main():
    """Example driver for the review workflow; the workflow calls are disabled."""
    output_filename_path = "output_files/unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.h5"
    # The snapshot extension must match YAML_EXT (".yaml").
    output_yml_filename_path = "output_files/unified_file_smog_chamber_2024-03-19_UTC-OFST_+0100_NG.yaml"
    output_yml_filename_path_tail, filename = os.path.split(output_yml_filename_path)
    #output_yml_filename_path = hdf5_vis.take_yml_snapshot_of_hdf5_file(output_filename_path)

    #first_initialize_metadata_review(output_filename_path,initials='NG')
    #second_submit_metadata_review()
    #if os.path.exists(os.path.join(os.path.join(os.path.abspath(os.curdir),"review"),filename)):
    #    third_update_hdf5_file_with_review(output_filename_path, os.path.join(os.path.join(os.path.abspath(os.curdir),"review"),filename))
    #fourth_complete_metadata_review()

#if __name__ == '__main__':

#    main()