# dima/pipelines/metadata_revision.py
import sys
import os

try:
    thisFilePath = os.path.abspath(__file__)
except NameError:
    print("Error: __file__ is not available. Ensure the script is being run from a file.")
    print("[Notice] Path to DIMA package may not be resolved properly.")
    thisFilePath = os.getcwd()  # Use current directory or specify a default

dimaPath = os.path.normpath(os.path.join(thisFilePath, "..", ".."))  # Move up to project root

if dimaPath not in sys.path:  # Avoid duplicate entries
    sys.path.append(dimaPath)

import h5py
import yaml

try:
    from dima.src import hdf5_ops
except ModuleNotFoundError:
    import src.hdf5_ops as hdf5_ops

def load_yaml(review_yaml_file):
    with open(review_yaml_file, 'r') as stream:
        try:
            return yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            print(exc)
            return None

def validate_yaml_dict(input_hdf5_file, yaml_dict):
    errors = []
    notes = []

    with h5py.File(input_hdf5_file, 'r') as hdf5_file:
        # 1. Check for valid object names
        for key in yaml_dict:
            if key not in hdf5_file:
                error_msg = f"Error: {key} is not a valid object name in the HDF5 file."
                print(error_msg)
                errors.append(error_msg)

        # 2. Confirm metadata dict for each object is a dictionary
        for key, meta_dict in yaml_dict.items():
            if not isinstance(meta_dict, dict):
                error_msg = f"Error: Metadata for {key} should be a dictionary."
                print(error_msg)
                errors.append(error_msg)
            elif 'attributes' not in meta_dict:
                warning_msg = f"Warning: No 'attributes' in metadata dict for {key}."
                print(warning_msg)
                notes.append(warning_msg)

        # 3. Verify update, append, and delete operations are well specified
        for key, meta_dict in yaml_dict.items():
            if not isinstance(meta_dict, dict):
                continue  # Already reported as an error in step 2
            attributes = meta_dict.get("attributes", {})
            for attr_name, attr_value in attributes.items():
                # Ensure the object exists before accessing attributes
                if key in hdf5_file:
                    hdf5_obj_attrs = hdf5_file[key].attrs  # Access object-specific attributes
                    if attr_name in hdf5_obj_attrs:
                        # Attribute exists: it can be updated or deleted
                        if isinstance(attr_value, dict) and "delete" in attr_value:
                            note_msg = f"Note: '{attr_name}' in {key} may be deleted if 'delete' is set to true."
                            print(note_msg)
                            notes.append(note_msg)
                        else:
                            note_msg = f"Note: '{attr_name}' in {key} will be updated."
                            print(note_msg)
                            notes.append(note_msg)
                    else:
                        # Attribute does not exist: it can be appended or flagged as an invalid delete
                        if isinstance(attr_value, dict) and "delete" in attr_value:
                            error_msg = f"Error: Cannot delete non-existent attribute '{attr_name}' in {key}."
                            print(error_msg)
                            errors.append(error_msg)
                        else:
                            note_msg = f"Note: '{attr_name}' in {key} will be appended."
                            print(note_msg)
                            notes.append(note_msg)
                else:
                    error_msg = f"Error: '{key}' is not a valid object in the HDF5 file."
                    print(error_msg)
                    errors.append(error_msg)

    return len(errors) == 0, errors, notes
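
# A minimal usage sketch of the validator (file names below are hypothetical,
# for illustration only):
#
#   yaml_dict = load_yaml("review.yaml")
#   ok, errors, notes = validate_yaml_dict("data/experiment.h5", yaml_dict)
#   if not ok:
#       print("Review file rejected:", errors)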

def update_hdf5_file_with_review(input_hdf5_file, review_yaml_file):
    """
    Updates, appends, or deletes metadata attributes in an HDF5 file based on a review YAML file.

    Parameters
    ----------
    input_hdf5_file : str
        Path to the HDF5 file.
    review_yaml_file : str
        Path to a YAML file specifying objects and their attributes with operations. Example format:

        {
            "object_name": {
                "attributes": {
                    "attr_name": {
                        "value": attr_value,
                        "delete": true | false
                    }
                }
            }
        }
    """
    yaml_dict = load_yaml(review_yaml_file)
    success, errors, notes = validate_yaml_dict(input_hdf5_file, yaml_dict)
    if not success:
        raise ValueError(f"Review yaml file {review_yaml_file} is invalid. Validation errors: {errors}")

    # Initialize HDF5 operations manager
    DataOpsAPI = hdf5_ops.HDF5DataOpsManager(input_hdf5_file)
    DataOpsAPI.load_file_obj()

    # Iterate over each object in the YAML dictionary
    for obj_name, attr_dict in yaml_dict.items():
        if obj_name not in DataOpsAPI.file_obj:
            continue  # Skip if the object does not exist

        # Prepare dictionaries for append, update, and delete actions
        append_dict = {}
        update_dict = {}
        delete_dict = {}

        # Iterate over each attribute in the current object
        for attr_name, attr_props in attr_dict.get('attributes', {}).items():
            if not isinstance(attr_props, dict):
                # Plain value: update if the attribute already exists, append otherwise
                if attr_name in DataOpsAPI.file_obj[obj_name].attrs:
                    update_dict[attr_name] = attr_props
                else:
                    append_dict[attr_name] = attr_props
            else:
                # Dict form: delete when requested; otherwise route the 'value'
                # the same way plain values are handled (assumes the DataOps
                # manager accepts raw attribute values)
                if attr_props.get('delete', False):
                    delete_dict[attr_name] = attr_props
                elif 'value' in attr_props:
                    if attr_name in DataOpsAPI.file_obj[obj_name].attrs:
                        update_dict[attr_name] = attr_props['value']
                    else:
                        append_dict[attr_name] = attr_props['value']

        # Perform a single pass for all three operations
        if append_dict:
            DataOpsAPI.append_metadata(obj_name, append_dict)
        if update_dict:
            DataOpsAPI.update_metadata(obj_name, update_dict)
        if delete_dict:
            DataOpsAPI.delete_metadata(obj_name, delete_dict)

    # Close hdf5 file
    DataOpsAPI.unload_file_obj()

    # Regenerate yaml snapshot of updated HDF5 file
    output_yml_filename_path = hdf5_ops.serialize_metadata(input_hdf5_file)
    print(f'{output_yml_filename_path} was successfully regenerated from the updated version of {input_hdf5_file}')

def count(hdf5_obj, yml_dict):
    """Debug helper: report how many attribute additions and deletions a review dict implies for an HDF5 group."""
    print(hdf5_obj.name)
    if isinstance(hdf5_obj, h5py.Group) and len(hdf5_obj.name.split('/')) <= 4:
        obj_review = yml_dict[hdf5_obj.name]
        additions = [item not in hdf5_obj.attrs.keys() for item in obj_review['attributes'].keys()]
        count_additions = sum(additions)
        deletions = [item not in obj_review['attributes'].keys() for item in hdf5_obj.attrs.keys()]
        count_deletions = sum(deletions)
        print('additions', count_additions, 'deletions', count_deletions)
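
# A hypothetical way to drive the debug helper over a whole file (paths are
# illustrative; h5py's visititems passes (name, obj), so a lambda adapts it):
#
#   yml = load_yaml("review.yaml")
#   with h5py.File("data/experiment.h5", "r") as f:
#       f.visititems(lambda name, obj: count(obj, yml))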

if __name__ == "__main__":
    if len(sys.argv) < 4:
        print("Usage: python metadata_revision.py update <path/to/target_file.hdf5> <path/to/metadata_review_file.yaml>")
        sys.exit(1)

    if sys.argv[1] == 'update':
        input_hdf5_file = sys.argv[2]
        review_yaml_file = sys.argv[3]
        update_hdf5_file_with_review(input_hdf5_file, review_yaml_file)
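
# Example invocation (file names are illustrative):
#   python dima/pipelines/metadata_revision.py update data/experiment.h5 review.yaml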