Implemented metadata append, rename, delete, and update operations on the HDF5 manager object, and refactored the YAML-based metadata update script to use these operations.
@@ -190,32 +190,71 @@ def load_yaml(yaml_review_file):
             print(exc)
     return None
 
-def update_hdf5_attributes(input_hdf5_file, yaml_dict):
+#def update_hdf5_attributes(input_hdf5_file, yaml_dict):
 
-    def update_attributes(hdf5_obj, yaml_obj):
-        for attr_name, attr_value in yaml_obj['attributes'].items():
-
-            if not isinstance(attr_value, dict):
-                attr_value = {'rename_as': attr_name, 'value': attr_value}
-
-            if (attr_name in hdf5_obj.attrs.keys()): # delete or update
-                if attr_value.get('delete'): # delete when True
-                    hdf5_obj.attrs.__delitem__(attr_name)
-                elif not (attr_value.get('rename_as') == attr_name): # update when true
-                    hdf5_obj.attrs[attr_value.get('rename_as')] = hdf5_obj.attrs[attr_name] # convert_attrdict_to_np_structured_array(attr_value)
-                    hdf5_obj.attrs.__delitem__(attr_name)
-            else: # add a new attribute
-                hdf5_obj.attrs.update({attr_name : utils.convert_attrdict_to_np_structured_array(attr_value)})
-
-    with h5py.File(input_hdf5_file, 'r+') as f:
-        for key in yaml_dict.keys():
-            hdf5_obj = f[key]
-            yaml_obj = yaml_dict[key]
-            update_attributes(hdf5_obj, yaml_obj)
-
 def update_hdf5_file_with_review(input_hdf5_file, yaml_review_file):
+    """
+    Updates, appends, or deletes metadata attributes in an HDF5 file based on a provided YAML review file.
+
+    Parameters:
+    -----------
+    input_hdf5_file : str
+        Path to the HDF5 file.
+
+    yaml_review_file : str
+        Path to a YAML file specifying objects and their attributes with operations. Example format:
+        {
+            "object_name": {
+                "attr_name": {
+                    "value": attr_value,
+                    "delete": True/False
+                }
+            }
+        }
+    """
     yaml_dict = load_yaml(yaml_review_file)
-    update_hdf5_attributes(input_hdf5_file, yaml_dict)
+
+    # Initialize HDF5 operations manager
+    DataOpsAPI = hdf5_ops.HDF5DataOpsManager(input_hdf5_file)
+    DataOpsAPI.open_file()
+
+    # Iterate over each object in the YAML dictionary
+    for obj_name, attr_dict in yaml_dict.items():
+        # Prepare dictionaries for append, update, and delete actions
+        append_dict = {}
+        update_dict = {}
+        delete_dict = {}
+
+        if not obj_name in DataOpsAPI.file_obj:
+            continue # Skip if the object does not exist
+
+        # Iterate over each attribute in the current object
+        for attr_name, attr_props in attr_dict['attributes'].items():
+            if not isinstance(attr_props, dict):
+                #attr_props = {'value': attr_props}
+                # Check if the attribute exists (for updating)
+                if attr_name in DataOpsAPI.file_obj[obj_name].attrs:
+                    update_dict[attr_name] = attr_props
+                # Otherwise, it's a new attribute to append
+                else:
+                    append_dict[attr_name] = attr_props
+            else:
+                # Check if the attribute is marked for deletion
+                if attr_props.get('delete', False):
+                    delete_dict[attr_name] = attr_props
+
+        # Perform a single pass for all three operations
+        if append_dict:
+            DataOpsAPI.append_metadata(obj_name, append_dict)
+        if update_dict:
+            DataOpsAPI.update_metadata(obj_name, update_dict)
+        if delete_dict:
+            DataOpsAPI.delete_metadata(obj_name, delete_dict)
+
+    # Close hdf5 file
+    DataOpsAPI.close_file()
 
     # Regenerate yaml snapshot of updated HDF5 file
     output_yml_filename_path = hdf5_ops.serialize_metadata(input_hdf5_file)
     print(f'{output_yml_filename_path} was successfully regenerated from the updated version of {input_hdf5_file}')
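
The refactored script now delegates all attribute manipulation to the manager. A minimal usage sketch, assuming hypothetical file names and a review file whose top level maps object paths to an 'attributes' mapping (per the attr_dict['attributes'] access above):

    # Sketch only; file names, the object path, and attribute names are hypothetical.
    # review.yaml is assumed to look like:
    #
    #   /group1/dataset1:
    #     attributes:
    #       temperature: 25.5        # plain value -> appended or updated
    #       obsolete_attr:
    #         delete: true           # dict marked for deletion -> removed
    #
    update_hdf5_file_with_review('experiment.h5', 'review.yaml')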
src/hdf5_ops.py (219 changed lines)
@@ -16,6 +16,7 @@ import h5py
 import yaml
 import json
+import copy
 
 class HDF5DataOpsManager():
     def __init__(self, file_path, mode = 'r+') -> None:
@@ -24,14 +25,11 @@ class HDF5DataOpsManager():
         self.mode = mode
         self.file_path = file_path
         self.file_obj = None
-        self._open_file()
+        #self._open_file()
         self.list_of_datasets = []
 
     # Define private methods
 
-    def _open_file(self):
-        if self.file_obj is None:
-            self.file_obj = h5py.File(self.file_path, self.mode)
-
     def _collect_dataset_names(self, name, obj, list_of_datasets):
         if isinstance(obj, h5py.Dataset):
@@ -39,6 +37,10 @@ class HDF5DataOpsManager():
 
     # Define public methods
 
+    def open_file(self):
+        if self.file_obj is None:
+            self.file_obj = h5py.File(self.file_path, self.mode)
+
     def close_file(self):
         if self.file_obj:
             self.file_obj.flush() # Ensure all data is written to disk
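
With _open_file removed from __init__ in favor of a public open_file(), the caller now controls the file lifecycle. A minimal lifecycle sketch (the file path is hypothetical):

    manager = HDF5DataOpsManager('experiment.h5', mode='r+')  # hypothetical path
    manager.open_file()                       # only opens if no handle exists yet
    try:
        print(list(manager.file_obj.keys()))  # the h5py.File handle is now live
    finally:
        manager.close_file()                  # flushes buffers and releases the handle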
@@ -73,7 +75,7 @@ class HDF5DataOpsManager():
         try:
             return pd.DataFrame(data)
         except ValueError as exp:
-            logging.error(f"Failed to convert dataset '{dataset_name}' to DataFrame: {exp}")
+            logging.error(f"Failed to convert dataset '{dataset_name}' to DataFrame: {exp}. Instead, dataset will be returned as Numpy array.")
             return data # 'data' is a NumPy array here
 
     def append_dataset(self,dataset_dict, group_name):
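
The fallback pattern behind this change can be reproduced in isolation with synthetic data (a sketch; the standalone function name is illustrative):

    import logging
    import numpy as np
    import pandas as pd

    def to_dataframe_or_array(data, dataset_name='example'):
        # Try tabular conversion first; fall back to returning the raw NumPy array.
        try:
            return pd.DataFrame(data)
        except ValueError as exp:
            logging.error(f"Failed to convert dataset '{dataset_name}' to DataFrame: {exp}. "
                          "Instead, dataset will be returned as Numpy array.")
            return data

    # A 3-D array has no 2-D tabular form, so this returns the array unchanged.
    result = to_dataframe_or_array(np.zeros((2, 2, 2)))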
@@ -90,21 +92,196 @@ class HDF5DataOpsManager():
         self.file_obj[group_name].create_dataset(dataset_dict['name'], data=dataset_dict['data'])
         self.file_obj[group_name][dataset_dict['name']].attrs.update(dataset_dict['attributes'])
 
-    def append_annotations(self, obj_name, annotation_dict):
-        """ appends annotations in the form of a dictionary to the obj (group or dataset) spefified by obj_name"""
+    def append_metadata(self, obj_name, annotation_dict):
+        """
+        Appends metadata attributes to the specified object (obj_name) based on the provided annotation_dict.
+
+        This method ensures that the provided metadata attributes do not overwrite any existing ones. If an attribute already exists,
+        a ValueError is raised. The function supports storing scalar values (int, float, str) and compound values such as dictionaries
+        that are converted into NumPy structured arrays before being added to the metadata.
+
+        Parameters:
+        -----------
+        obj_name: str
+            Path to the target object (dataset or group) within the HDF5 file.
+
+        annotation_dict: dict
+            A dictionary where the keys represent new attribute names (strings), and the values can be:
+            - Scalars: int, float, or str.
+            - Compound values (dictionaries) for more complex metadata, which are converted to NumPy structured arrays.
+
+        Example:
+        --------
+        annotation_dict = {
+            "relative_humidity": {
+                "value": 65,
+                "units": "percentage",
+                "range": "[0,100]",
+                "definition": "amount of water vapor present ..."
+            }
+        }
+        """
+
+        if self.file_obj is None:
+            self.open_file()
+
+        # Create a copy of annotation_dict to avoid modifying the original
+        annotation_dict_copy = copy.deepcopy(annotation_dict)
+
+        #with h5py.File(self.file_path, mode='r+') as file_obj:
+        obj = self.file_obj[obj_name]
+
+        # Check if any attribute already exists
+        if any(key in obj.attrs for key in annotation_dict_copy.keys()):
+            raise ValueError("Make sure the provided (key, value) pairs are not existing metadata elements or attributes. To modify or delete existing attributes use .update_metadata() or .delete_metadata()")
+
+        # Process the dictionary values and convert them to structured arrays if needed
+        for key, value in annotation_dict_copy.items():
+            if isinstance(value, dict):
+                # Convert dictionaries to NumPy structured arrays for complex attributes
+                annotation_dict_copy[key] = utils.convert_attrdict_to_np_structured_array(value)
+
+        # Update the object's attributes with the new metadata
+        obj.attrs.update(annotation_dict_copy)
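
A usage sketch for the renamed method (the file and object path are hypothetical, and it is assumed that utils.convert_attrdict_to_np_structured_array accepts the compound dict shown):

    manager = HDF5DataOpsManager('experiment.h5')    # hypothetical file
    manager.open_file()
    manager.append_metadata('/group1/dataset1', {    # hypothetical object path
        'operator': 'jdoe',                          # scalar attribute
        'relative_humidity': {'value': 65,           # compound attribute, stored
                              'units': 'percentage'} # as a structured array
    })
    # Re-appending either key would now raise ValueError.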
+
+    def update_metadata(self, obj_name, annotation_dict):
+        """
+        Updates the value of existing metadata attributes of the specified object (obj_name) based on the provided annotation_dict.
+
+        The function disregards non-existing attributes and suggests using the append_metadata() method to include those in the metadata.
+
+        Parameters:
+        -----------
+        obj_name : str
+            Path to the target object (dataset or group) within the HDF5 file.
+
+        annotation_dict: dict
+            A dictionary where the keys represent existing attribute names (strings), and the values can be:
+            - Scalars: int, float, or str.
+            - Compound values (dictionaries) for more complex metadata, which are converted to NumPy structured arrays.
+
+        Example:
+        --------
+        annotation_dict = {
+            "relative_humidity": {
+                "value": 65,
+                "units": "percentage",
+                "range": "[0,100]",
+                "definition": "amount of water vapor present ..."
+            }
+        }
+        """
+
+        if self.file_obj is None:
+            self.open_file()
+
+        update_dict = {}
+
+        #with h5py.File(self.file_path, mode='r+') as file_obj:
+        obj = self.file_obj[obj_name]
+
+        for key, value in annotation_dict.items():
+            if key in obj.attrs:
+                if isinstance(value, dict):
+                    update_dict[key] = utils.convert_attrdict_to_np_structured_array(value)
+                else:
+                    update_dict[key] = value
+            else:
+                # Optionally, log or warn about non-existing keys being ignored.
+                print(f"Warning: Key '{key}' does not exist and will be ignored.")
+
+        obj.attrs.update(update_dict)
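
Continuing the sketch above, an update only touches attributes that already exist:

    manager.update_metadata('/group1/dataset1', {    # hypothetical object path
        'relative_humidity': {'value': 70, 'units': 'percentage'},  # overwritten
        'never_added': 1,    # not present -> warning printed, key ignored
    })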
+
+    def delete_metadata(self, obj_name, annotation_dict):
+        """
+        Deletes metadata attributes of the specified object (obj_name) based on the provided annotation_dict.
+
+        Parameters:
+        -----------
+        obj_name: str
+            Path to the target object (dataset or group) within the HDF5 file.
+
+        annotation_dict: dict
+            Dictionary where keys represent attribute names, and values should be dictionaries containing
+            {"delete": True} to mark them for deletion.
+
+        Example:
+        --------
+        annotation_dict = {"attr_to_be_deleted": {"delete": True}}
+
+        Behavior:
+        ---------
+        - Deletes the specified attributes from the object's metadata if marked for deletion.
+        - Issues a warning if the attribute is not found or not marked for deletion.
+        """
+
+        if self.file_obj is None:
+            self.open_file()
+
+        #with h5py.File(self.file_path, mode='r+') as file_obj:
+        obj = self.file_obj[obj_name]
+
+        for attr_key, value in annotation_dict.items():
+            if attr_key in obj.attrs:
+                if isinstance(value, dict) and value.get('delete', False):
+                    obj.attrs.__delitem__(attr_key)
+                else:
+                    msg = f"Warning: Value for key '{attr_key}' is not marked for deletion or is invalid."
+                    print(msg)
+            else:
+                msg = f"Warning: Key '{attr_key}' does not exist in metadata."
+                print(msg)
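
Deletion requires the explicit {"delete": True} marker, per the docstring above (same hypothetical path as the earlier sketches):

    manager.delete_metadata('/group1/dataset1', {
        'operator': {'delete': True},            # removed
        'relative_humidity': {'delete': False},  # warning: not marked for deletion
    })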
+
+    def rename_metadata(self, obj_name, renaming_map):
+        """
+        Renames metadata attributes of the specified object (obj_name) based on the provided renaming_map.
+
+        Parameters:
+        -----------
+        obj_name: str
+            Path to the target object (dataset or group) within the HDF5 file.
+
+        renaming_map: dict
+            A dictionary where keys are current attribute names (strings), and values are the new attribute names (strings or byte strings) to rename to.
+
+        Example:
+        --------
+        renaming_map = {
+            "old_attr_name": "new_attr_name",
+            "old_attr_2": "new_attr_2"
+        }
+        """
+
+        #with h5py.File(self.file_path, mode='r+') as file_obj:
+        if self.file_obj is None:
+            self.open_file()
+
         obj = self.file_obj[obj_name]
 
-        # Verify if attributes to append are all new
-        if any(new_attr_key in obj.attrs.keys() for new_attr_key in annotation_dict.keys()):
-            self.close_file()
-            raise ValueError("Make sure the provided key, value pairs are not existing metadata elements or attributes. To modify or delete existing attributes use .modify_annotation() or .delete_annotation()")
-
-        for new_attr_key in annotation_dict.keys():
-            value = annotation_dict[new_attr_key]
-            if isinstance(value, dict):
-                annotation_dict[new_attr_key] = utils.convert_attrdict_to_np_structured_array(annotation_dict[new_attr_key])
-        obj.attrs.update(annotation_dict)
+        # Iterate over the renaming_map to process renaming
+        for old_attr, new_attr in renaming_map.items():
+            if old_attr in obj.attrs:
+                # Get the old attribute's value
+                attr_value = obj.attrs[old_attr]
+
+                # Create a new attribute with the new name
+                obj.attrs.create(new_attr, data=attr_value)
+
+                # Delete the old attribute
+                obj.attrs.__delitem__(old_attr)
+            else:
+                # Skip if the old attribute doesn't exist
+                msg = f"Skipping: Attribute '{old_attr}' does not exist."
+                print(msg) # Optionally, replace with warnings.warn(msg)
+
+        self.close_file()
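
A rename sketch (hypothetical names; note that, unlike the other metadata methods, this one calls self.close_file() before returning):

    manager.open_file()
    manager.rename_metadata('/group1/dataset1', {
        'relative_humidity': 'rel_humidity',  # copied to new name, old key deleted
    })
    # rename_metadata closes the file itself, so reopen before further operations.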
 
     def get_metadata(self, obj_path):
         """ Get file attributes from object at path = obj_path. For example,
@@ -249,9 +426,9 @@ def construct_attributes_dict(attrs_obj):
             #attr_dict[key][subattr] = make_dtype_yaml_compatible(value[subattr])
             attr_dict[key] = utils.to_serializable_dtype(value)
         else:
-            attr_dict[key] = {"rename_as" : key,
-                              "value" : utils.to_serializable_dtype(value)
-                             }
+            attr_dict[key] = utils.to_serializable_dtype(value) # {"rename_as" : key,
+                                                                #"value" : utils.to_serializable_dtype(value)
+                                                                #}
 
             #if isinstance(value,str):
             #    value.replace('\\','\\\\')
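
The effect of this last change on serialized snapshots, sketched with a hypothetical attribute (previously every attribute was wrapped in a rename_as/value mapping; now the serializable value is emitted directly):

    value = 25.5  # hypothetical attribute value, assumed already YAML-serializable

    old_entry = {"rename_as": "temperature", "value": value}  # snapshot entry before this commit
    new_entry = value                                         # snapshot entry after this commit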