diff --git a/src/hdf5_ops.py b/src/hdf5_ops.py
index b819aa4..21de1f0 100644
--- a/src/hdf5_ops.py
+++ b/src/hdf5_ops.py
@@ -459,50 +459,51 @@ def get_parent_child_relationships(file: h5py.File):
 
 
 def __print_metadata__(name, obj, folder_depth, yaml_dict):
-    # TODO: should we enable deeper folders ?
-    if len(obj.name.split('/')) <= folder_depth:
+    """
+    Extracts metadata from HDF5 groups and datasets and organizes them into a dictionary with compact representation.
+
+    Parameters:
+    -----------
+    name (str): Name of the HDF5 object being inspected.
+    obj (h5py.Group or h5py.Dataset): The HDF5 object (Group or Dataset).
+    folder_depth (int): Maximum depth of folders to explore.
+    yaml_dict (dict): Dictionary to populate with metadata.
+    """
+    # Process only objects within the specified folder depth
+    if len(obj.name.split('/')) <= folder_depth: # and ".h5" not in obj.name:
         name_to_list = obj.name.split('/')
-        name_head = name_to_list[-1]
+        name_head = name_to_list[-1] if not name_to_list[-1]=='' else obj.name
 
-        if isinstance(obj,h5py.Group):
-            #print('name:', obj.name)
-            #print('attributes:', dict(obj.attrs))
-            #attr_dict = {}
-            group_dict = {}
-
-            # Convert attribute dict to a YAML/JSON serializable dict
+        if isinstance(obj, h5py.Group): # Handle groups
+            # Convert attributes to a YAML/JSON serializable format
             attr_dict = {key: utils.to_serializable_dtype(val) for key, val in obj.attrs.items()}
-            #for key, value in obj.attrs.items():
-                #print (key, value.dtype)
-            #    if key == 'Layout':
-            #        print(value)
+            # Initialize the group dictionary
+            group_dict = {"name": name_head, "attributes": attr_dict}
 
-            #    if not key in ['file_list','filtered_file_list']:
+            # Handle group members compactly
+            #subgroups = [member_name for member_name in obj if isinstance(obj[member_name], h5py.Group)]
+            #datasets = [member_name for member_name in obj if isinstance(obj[member_name], h5py.Dataset)]
 
 
-            #        value = make_dtype_yaml_compatible(value)
-
-            #        attr_dict[key] = {'rename_as' : key,
-            #                          'value' : value
-            #                          }
-
-            #group_dict[obj.name] = {'name': obj.name, 'attributes': attr_dict}
-            group_dict = {"name": name_head, "attributes": attr_dict, "datasets":{}}
-            #group_dict[obj.name]["name"] = obj.name
-            #group_dict[obj.name]["attributes"] = attr_dict
-            #group_dict[obj.name]["datasets"] = {}
-            #print(name)
+            # Summarize groups and datasets
+            #group_dict["content_summary"] = {
+            #    "group_count": len(subgroups),
+            #    "group_preview": subgroups[:3] + (["..."] if len(subgroups) > 3 else []),
+            #    "dataset_count": len(datasets),
+            #    "dataset_preview": datasets[:3] + (["..."] if len(datasets) > 3 else [])
+            #}
 
 
             yaml_dict[obj.name] = group_dict
-        elif isinstance(obj, h5py.Dataset):
-            # Convert attribute dict to a YAML/JSON serializable dict
-            attr_dict = {key: utils.to_serializable_dtype(val) for key, val in obj.attrs.items()}
-            parent_name = '/'.join(name_to_list[:-1])
-            yaml_dict[parent_name]["datasets"][name_head] = {"rename_as": name_head ,"attributes": attr_dict}
-            #print(yaml.dump(group_dict,sort_keys=False))
-        #elif len(obj.name.split('/')) == 3:
-            #print(yaml.dump())
+        elif isinstance(obj, h5py.Dataset): # Handle datasets
+            # Convert attributes to a YAML/JSON serializable format
+            attr_dict = {key: utils.to_serializable_dtype(val) for key, val in obj.attrs.items()}
+
+            dataset_dict = {"name": name_head, "attributes": attr_dict}
+
+            yaml_dict[obj.name] = dataset_dict
+
+
 
 def serialize_metadata(input_filename_path, folder_depth: int = 4, output_format: str = 'yaml') -> str:
     """
@@ -537,12 +538,13 @@ def serialize_metadata(input_filename_path, folder_depth: int = 4, output_format
     # Open the HDF5 file and extract metadata
     with h5py.File(input_filename_path, 'r') as f:
         # Convert attribute dict to a YAML/JSON serializable dict
-        attrs_dict = {key: utils.to_serializable_dtype(val) for key, val in f.attrs.items()}
-        yaml_dict[f.name] = {
-            "name": f.name,
-            "attributes": attrs_dict,
-            "datasets": {}
-        }
+        #attrs_dict = {key: utils.to_serializable_dtype(val) for key, val in f.attrs.items()}
+        #yaml_dict[f.name] = {
+        #    "name": f.name,
+        #    "attributes": attrs_dict,
+        #    "datasets": {}
+        #}
+        __print_metadata__(f.name, f, folder_depth, yaml_dict)
 
         # Traverse HDF5 file hierarchy and add datasets
         f.visititems(lambda name, obj: __print_metadata__(name, obj, folder_depth, yaml_dict))
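
For reviewers, a minimal usage sketch of the flattened metadata layout this change produces. It is illustrative only: the file name example.h5, the attribute values, and the import path (assuming the repo root is on sys.path so src.hdf5_ops resolves) are assumptions, and this hunk does not show whether the returned str is the YAML text itself or a path to an output file.

    import h5py
    from src.hdf5_ops import serialize_metadata  # assumption: repo root is on sys.path

    # Build a tiny HDF5 file so the sketch is self-contained.
    with h5py.File('example.h5', 'w') as f:
        grp = f.create_group('instrument')
        grp.attrs['vendor'] = 'ACME'                                 # group-level attribute
        dset = grp.create_dataset('readings', data=[1.0, 2.0, 3.0])
        dset.attrs['units'] = 'mV'                                   # dataset-level attribute

    # With this change, serialize_metadata seeds the root entry through
    # __print_metadata__ and f.visititems() then adds one flat entry per group
    # and per dataset, keyed by the object's absolute HDF5 path, instead of
    # nesting datasets under a parent "datasets" key:
    #   '/'                    -> {'name': '/', 'attributes': {...}}
    #   '/instrument'          -> {'name': 'instrument', 'attributes': {...}}
    #   '/instrument/readings' -> {'name': 'readings', 'attributes': {...}}
    result = serialize_metadata('example.h5', folder_depth=4, output_format='yaml')
    print(result)

Note the root special case: for the file object itself, obj.name.split('/') ends in an empty string, so name_head falls back to obj.name ('/'), which is why the root entry above keeps '/' as its name.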