Improved parsing of HDF5 attribute dicts into YAML-compatible dicts. HDF5 compound attributes (structured NumPy arrays) can now be parsed.

2024-05-28 11:27:44 +02:00
parent 41c7660be3
commit f6a46168ec
1 changed files with 25 additions and 17 deletions
--- a/src/hdf5_vis.py
+++ b/src/hdf5_vis.py
@ -59,31 +59,38 @@ def display_group_hierarchy_on_a_treemap(filename: str):
    #pio.write_image(fig,file_name + ".png",width=800,height=600,format='png')

 #
-    
 def make_dtype_yaml_compatible(value):
    try:
        if isinstance(value, np.generic):
-            #if np.issubdtype(value.dtype, np.string_):
-                #value = value.astype(str)
            if np.issubdtype(value.dtype, np.bytes_):
                value = value.decode('utf-8')
            elif np.issubdtype(value.dtype, np.unicode_):
-                value = str(value) 
+                value = str(value)
            elif np.issubdtype(value.dtype, np.number):
                value = float(value)
            else:
-                print('Yaml-compatible data-type was not found. Value has been set to Nan.')
+                print('Yaml-compatible data-type was not found. Value has been set to NaN.')
                value = np.nan    
-        elif isinstance(value, np.ndarray):                        
-            if np.issubdtype(value.dtype, np.string_) or np.issubdtype(value.dtype, np.generic):
-                value = [str(item) for item in value] if len(value)>1 else str(value[0]) # value.astype(str).tolist() 
-            elif np.issubdtype(value.dtype, np.integer) :
-                value = [int(item) for item in value] if len(value)>1 else int(value[0]) # value.astype(int).tolist() 
-            elif np.issubdtype(value.dtype, np.floating):
-                value = [float(item) for item in value]  if len(value)>1 else float(value[0]) # value.astype(float).tolist() 
+        elif isinstance(value, np.ndarray):  
+            # Handling structured array types (with fields)
+            if value.dtype.names:
+                value = {field: make_dtype_yaml_compatible(value[field]) for field in value.dtype.names}
+            else:
+                # Handling regular array NumPy types                         
+                if np.issubdtype(value.dtype, np.bytes_):
+                    value = [item.decode('utf-8') for item in value] if len(value) > 1 else value[0].decode('utf-8')
+                elif np.issubdtype(value.dtype, np.unicode_):
+                    value = [str(item) for item in value] if len(value) > 1 else str(value[0])
+                elif np.issubdtype(value.dtype, np.integer):
+                    value = [int(item) for item in value] if len(value) > 1 else int(value[0])
+                elif np.issubdtype(value.dtype, np.floating):
+                    value = [float(item) for item in value] if len(value) > 1 else float(value[0])
+                else:
+                    print('Yaml-compatible data-type was not found. Value has been set to NaN.')
+                    value = np.nan

-    except:
-        print('Yaml-compatible data-type was not found. Value has been set to Nan.')
+    except Exception as e:
+        print(f'Error converting value: {e}. Value has been set to NaN.')
        value = np.nan

    return value
@ -104,8 +111,9 @@ def construct_attributes_dict(attrs_obj):
        if not key in ['file_list','filtered_file_list']:
            
            if is_structured_array(value):
-               for subattr in value.dtype.names:
-                   attr_dict[key][subattr] = make_dtype_yaml_compatible(value[subattr])                 
+               #for subattr in value.dtype.names:
+                   #attr_dict[key][subattr] = make_dtype_yaml_compatible(value[subattr])  
+                attr_dict[key] = make_dtype_yaml_compatible(value)             
            else:
                value =  make_dtype_yaml_compatible(value)
                attr_dict[key] = {"rename_as" : key,
@ -156,7 +164,7 @@ def print_metadata(name, obj, folder_depth, yaml_dict):
            yaml_dict[obj.name] = group_dict
        elif isinstance(obj, h5py.Dataset):            
            parent_name = '/'.join(name_to_list[:-1])
-            yaml_dict[parent_name]["datasets"][name_head] = {"rename_as": name_head ,"attributes":dict(obj.attrs)}
+            yaml_dict[parent_name]["datasets"][name_head] = {"rename_as": name_head ,"attributes": construct_attributes_dict(obj.attrs)}
            #print(yaml.dump(group_dict,sort_keys=False))

        #elif len(obj.name.split('/')) == 3: