Added lines to treat string attributes as fixed-length strings, which are represented as bytes that need to be decoded as UTF-8 when read. There are a few advantages, and HDF5 readers provide more precise behavior with fixed-length strings than with variable-length strings.

2024-03-22 17:28:47 +01:00
parent 13cb6395aa
commit 1bf1f60beb
1 changed files with 12 additions and 2 deletions
--- a/src/hdf5_lib.py
+++ b/src/hdf5_lib.py
@@ -317,11 +317,21 @@ def create_hdf5_file_from_filesystem_path(ofilename : str,
                        if not file_dict:
                            continue

-                                    # file_dict = file_obj
                        # Create group and add their attributes
                        h5file[group_name].create_group(name=file_dict['name'])
                        for key in file_dict['attributes_dict'].keys():
-                            h5file[group_name][file_dict['name']].attrs.create(name=key,data=file_dict['attributes_dict'][key])
+                            
+                            # Represent string values as fixed-length strings in the HDF5 file; they are read
+                            # back as bytes and need to be decoded to str. This provides better control than
+                            # variable-length strings, at the expense of flexibility.
+                            # https://docs.h5py.org/en/stable/strings.html
+                            value = file_dict['attributes_dict'][key]
+                            if isinstance(value,str):
+                                utf8_type = h5py.string_dtype('utf-8', len(value))
+                                value = np.array(value.encode('utf-8'),dtype=utf8_type)
+
+                            h5file[group_name][file_dict['name']].attrs.create(name=key,
+                                                                               data=value)
                            
                        # Add datasets to just created group
                        for dataset in file_dict['datasets']: