Moved is_structured_array() and to_serializable_dtype() to utils, renamed a few functions, and propagated changes to dependent modules.

This commit is contained in:
2024-09-26 14:03:11 +02:00
parent a57e46d89c
commit a92660049f
5 changed files with 679 additions and 98 deletions

View File

@ -109,7 +109,7 @@ def created_at():
created_at = now_tz_aware.strftime('%Y-%m-%d_%H-%M-%S') + '_UTC-OFST_' + tz
return created_at
def dataframe_to_np_structured_array(df: pd.DataFrame):
def convert_dataframe_to_np_structured_array(df: pd.DataFrame):
# Define the dtype for the structured array, ensuring compatibility with h5py
dtype = []
@ -153,6 +153,47 @@ def convert_string_to_bytes(input_list: list):
return input_array_bytes
def convert_attrdict_to_np_structured_array(attr_value: dict):
    """
    Converts a dictionary of attributes into a numpy structured array for HDF5
    compound type compatibility.

    Each dictionary key, except 'rename_as', is mapped to a field in the
    structured array. All fields share the same fixed-width byte-string data
    type (S<n>), where n is the length of the longest string representation
    among all values of the dictionary ('rename_as' included).

    Parameters
    ----------
    attr_value : dict
        Dictionary containing the attributes to be converted. Example:
        attr_value = {
            'name': 'Temperature',
            'unit': 'Celsius',
            'value': 23.5,
            'timestamp': '2023-09-26 10:00'
        }

    Returns
    -------
    new_attr_value : ndarray or str
        One-row numpy structured array with one byte-string field per key.
        Returns the string 'missing' if the input dictionary is empty or
        contains no key other than 'rename_as'.
    """
    # max() raises ValueError on an empty iterable, so the empty dictionary
    # has to be handled before the field width is computed.
    if not attr_value:
        return 'missing'

    max_length = max(len(str(value)) for value in attr_value.values())

    # 'rename_as' is excluded from the fields of the resulting array.
    fields = {key: value for key, value in attr_value.items() if key != 'rename_as'}
    if not fields:
        return 'missing'

    dtype = [(key, f'S{max_length}') for key in fields]
    return np.array([tuple(fields.values())], dtype=dtype)
def infer_units(column_name):
# TODO: complete or remove
@ -165,23 +206,6 @@ def infer_units(column_name):
return match
def parse_attribute(attr_value : dict):
    """
    Parse a dictionary attribute into an equivalent numpy structured array,
    which is compatible with the compound HDF5 type.

    Every key except 'rename_as' becomes a field of type S<n>, where n is the
    length of the longest string representation among all dictionary values.

    Parameters
    ----------
    attr_value : dict
        Dictionary of attributes to convert.

    Returns
    -------
    new_attr_value : ndarray or str
        One-row structured array, or the string 'missing' when the dictionary
        is empty or holds nothing but 'rename_as'.
    """
    dtype = []
    values_list = []

    # default=0 keeps max() from raising ValueError on an empty dictionary;
    # in that case no field is created and 'missing' is returned below.
    max_length = max((len(str(item)) for item in attr_value.values()), default=0)

    for key, item in attr_value.items():
        if key == 'rename_as':
            continue
        dtype.append((key, f'S{max_length}'))
        values_list.append(item)

    if not values_list:
        return 'missing'
    return np.array([tuple(values_list)], dtype=dtype)
def progressBar(count_value, total, suffix=''):
bar_length = 100
filled_up_Length = int(round(bar_length* count_value / float(total)))
@ -270,4 +294,59 @@ def copy_directory_with_contraints(input_dir_path, output_dir_path,
except Exception as e:
logging.error("Failed to copy %s: %s", src_file_path, e)
return path_to_files_dict
return path_to_files_dict
def to_serializable_dtype(value):
    """Transform a NumPy value into a YAML/JSON compatible Python object.

    Parameters
    ----------
    value : np.generic, np.ndarray or any other object
        Value to convert. Objects that are neither NumPy scalars nor NumPy
        arrays are returned unchanged.

    Returns
    -------
    str, int, float, list or dict
        * NumPy scalars: bytes are decoded as UTF-8, strings become ``str``,
          integers become ``int`` and any other number becomes ``float``.
        * Structured arrays: a dict mapping each field name to the converted
          field values.
        * Regular arrays of bytes, strings, integers or floats: a list of
          converted items when the array has more than one element, otherwise
          the single converted element.
        ``np.nan`` is returned, and a message is printed, when no compatible
        type is found or when the conversion raises an exception.
    """
    try:
        if isinstance(value, np.generic):
            if np.issubdtype(value.dtype, np.bytes_):
                value = value.decode('utf-8')
            # np.str_ replaces np.unicode_, which was removed in NumPy 2.0.
            elif np.issubdtype(value.dtype, np.str_):
                value = str(value)
            # Integers are kept as int (as in the array branch below) so that
            # large values are not rounded by a conversion to float.
            elif np.issubdtype(value.dtype, np.integer):
                value = int(value)
            elif np.issubdtype(value.dtype, np.number):
                value = float(value)
            else:
                print('Yaml-compatible data-type was not found. Value has been set to NaN.')
                value = np.nan
        elif isinstance(value, np.ndarray):
            # Handling structured array types (with fields)
            if value.dtype.names:
                value = {field: to_serializable_dtype(value[field]) for field in value.dtype.names}
            else:
                # Handling regular array NumPy types
                if np.issubdtype(value.dtype, np.bytes_):
                    value = [item.decode('utf-8') for item in value] if len(value) > 1 else value[0].decode('utf-8')
                elif np.issubdtype(value.dtype, np.str_):
                    value = [str(item) for item in value] if len(value) > 1 else str(value[0])
                elif np.issubdtype(value.dtype, np.integer):
                    value = [int(item) for item in value] if len(value) > 1 else int(value[0])
                elif np.issubdtype(value.dtype, np.floating):
                    value = [float(item) for item in value] if len(value) > 1 else float(value[0])
                else:
                    print('Yaml-compatible data-type was not found. Value has been set to NaN.')
                    value = np.nan
    except Exception as e:
        # Deliberate best-effort: any conversion failure degrades to NaN.
        print(f'Error converting value: {e}. Value has been set to NaN.')
        value = np.nan

    return value
def is_structured_array(attr_val):
    """Return True when attr_val is a NumPy array whose dtype has named fields.

    Any object that is not an ``np.ndarray``, and any array whose
    ``dtype.names`` is None, yields False.
    """
    return isinstance(attr_val, np.ndarray) and attr_val.dtype.names is not None