diff --git a/src/g5505_file_reader.py b/src/g5505_file_reader.py
index 5a718cc..92145c9 100644
--- a/src/g5505_file_reader.py
+++ b/src/g5505_file_reader.py
@@ -37,7 +37,7 @@ def read_xps_ibw_file_as_dict(filename):
     -------
     file_dict : dict
         A dictionary containing the datasets from the IBW file.
-
+
     Raises
     ------
     ValueError
@@ -259,27 +259,36 @@ def read_txt_files_as_dict(filename : str , work_with_copy : bool = True ):
 
     file_dict['datasets'] = []
     ####
-    if numerical_variables:
-        dataset = {}
-        dataset['name'] = 'data_table'#_numerical_variables'
-        dataset['data'] = utils.dataframe_to_np_structured_array(pd.concat((df_categorical_attrs,df_numerical_attrs),axis=1)) #df_numerical_attrs.to_numpy()
-        dataset['shape'] = dataset['data'].shape
-        dataset['dtype'] = type(dataset['data'])
-        #dataset['data_units'] = file_obj['wave']['data_units']
+    df = pd.concat((df_categorical_attrs,df_numerical_attrs),axis=1)
 
-        try:
-            dataset['attributes'] = description_dict['table_header'].copy()
-            for key in description_dict['table_header'].keys():
-                if not key in numerical_variables:
-                    dataset['attributes'].pop(key) # delete key
-                else:
-                    dataset['attributes'][key] = utils.parse_attribute(dataset['attributes'][key])
-            if timestamps_name in categorical_variables:
-                dataset['attributes'][timestamps_name] = utils.parse_attribute({'unit':'YYYY-MM-DD HH:MM:SS.ffffff'})
-        except ValueError as err:
-            print(err)
+    #if numerical_variables:
+    dataset = {}
+    dataset['name'] = 'data_table'#_numerical_variables'
+    dataset['data'] = utils.dataframe_to_np_structured_array(df) #df_numerical_attrs.to_numpy()
+    dataset['shape'] = dataset['data'].shape
+    dataset['dtype'] = type(dataset['data'])
+    #dataset['data_units'] = file_obj['wave']['data_units']
+    #
+    # Create attribute descriptions based on description_dict
+    dataset['attributes'] = {}
 
-        file_dict['datasets'].append(dataset)
+    for column_name in df.columns:
+        column_attr_dict = description_dict['table_header'].get(column_name,{'note':'there was no description available. Review instrument files.'})
+        dataset['attributes'].update({column_name: utils.parse_attribute(column_attr_dict)})
+
+    #try:
+    #    dataset['attributes'] = description_dict['table_header'].copy()
+    #    for key in description_dict['table_header'].keys():
+    #        if not key in numerical_variables:
+    #            dataset['attributes'].pop(key) # delete key
+    #        else:
+    #            dataset['attributes'][key] = utils.parse_attribute(dataset['attributes'][key])
+    #    if timestamps_name in categorical_variables:
+    #        dataset['attributes'][timestamps_name] = utils.parse_attribute({'unit':'YYYY-MM-DD HH:MM:SS.ffffff'})
+    #except ValueError as err:
+    #    print(err)
+
+    file_dict['datasets'].append(dataset)
 
 
     #if categorical_variables:
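Note on the reader change above: rather than filtering description_dict['table_header'] down to numerical variables, the rewritten block documents every column of the combined dataframe and falls back to a placeholder note when no description exists. A minimal sketch of that lookup-with-fallback, assuming g5505_utils is importable; the column names and header entry below are made up:

    import g5505_utils as utils

    # Hypothetical header parsed from an instrument description file.
    table_header = {'temperature': {'unit': 'degC'}}

    attributes = {}
    for column_name in ['temperature', 'pressure']:  # hypothetical columns
        # Undocumented columns get a placeholder note instead of being dropped.
        column_attr_dict = table_header.get(
            column_name,
            {'note': 'there was no description available. Review instrument files.'})
        attributes[column_name] = utils.parse_attribute(column_attr_dict)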
diff --git a/src/g5505_utils.py b/src/g5505_utils.py
index fda1bbb..48de47f 100644
--- a/src/g5505_utils.py
+++ b/src/g5505_utils.py
@@ -165,6 +165,22 @@ def infer_units(column_name):
 
     return match
 
+def parse_attribute(attr_value : dict):
+    "Parse a dictionary attribute into an equivalent numpy structured array, which is compatible with the compound HDF5 type"
+    dtype = []
+    values_list = []
+    max_length = max(len(str(attr_value[key])) for key in attr_value.keys())
+    for key in attr_value.keys():
+        if (not key=='rename_as'):
+            dtype.append((key,f'S{max_length}'))
+            values_list.append(attr_value[key])
+
+    if values_list:
+        new_attr_value = np.array([tuple(values_list)],dtype=dtype)
+    else:
+        new_attr_value = 'missing'
+
+    return new_attr_value
 
 def progressBar(count_value, total, suffix=''):
     bar_length = 100
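A quick illustration of what the relocated parse_attribute produces; the attribute dictionary below is made up:

    from g5505_utils import parse_attribute

    attr = {'unit': 'degC', 'description': 'sample temperature', 'rename_as': 'temp'}
    parsed = parse_attribute(attr)
    # 'rename_as' is excluded from the output, but its value still counts
    # toward max_length, so every field shares the width of the longest value.
    print(parsed.dtype)  # [('unit', 'S18'), ('description', 'S18')]
    print(parsed[0])     # (b'degC', b'sample temperature')

Note that the 'missing' fallback is only reachable when the input holds nothing but 'rename_as'; a completely empty dictionary raises ValueError at the max() call.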
diff --git a/src/hdf5_lib.py b/src/hdf5_lib.py
index 3d7b5de..82e60ed 100644
--- a/src/hdf5_lib.py
+++ b/src/hdf5_lib.py
@@ -427,7 +427,7 @@ def save_processed_dataframe_to_hdf5(df, annotator, output_filename): # src_hdf5
 
     for key, value in data_level_attributes.items():
         if isinstance(value,dict):
-            data_level_attributes[key] = metadata_lib.parse_attribute(value)
+            data_level_attributes[key] = utils.parse_attribute(value)
 
 
     # Prepare file dictionary
diff --git a/src/metadata_review_lib.py b/src/metadata_review_lib.py
index ab4a4a0..d98e72d 100644
--- a/src/metadata_review_lib.py
+++ b/src/metadata_review_lib.py
@@ -32,35 +32,6 @@ def get_review_status(filename_path):
                 workflow_steps.append(line)
     return workflow_steps[-1]
 
-def parse_attribute(attr_value : dict):
-    "Parse a dictionary attribute into an equivalent numpy structured array, which compatible with compound HDF5 type"
-    dtype = []
-    values_list = []
-    max_length = max(len(str(attr_value[key])) for key in attr_value.keys())
-    for key in attr_value.keys():
-        if (not key=='rename_as'):
-            dtype.append((key,f'S{max_length}'))
-            values_list.append(attr_value[key])
-
-    if values_list:
-        new_attr_value = np.array([tuple(values_list)],dtype=dtype)
-    else:
-        new_attr_value = 'missing'
-
-    return new_attr_value
-
-def convert_string_to_bytes(input_list: list):
-    utf8_type = lambda max_length: h5py.string_dtype('utf-8', max_length)
-    if input_list:
-        max_length = max(len(item) for item in input_list)
-        # Convert the strings to bytes with utf-8 encoding, specifying errors='ignore' to skip characters that cannot be encoded
-        input_list_bytes = [item.encode('utf-8', errors='ignore') for item in input_list]
-        input_array_bytes = np.array(input_list_bytes,dtype=utf8_type(max_length))
-    else:
-        input_array_bytes = np.array([],dtype=utf8_type(0))
-
-    return input_array_bytes
-
 def first_initialize_metadata_review(hdf5_file_path, reviewer_attrs, restart = False):
 
     """
@@ -234,7 +205,7 @@ def update_hdf5_attributes(input_hdf5_file, yaml_dict):
             hdf5_obj.attrs[attr_value.get('rename_as')] = hdf5_obj.attrs[attr_name] # parse_attribute(attr_value)
             hdf5_obj.attrs.__delitem__(attr_name)
         else: # add a new attribute
-            hdf5_obj.attrs.update({attr_name : parse_attribute(attr_value)})
+            hdf5_obj.attrs.update({attr_name : utils.parse_attribute(attr_value)})
 
     with h5py.File(input_hdf5_file, 'r+') as f:
         for key in yaml_dict.keys():
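For context on the final hunk: update_hdf5_attributes either renames an existing HDF5 attribute (when a YAML entry carries 'rename_as') or converts the entry's dictionary into a compound-typed attribute via utils.parse_attribute. A sketch of the add branch under those assumptions, with a hypothetical file and attribute; the surrounding yaml_dict layout is not visible in this diff:

    import h5py
    import g5505_utils as utils

    attr_name = 'HONO'  # hypothetical attribute name
    attr_value = {'unit': 'ppbv', 'description': 'nitrous acid mixing ratio'}

    with h5py.File('example.h5', 'w') as f:  # hypothetical file
        # Same call shape as the patched else-branch: the dict becomes a
        # one-row structured array stored as a compound HDF5 attribute.
        f.attrs.update({attr_name: utils.parse_attribute(attr_value)})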