diff --git a/src/g5505_file_reader.py b/src/g5505_file_reader.py index cc0cdb4..6f8ad28 100644 --- a/src/g5505_file_reader.py +++ b/src/g5505_file_reader.py @@ -86,6 +86,37 @@ def infer_units(column_name): return match +def dataframe_to_np_structured_array(df: pd.DataFrame): + + # Define the dtype for the structured array, ensuring compatibility with h5py + dtype = [] + for col in df.columns: + col_dtype = df[col].dtype + if pd.api.types.is_string_dtype(col_dtype): + # Convert string dtype to fixed-length strings + max_len = df[col].str.len().max() + dtype.append((col, f'S{max_len}')) + elif pd.api.types.is_integer_dtype(col_dtype): + dtype.append((col, 'i4')) # Assuming 32-bit integer + elif pd.api.types.is_float_dtype(col_dtype): + dtype.append((col, 'f4')) # Assuming 32-bit float + else: + raise ValueError(f"Unsupported dtype: {col_dtype}") + + # Convert the DataFrame to a structured array + structured_array = np.array(list(df.itertuples(index=False, name=None)), dtype=dtype) + + #return structured_array + #table_header = df.columns + #table = df.to_numpy() + + #rows,cols = table.shape + #tmp = [tuple(table[i,:]) for i in range(rows)] + #dtype_tmp = [(table_header[i],'f4') for i in range(cols)] + #data = np.array(tmp, dtype=dtype_tmp) + + return structured_array + def read_txt_files_as_dict(filename : str ): with open('src/text_data_sources.yaml','r') as stream: @@ -131,9 +162,10 @@ def read_txt_files_as_dict(filename : str ): if table_header in line.decode(file_encoding): list_of_substrings = line.decode(file_encoding).split(separator) data_start = True - column_names = [] - for i, name in enumerate(list_of_substrings): - column_names.append(str(i)+'_'+name) + column_names = [str(i)+'_'+name for i, name in enumerate(list_of_substrings)] + #column_names = [] + #for i, name in enumerate(list_of_substrings): + # column_names.append(str(i)+'_'+name) #print(line_number, len(column_names ),'\n') break @@ -143,9 +175,17 @@ def read_txt_files_as_dict(filename : str ): 
# TODO: ideally we should use a multilinear string but the yalm parser is not recognizing \n as special character #line = ' '.join(list_of_substrings+['\n']) #line = ' '.join(list_of_substrings) - table_preamble.append(' '.join(list_of_substrings))# += new_line + table_preamble.append(' '.join([item for item in list_of_substrings]))# += new_line - header_dict["table_preamble"] = table_preamble + # Represent string values as fixed length strings in the HDF5 file, which need + # to be decoded as string when we read them. It provides better control than variable strings, + # at the expense of flexibility. + # https://docs.h5py.org/en/stable/strings.html + + if table_preamble: + max_length = max(len(item) for item in table_preamble) + utf8_type = h5py.string_dtype('utf-8', max_length) + header_dict["table_preamble"] = np.array(table_preamble,dtype=utf8_type) # TODO: it does not work with separator as none :(. fix for RGA @@ -164,9 +204,14 @@ def read_txt_files_as_dict(filename : str ): # Consolidate into single timestamp column the separate columns 'date' 'time' specified in text_data_source.yaml if timestamp_variables: - df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index] + #df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index] #df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index] + + + df_categorical_attrs['timestamps'] = df_categorical_attrs[timestamp_variables].astype(str).agg(' '.join, axis=1) df_categorical_attrs = df_categorical_attrs.drop(columns = timestamp_variables) + #print(df_categorical_attrs) + categorical_variables = [item for item in df_categorical_attrs.columns] #### @@ -185,13 +230,12 @@ def read_txt_files_as_dict(filename : str ): if numerical_variables: dataset = {} - dataset['name'] = 'numerical_variables' - 
dataset['data'] = df_numerical_attrs.to_numpy() + dataset['name'] = 'table_numerical_variables' + dataset['data'] = dataframe_to_np_structured_array(df_numerical_attrs) #df_numerical_attrs.to_numpy() dataset['shape'] = dataset['data'].shape dataset['dtype'] = type(dataset['data']) #dataset['data_units'] = file_obj['wave']['data_units'] file_dict['datasets'].append(dataset) - rows,cols = dataset['shape'] try: dataset['attributes'] = description_dict['table_header'].copy() @@ -203,29 +247,29 @@ def read_txt_files_as_dict(filename : str ): except ValueError as err: print(err) - dataset = {} - numerical_variables= [item.encode("utf-8") for item in numerical_variables] - dataset['name'] = 'numerical_variable_names' - dataset['data'] = np.array(numerical_variables).reshape((1,cols)) - dataset['shape'] = dataset['data'].shape - dataset['dtype'] = type(dataset['data']) - file_dict['datasets'].append(dataset) + #dataset = {} + #numerical_variables= [item.encode("utf-8") for item in numerical_variables] + #dataset['name'] = 'numerical_variable_names' + #dataset['data'] = np.array(numerical_variables).reshape((1,len(numerical_variables))) + #dataset['shape'] = dataset['data'].shape + #dataset['dtype'] = type(dataset['data']) + #file_dict['datasets'].append(dataset) if categorical_variables: dataset = {} - dataset['name'] = 'categorical_variables' - dataset['data'] = df_categorical_attrs.loc[:,categorical_variables].to_numpy() + dataset['name'] = 'table_categorical_variables' + dataset['data'] = dataframe_to_np_structured_array(df_categorical_attrs) #df_categorical_attrs.loc[:,categorical_variables].to_numpy() dataset['shape'] = dataset['data'].shape dataset['dtype'] = type(dataset['data']) file_dict['datasets'].append(dataset) - dataset = {} - categorical_variables = [item.encode("utf-8") for item in categorical_variables] - dataset['name'] = 'categorial_variable_names' - dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables))) - 
dataset['shape'] = dataset['data'].shape - dataset['dtype'] = type(dataset['data']) - file_dict['datasets'].append(dataset) + # dataset = {} + # categorical_variables = [item.encode("utf-8") for item in categorical_variables] + # dataset['name'] = 'categorial_variable_names' + # dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables))) + # dataset['shape'] = dataset['data'].shape + # dataset['dtype'] = type(dataset['data']) + # file_dict['datasets'].append(dataset) except: return {}