Optimized and included DataFrame-to-NumPy structured array conversion.
- Replaced loop-plus-append with a list comprehension.
- Replaced row-wise pandas DataFrame column concatenation with the column-wise df.agg() method.

This commit is contained in:
2024-05-23 22:18:37 +02:00
parent a45fb4476b
commit bd458c6cd0

View File

@ -86,6 +86,37 @@ def infer_units(column_name):
return match return match
def dataframe_to_np_structured_array(df: pd.DataFrame):
    """Convert a pandas DataFrame into a NumPy structured array.

    String columns are stored as fixed-length byte strings sized to the
    longest value in the column (fixed-length strings give better control
    in h5py than variable-length ones); integer columns are stored as
    32-bit integers and float columns as 32-bit floats.

    Parameters
    ----------
    df : pd.DataFrame
        Input table; every column must have a string, integer, or float dtype.

    Returns
    -------
    np.ndarray
        Structured array with one named field per DataFrame column.

    Raises
    ------
    ValueError
        If a column has a dtype other than string, integer, or float.
    """
    # Define the dtype for the structured array, ensuring compatibility with h5py.
    dtype = []
    for col in df.columns:
        col_dtype = df[col].dtype
        if pd.api.types.is_string_dtype(col_dtype):
            # Fixed-length byte strings sized to the longest entry.
            # Bug fix: max_len was previously computed but ignored in favor
            # of a hard-coded (and invalid) 'S15,936' dtype string.
            max_len = df[col].str.len().max()
            # Guard against an empty/all-NaN column, which would yield NaN
            # here and an invalid 'S0'/'Snan' dtype.
            max_len = max(int(max_len), 1) if pd.notna(max_len) else 1
            dtype.append((col, f'S{max_len}'))
        elif pd.api.types.is_integer_dtype(col_dtype):
            dtype.append((col, 'i4'))  # assuming 32-bit integers suffice
        elif pd.api.types.is_float_dtype(col_dtype):
            dtype.append((col, 'f4'))  # assuming 32-bit floats suffice
        else:
            raise ValueError(f"Unsupported dtype: {col_dtype}")

    # Each DataFrame row becomes one tuple (one record) of the structured array.
    return np.array(list(df.itertuples(index=False, name=None)), dtype=dtype)
def read_txt_files_as_dict(filename : str ): def read_txt_files_as_dict(filename : str ):
with open('src/text_data_sources.yaml','r') as stream: with open('src/text_data_sources.yaml','r') as stream:
@ -131,9 +162,10 @@ def read_txt_files_as_dict(filename : str ):
if table_header in line.decode(file_encoding): if table_header in line.decode(file_encoding):
list_of_substrings = line.decode(file_encoding).split(separator) list_of_substrings = line.decode(file_encoding).split(separator)
data_start = True data_start = True
column_names = [] column_names = [str(i)+'_'+name for i, name in enumerate(list_of_substrings)]
for i, name in enumerate(list_of_substrings): #column_names = []
column_names.append(str(i)+'_'+name) #for i, name in enumerate(list_of_substrings):
# column_names.append(str(i)+'_'+name)
#print(line_number, len(column_names ),'\n') #print(line_number, len(column_names ),'\n')
break break
@ -143,9 +175,17 @@ def read_txt_files_as_dict(filename : str ):
# TODO: ideally we should use a multilinear string but the yalm parser is not recognizing \n as special character # TODO: ideally we should use a multilinear string but the yalm parser is not recognizing \n as special character
#line = ' '.join(list_of_substrings+['\n']) #line = ' '.join(list_of_substrings+['\n'])
#line = ' '.join(list_of_substrings) #line = ' '.join(list_of_substrings)
table_preamble.append(' '.join(list_of_substrings))# += new_line table_preamble.append(' '.join([item for item in list_of_substrings]))# += new_line
header_dict["table_preamble"] = table_preamble # Represent string values as fixed length strings in the HDF5 file, which need
# to be decoded as string when we read them. It provides better control than variable strings,
# at the expense of flexibility.
# https://docs.h5py.org/en/stable/strings.html
if table_preamble:
max_length = max(len(item) for item in table_preamble)
utf8_type = h5py.string_dtype('utf-8', max_length)
header_dict["table_preamble"] = np.array(table_preamble,dtype=utf8_type)
# TODO: it does not work with separator as none :(. fix for RGA # TODO: it does not work with separator as none :(. fix for RGA
@ -164,9 +204,14 @@ def read_txt_files_as_dict(filename : str ):
# Consolidate into single timestamp column the separate columns 'date' 'time' specified in text_data_source.yaml # Consolidate into single timestamp column the separate columns 'date' 'time' specified in text_data_source.yaml
if timestamp_variables: if timestamp_variables:
df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index] #df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index]
#df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index] #df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
df_categorical_attrs['timestamps'] = df_categorical_attrs[timestamp_variables].astype(str).agg(' '.join, axis=1)
df_categorical_attrs = df_categorical_attrs.drop(columns = timestamp_variables) df_categorical_attrs = df_categorical_attrs.drop(columns = timestamp_variables)
#print(df_categorical_attrs)
categorical_variables = [item for item in df_categorical_attrs.columns] categorical_variables = [item for item in df_categorical_attrs.columns]
#### ####
@ -185,13 +230,12 @@ def read_txt_files_as_dict(filename : str ):
if numerical_variables: if numerical_variables:
dataset = {} dataset = {}
dataset['name'] = 'numerical_variables' dataset['name'] = 'table_numerical_variables'
dataset['data'] = df_numerical_attrs.to_numpy() dataset['data'] = dataframe_to_np_structured_array(df_numerical_attrs) #df_numerical_attrs.to_numpy()
dataset['shape'] = dataset['data'].shape dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data']) dataset['dtype'] = type(dataset['data'])
#dataset['data_units'] = file_obj['wave']['data_units'] #dataset['data_units'] = file_obj['wave']['data_units']
file_dict['datasets'].append(dataset) file_dict['datasets'].append(dataset)
rows,cols = dataset['shape']
try: try:
dataset['attributes'] = description_dict['table_header'].copy() dataset['attributes'] = description_dict['table_header'].copy()
@ -203,29 +247,29 @@ def read_txt_files_as_dict(filename : str ):
except ValueError as err: except ValueError as err:
print(err) print(err)
dataset = {} #dataset = {}
numerical_variables= [item.encode("utf-8") for item in numerical_variables] #numerical_variables= [item.encode("utf-8") for item in numerical_variables]
dataset['name'] = 'numerical_variable_names' #dataset['name'] = 'numerical_variable_names'
dataset['data'] = np.array(numerical_variables).reshape((1,cols)) #dataset['data'] = np.array(numerical_variables).reshape((1,len(numerical_variables)))
dataset['shape'] = dataset['data'].shape #dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data']) #dataset['dtype'] = type(dataset['data'])
file_dict['datasets'].append(dataset) #file_dict['datasets'].append(dataset)
if categorical_variables: if categorical_variables:
dataset = {} dataset = {}
dataset['name'] = 'categorical_variables' dataset['name'] = 'table_categorical_variables'
dataset['data'] = df_categorical_attrs.loc[:,categorical_variables].to_numpy() dataset['data'] = dataframe_to_np_structured_array(df_categorical_attrs) #df_categorical_attrs.loc[:,categorical_variables].to_numpy()
dataset['shape'] = dataset['data'].shape dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data']) dataset['dtype'] = type(dataset['data'])
file_dict['datasets'].append(dataset) file_dict['datasets'].append(dataset)
dataset = {} # dataset = {}
categorical_variables = [item.encode("utf-8") for item in categorical_variables] # categorical_variables = [item.encode("utf-8") for item in categorical_variables]
dataset['name'] = 'categorial_variable_names' # dataset['name'] = 'categorial_variable_names'
dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables))) # dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables)))
dataset['shape'] = dataset['data'].shape # dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data']) # dataset['dtype'] = type(dataset['data'])
file_dict['datasets'].append(dataset) # file_dict['datasets'].append(dataset)
except: except:
return {} return {}