Optimized and included df to np structured array conversion.
- Replaced loop plus append with list comprehension.
- Replaced row-wise pd df column concatenation with the df.agg() method, which concatenates column-wise.

This commit is contained in:
2024-05-23 22:18:37 +02:00
parent a45fb4476b
commit bd458c6cd0

View File

@ -86,6 +86,37 @@ def infer_units(column_name):
return match
def dataframe_to_np_structured_array(df: pd.DataFrame) -> np.ndarray:
    """Convert a DataFrame into a NumPy structured array compatible with h5py.

    String columns are stored as fixed-length byte strings sized to the
    longest value in the column (h5py handles fixed-length strings more
    predictably than variable-length object arrays), integer columns as
    32-bit integers and float columns as 32-bit floats.

    Parameters
    ----------
    df : pd.DataFrame
        Input table. Only string, integer and float columns are supported.

    Returns
    -------
    np.ndarray
        Structured array with one named field per DataFrame column.

    Raises
    ------
    ValueError
        If a column has a dtype other than string, integer or float.
    """
    # Define the dtype for the structured array, ensuring compatibility with h5py
    dtype = []
    for col in df.columns:
        col_dtype = df[col].dtype
        if pd.api.types.is_string_dtype(col_dtype):
            # Fixed-length strings sized to the longest entry.
            # BUG FIX: the size was hard-coded as the garbled 'S12,976';
            # use the computed max_len instead. An empty/all-NaN column
            # yields NaN from .max() -> fall back to length 1.
            max_len = df[col].str.len().max()
            max_len = 1 if pd.isna(max_len) else int(max_len)
            dtype.append((col, f'S{max_len}'))
        elif pd.api.types.is_integer_dtype(col_dtype):
            dtype.append((col, 'i4'))  # Assuming 32-bit integer
        elif pd.api.types.is_float_dtype(col_dtype):
            dtype.append((col, 'f4'))  # Assuming 32-bit float
        else:
            raise ValueError(f"Unsupported dtype: {col_dtype}")

    # Row tuples preserve the per-column dtypes; df.to_numpy() would
    # upcast every column to a single common dtype instead.
    structured_array = np.array(list(df.itertuples(index=False, name=None)), dtype=dtype)
    return structured_array
def read_txt_files_as_dict(filename : str ):
with open('src/text_data_sources.yaml','r') as stream:
@ -131,9 +162,10 @@ def read_txt_files_as_dict(filename : str ):
if table_header in line.decode(file_encoding):
list_of_substrings = line.decode(file_encoding).split(separator)
data_start = True
column_names = []
for i, name in enumerate(list_of_substrings):
column_names.append(str(i)+'_'+name)
column_names = [str(i)+'_'+name for i, name in enumerate(list_of_substrings)]
#column_names = []
#for i, name in enumerate(list_of_substrings):
# column_names.append(str(i)+'_'+name)
#print(line_number, len(column_names ),'\n')
break
@ -143,9 +175,17 @@ def read_txt_files_as_dict(filename : str ):
# TODO: ideally we should use a multilinear string but the yalm parser is not recognizing \n as special character
#line = ' '.join(list_of_substrings+['\n'])
#line = ' '.join(list_of_substrings)
table_preamble.append(' '.join(list_of_substrings))# += new_line
table_preamble.append(' '.join([item for item in list_of_substrings]))# += new_line
header_dict["table_preamble"] = table_preamble
# Represent string values as fixed length strings in the HDF5 file, which need
# to be decoded as string when we read them. It provides better control than variable strings,
# at the expense of flexibility.
# https://docs.h5py.org/en/stable/strings.html
if table_preamble:
max_length = max(len(item) for item in table_preamble)
utf8_type = h5py.string_dtype('utf-8', max_length)
header_dict["table_preamble"] = np.array(table_preamble,dtype=utf8_type)
# TODO: it does not work with separator as none :(. fix for RGA
@ -164,9 +204,14 @@ def read_txt_files_as_dict(filename : str ):
# Consolidate into single timestamp column the separate columns 'date' 'time' specified in text_data_source.yaml
if timestamp_variables:
df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index]
#df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index]
#df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
df_categorical_attrs['timestamps'] = df_categorical_attrs[timestamp_variables].astype(str).agg(' '.join, axis=1)
df_categorical_attrs = df_categorical_attrs.drop(columns = timestamp_variables)
#print(df_categorical_attrs)
categorical_variables = [item for item in df_categorical_attrs.columns]
####
@ -185,13 +230,12 @@ def read_txt_files_as_dict(filename : str ):
if numerical_variables:
dataset = {}
dataset['name'] = 'numerical_variables'
dataset['data'] = df_numerical_attrs.to_numpy()
dataset['name'] = 'table_numerical_variables'
dataset['data'] = dataframe_to_np_structured_array(df_numerical_attrs) #df_numerical_attrs.to_numpy()
dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data'])
#dataset['data_units'] = file_obj['wave']['data_units']
file_dict['datasets'].append(dataset)
rows,cols = dataset['shape']
try:
dataset['attributes'] = description_dict['table_header'].copy()
@ -203,29 +247,29 @@ def read_txt_files_as_dict(filename : str ):
except ValueError as err:
print(err)
dataset = {}
numerical_variables= [item.encode("utf-8") for item in numerical_variables]
dataset['name'] = 'numerical_variable_names'
dataset['data'] = np.array(numerical_variables).reshape((1,cols))
dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data'])
file_dict['datasets'].append(dataset)
#dataset = {}
#numerical_variables= [item.encode("utf-8") for item in numerical_variables]
#dataset['name'] = 'numerical_variable_names'
#dataset['data'] = np.array(numerical_variables).reshape((1,len(numerical_variables)))
#dataset['shape'] = dataset['data'].shape
#dataset['dtype'] = type(dataset['data'])
#file_dict['datasets'].append(dataset)
if categorical_variables:
dataset = {}
dataset['name'] = 'categorical_variables'
dataset['data'] = df_categorical_attrs.loc[:,categorical_variables].to_numpy()
dataset['name'] = 'table_categorical_variables'
dataset['data'] = dataframe_to_np_structured_array(df_categorical_attrs) #df_categorical_attrs.loc[:,categorical_variables].to_numpy()
dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data'])
file_dict['datasets'].append(dataset)
dataset = {}
categorical_variables = [item.encode("utf-8") for item in categorical_variables]
dataset['name'] = 'categorial_variable_names'
dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables)))
dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data'])
file_dict['datasets'].append(dataset)
# dataset = {}
# categorical_variables = [item.encode("utf-8") for item in categorical_variables]
# dataset['name'] = 'categorial_variable_names'
# dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables)))
# dataset['shape'] = dataset['data'].shape
# dataset['dtype'] = type(dataset['data'])
# file_dict['datasets'].append(dataset)
except:
return {}