Optimized and included df to np structured array conversion.
- Replaced loop plus append with a list comprehension.
- Replaced row-wise pd df column concatenation with the column-wise df.agg() method.
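In a nutshell, the first change (a minimal before/after sketch with made-up header names):

    list_of_substrings = ['Date', 'Time', 'Pressure']

    # before: loop plus append
    column_names = []
    for i, name in enumerate(list_of_substrings):
        column_names.append(str(i) + '_' + name)

    # after: one expression, same result -> ['0_Date', '1_Time', '2_Pressure']
    column_names = [str(i) + '_' + name for i, name in enumerate(list_of_substrings)]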
@@ -86,6 +86,37 @@ def infer_units(column_name):

     return match

+def dataframe_to_np_structured_array(df: pd.DataFrame):
+
+    # Define the dtype for the structured array, ensuring compatibility with h5py
+    dtype = []
+    for col in df.columns:
+        col_dtype = df[col].dtype
+        if pd.api.types.is_string_dtype(col_dtype):
+            # Convert string dtype to fixed-length strings
+            max_len = df[col].str.len().max()
+            dtype.append((col, f'S{max_len}'))
+        elif pd.api.types.is_integer_dtype(col_dtype):
+            dtype.append((col, 'i4'))  # Assuming 32-bit integer
+        elif pd.api.types.is_float_dtype(col_dtype):
+            dtype.append((col, 'f4'))  # Assuming 32-bit float
+        else:
+            raise ValueError(f"Unsupported dtype: {col_dtype}")
+
+    # Convert the DataFrame to a structured array
+    structured_array = np.array(list(df.itertuples(index=False, name=None)), dtype=dtype)
+
+    #return structured_array
+    #table_header = df.columns
+    #table = df.to_numpy()
+
+    #rows,cols = table.shape
+    #tmp = [tuple(table[i,:]) for i in range(rows)]
+    #dtype_tmp = [(table_header[i],'f4') for i in range(cols)]
+    #data = np.array(tmp, dtype=dtype_tmp)
+
+    return structured_array
+
 def read_txt_files_as_dict(filename : str ):

     with open('src/text_data_sources.yaml','r') as stream:
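A quick usage sketch of the new helper (toy frame, invented column names). Column names become field names of the structured array, which h5py stores as an HDF5 compound type:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'flow': [1.5, 2.0], 'count': [10, 12], 'mode': ['auto', 'manual']})
    arr = dataframe_to_np_structured_array(df)
    print(arr.dtype)    # [('flow', '<f4'), ('count', '<i4'), ('mode', 'S6')]
    print(arr['flow'])  # access a column by field name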
@@ -131,9 +162,10 @@ def read_txt_files_as_dict(filename : str ):
             if table_header in line.decode(file_encoding):
                 list_of_substrings = line.decode(file_encoding).split(separator)
                 data_start = True
-                column_names = []
-                for i, name in enumerate(list_of_substrings):
-                    column_names.append(str(i)+'_'+name)
+                column_names = [str(i)+'_'+name for i, name in enumerate(list_of_substrings)]
+                #column_names = []
+                #for i, name in enumerate(list_of_substrings):
+                #    column_names.append(str(i)+'_'+name)

                 #print(line_number, len(column_names ),'\n')
                 break
@@ -143,9 +175,17 @@ def read_txt_files_as_dict(filename : str ):
                 # TODO: ideally we should use a multilinear string but the yalm parser is not recognizing \n as special character
                 #line = ' '.join(list_of_substrings+['\n'])
                 #line = ' '.join(list_of_substrings)
-                table_preamble.append(' '.join(list_of_substrings))# += new_line
+                table_preamble.append(' '.join([item for item in list_of_substrings]))# += new_line

-    header_dict["table_preamble"] = table_preamble
+    # Represent string values as fixed length strings in the HDF5 file, which need
+    # to be decoded as string when we read them. It provides better control than variable strings,
+    # at the expense of flexibility.
+    # https://docs.h5py.org/en/stable/strings.html
+
+    if table_preamble:
+        max_length = max(len(item) for item in table_preamble)
+        utf8_type = h5py.string_dtype('utf-8', max_length)
+        header_dict["table_preamble"] = np.array(table_preamble,dtype=utf8_type)


     # TODO: it does not work with separator as none :(. fix for RGA
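For reference, a minimal round-trip sketch of the fixed-length string handling introduced above (file and dataset names invented); on read, h5py hands back bytes that must be decoded:

    import h5py
    import numpy as np

    preamble = ['# instrument: XYZ', '# units: mbar']
    utf8_type = h5py.string_dtype('utf-8', max(len(s) for s in preamble))
    with h5py.File('example.h5', 'w') as f:
        f.create_dataset('table_preamble', data=np.array(preamble, dtype=utf8_type))
    with h5py.File('example.h5', 'r') as f:
        decoded = [b.decode('utf-8') for b in f['table_preamble'][()]]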
@@ -164,9 +204,14 @@ def read_txt_files_as_dict(filename : str ):

         # Consolidate into single timestamp column the separate columns 'date' 'time' specified in text_data_source.yaml
         if timestamp_variables:
-            df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index]
+            #df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index]
             #df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
+
+
+            df_categorical_attrs['timestamps'] = df_categorical_attrs[timestamp_variables].astype(str).agg(' '.join, axis=1)
             df_categorical_attrs = df_categorical_attrs.drop(columns = timestamp_variables)
             #print(df_categorical_attrs)
+
+
         categorical_variables = [item for item in df_categorical_attrs.columns]
         ####
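A before/after sketch of the consolidation (toy frame; the '0_Date'/'1_Time' names mirror the enumerated headers built earlier):

    import pandas as pd

    df = pd.DataFrame({'0_Date': ['2024-01-01', '2024-01-02'],
                       '1_Time': ['10:00:00', '11:30:00']})

    # before: one row at a time via .loc lookups
    ts = [' '.join(df.loc[i, ['0_Date', '1_Time']]) for i in df.index]

    # after: a single vectorized call; .astype(str) guards against non-string cells
    df['timestamps'] = df[['0_Date', '1_Time']].astype(str).agg(' '.join, axis=1)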
@@ -185,13 +230,12 @@ def read_txt_files_as_dict(filename : str ):

         if numerical_variables:
             dataset = {}
-            dataset['name'] = 'numerical_variables'
-            dataset['data'] = df_numerical_attrs.to_numpy()
+            dataset['name'] = 'table_numerical_variables'
+            dataset['data'] = dataframe_to_np_structured_array(df_numerical_attrs) #df_numerical_attrs.to_numpy()
             dataset['shape'] = dataset['data'].shape
             dataset['dtype'] = type(dataset['data'])
             #dataset['data_units'] = file_obj['wave']['data_units']
             file_dict['datasets'].append(dataset)
-            rows,cols = dataset['shape']

             try:
                 dataset['attributes'] = description_dict['table_header'].copy()
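Dropping rows,cols = dataset['shape'] follows from the new layout: a structured array is one-dimensional, so its shape is (n_rows,) and unpacking two values from it would raise a ValueError. A small illustration (my own sketch, not code from the commit):

    import numpy as np

    arr = np.array([(1.5, 10)], dtype=[('flow', 'f4'), ('count', 'i4')])
    arr.shape             # (1,)  one dimension, one entry per row
    len(arr.dtype.names)  # 2     the column count now lives in the dtype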
@@ -203,29 +247,29 @@ def read_txt_files_as_dict(filename : str ):
             except ValueError as err:
                 print(err)

-            dataset = {}
-            numerical_variables= [item.encode("utf-8") for item in numerical_variables]
-            dataset['name'] = 'numerical_variable_names'
-            dataset['data'] = np.array(numerical_variables).reshape((1,cols))
-            dataset['shape'] = dataset['data'].shape
-            dataset['dtype'] = type(dataset['data'])
-            file_dict['datasets'].append(dataset)
+            #dataset = {}
+            #numerical_variables= [item.encode("utf-8") for item in numerical_variables]
+            #dataset['name'] = 'numerical_variable_names'
+            #dataset['data'] = np.array(numerical_variables).reshape((1,len(numerical_variables)))
+            #dataset['shape'] = dataset['data'].shape
+            #dataset['dtype'] = type(dataset['data'])
+            #file_dict['datasets'].append(dataset)

         if categorical_variables:
             dataset = {}
-            dataset['name'] = 'categorical_variables'
-            dataset['data'] = df_categorical_attrs.loc[:,categorical_variables].to_numpy()
+            dataset['name'] = 'table_categorical_variables'
+            dataset['data'] = dataframe_to_np_structured_array(df_categorical_attrs) #df_categorical_attrs.loc[:,categorical_variables].to_numpy()
             dataset['shape'] = dataset['data'].shape
             dataset['dtype'] = type(dataset['data'])
             file_dict['datasets'].append(dataset)

-            dataset = {}
-            categorical_variables = [item.encode("utf-8") for item in categorical_variables]
-            dataset['name'] = 'categorial_variable_names'
-            dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables)))
-            dataset['shape'] = dataset['data'].shape
-            dataset['dtype'] = type(dataset['data'])
-            file_dict['datasets'].append(dataset)
+            # dataset = {}
+            # categorical_variables = [item.encode("utf-8") for item in categorical_variables]
+            # dataset['name'] = 'categorial_variable_names'
+            # dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables)))
+            # dataset['shape'] = dataset['data'].shape
+            # dataset['dtype'] = type(dataset['data'])
+            # file_dict['datasets'].append(dataset)

     except:
         return {}