From a410bde23e7c99b4755e6ba503ad7256e87e4510 Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Mon, 10 Jun 2024 16:18:51 +0200 Subject: [PATCH] Removed data table split into categorical and numerical variables and numbering is only introduced to disambiguate repeated columns. --- src/g5505_file_reader.py | 60 +++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/src/g5505_file_reader.py b/src/g5505_file_reader.py index a253c60..fdf4009 100644 --- a/src/g5505_file_reader.py +++ b/src/g5505_file_reader.py @@ -112,6 +112,8 @@ def dataframe_to_np_structured_array(df: pd.DataFrame): return structured_array +from collections import Counter + def read_txt_files_as_dict(filename : str , work_with_copy : bool = True ): with open('src/instruments/text_data_sources.yaml','r') as stream: @@ -163,9 +165,15 @@ def read_txt_files_as_dict(filename : str , work_with_copy : bool = True ): for line_number, line in enumerate(f): if table_header in line.decode(file_encoding): - list_of_substrings = line.decode(file_encoding).split(separator) + list_of_substrings = line.decode(file_encoding).split(separator) + + # Count occurrences of each substring + substring_counts = Counter(list_of_substrings) data_start = True - column_names = [str(i)+'_'+name.strip() for i, name in enumerate(list_of_substrings)] + # Generate column names with appended index only for repeated substrings + column_names = [f"{i}_{name.strip()}" if substring_counts[name] > 1 else name.strip() for i, name in enumerate(list_of_substrings)] + + #column_names = [str(i)+'_'+name.strip() for i, name in enumerate(list_of_substrings)] #column_names = [] #for i, name in enumerate(list_of_substrings): # column_names.append(str(i)+'_'+name) @@ -210,25 +218,28 @@ def read_txt_files_as_dict(filename : str , work_with_copy : bool = True ): #df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in 
df.index] - df_categorical_attrs['timestamps'] = df_categorical_attrs[timestamp_variables].astype(str).agg(' '.join, axis=1) + #df_categorical_attrs['timestamps'] = df_categorical_attrs[timestamp_variables].astype(str).agg(' '.join, axis=1) + timestamps_name = ' '.join(timestamp_variables) + df_categorical_attrs[ timestamps_name] = df_categorical_attrs[timestamp_variables].astype(str).agg(' '.join, axis=1) valid_indices = [] if datetime_format: - df_categorical_attrs['timestamps'] = pd.to_datetime(df_categorical_attrs['timestamps'],format=datetime_format,errors='coerce') - valid_indices = df_categorical_attrs.dropna(subset=['timestamps']).index + df_categorical_attrs[ timestamps_name] = pd.to_datetime(df_categorical_attrs[ timestamps_name],format=datetime_format,errors='coerce') + valid_indices = df_categorical_attrs.dropna(subset=[timestamps_name]).index df_categorical_attrs = df_categorical_attrs.loc[valid_indices,:] df_numerical_attrs = df_numerical_attrs.loc[valid_indices,:] - df_categorical_attrs['timestamps'] = df_categorical_attrs['timestamps'].dt.strftime(config_dict['default']['desired_format']) - startdate = df_categorical_attrs['timestamps'].min() - enddate = df_categorical_attrs['timestamps'].max() + df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamps_name].dt.strftime(config_dict['default']['desired_format']) + startdate = df_categorical_attrs[timestamps_name].min() + enddate = df_categorical_attrs[timestamps_name].max() - df_categorical_attrs['timestamps'] = df_categorical_attrs['timestamps'].astype(str) + df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamps_name].astype(str) #header_dict.update({'stastrrtdate':startdate,'enddate':enddate}) header_dict['startdate']= str(startdate) header_dict['enddate']=str(enddate) - df_categorical_attrs = df_categorical_attrs.drop(columns = timestamp_variables) + if len(timestamp_variables) > 1: + df_categorical_attrs = df_categorical_attrs.drop(columns = timestamp_variables) 
#df_categorical_attrs.reindex(drop=True) @@ -253,12 +264,11 @@ def read_txt_files_as_dict(filename : str , work_with_copy : bool = True ): if numerical_variables: dataset = {} - dataset['name'] = 'table_numerical_variables' - dataset['data'] = dataframe_to_np_structured_array(df_numerical_attrs) #df_numerical_attrs.to_numpy() + dataset['name'] = 'data_table'#_numerical_variables' + dataset['data'] = dataframe_to_np_structured_array(pd.concat((df_categorical_attrs,df_numerical_attrs),axis=1)) #df_numerical_attrs.to_numpy() dataset['shape'] = dataset['data'].shape dataset['dtype'] = type(dataset['data']) - #dataset['data_units'] = file_obj['wave']['data_units'] - file_dict['datasets'].append(dataset) + #dataset['data_units'] = file_obj['wave']['data_units'] try: dataset['attributes'] = description_dict['table_header'].copy() @@ -267,19 +277,23 @@ def read_txt_files_as_dict(filename : str , work_with_copy : bool = True ): dataset['attributes'].pop(key) # delete key else: dataset['attributes'][key] = metadata.parse_attribute(dataset['attributes'][key]) + if timestamps_name in categorical_variables: + dataset['attributes'][timestamps_name] = metadata.parse_attribute({'unit':'YYYY-MM-DD HH:MM:SS.ffffff'}) except ValueError as err: print(err) + file_dict['datasets'].append(dataset) + - if categorical_variables: - dataset = {} - dataset['name'] = 'table_categorical_variables' - dataset['data'] = dataframe_to_np_structured_array(df_categorical_attrs) #df_categorical_attrs.loc[:,categorical_variables].to_numpy() - dataset['shape'] = dataset['data'].shape - dataset['dtype'] = type(dataset['data']) - if 'timestamps' in categorical_variables: - dataset['attributes'] = {'timestamps': metadata.parse_attribute({'unit':'YYYY-MM-DD HH:MM:SS.ffffff'})} - file_dict['datasets'].append(dataset) + #if categorical_variables: + # dataset = {} + # dataset['name'] = 'table_categorical_variables' + # dataset['data'] = dataframe_to_np_structured_array(df_categorical_attrs) 
#df_categorical_attrs.loc[:,categorical_variables].to_numpy() + # dataset['shape'] = dataset['data'].shape + # dataset['dtype'] = type(dataset['data']) + # if timestamps_name in categorical_variables: + # dataset['attributes'] = {timestamps_name: metadata.parse_attribute({'unit':'YYYY-MM-DD HH:MM:SS.ffffff'})} + # file_dict['datasets'].append(dataset)