import os

import pandas as pd

try:
    import matplotlib.pyplot as plt
except ImportError:
    # The parser below does not plot anything, so a missing matplotlib
    # installation must not prevent this module from being imported.
    plt = None


# Text identifying the column-title row (the last row of the header section)
# for each supported instrument folder.
_END_OF_HEADER_MARKERS = {
    'smps': 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)',
    'gas': 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4',
}


def _collapse_whitespace(text):
    """Return *text* with every run of whitespace replaced by one space."""
    return ' '.join(text.split())


def _read_header(filename, end_of_header):
    """Scan the header section of *filename* up to the column-title row.

    Returns a tuple ``(header_dict, column_names, header_line_number)``:
    ``header_dict`` maps the first tab-separated field of every non-blank
    header line to the list of its remaining fields, ``column_names`` holds
    the titles of the data table prefixed with their position (``'0_...'``,
    ``'1_...'``), and ``header_line_number`` is the 0-based physical line
    index of the column-title row.

    Raises ValueError when no line contains *end_of_header*.
    """
    # Compare on whitespace-collapsed text so the marker is found whether the
    # titles in the file are separated by tabs or by spaces.
    marker = _collapse_whitespace(end_of_header)
    header_dict = {}

    # Same encoding as the pd.read_csv call that parses the data table, so
    # non-ASCII header text cannot fail under the platform default encoding.
    with open(filename, 'r', encoding='latin-1') as f:
        for line_number, raw_line in enumerate(f):
            # Drop the line terminator so it does not leak into the last
            # column name or the last header value.
            line = raw_line.rstrip('\r\n')
            fields = line.split('\t')

            if marker in _collapse_whitespace(line):
                column_names = [str(i) + '_' + name
                                for i, name in enumerate(fields)]
                return header_dict, column_names, line_number

            if not line.strip():
                continue  # blank lines carry no header information

            # TODO: update to extract information from lines formed by more
            # than two elements separated by '\t'
            header_dict[fields[0]] = fields[1:]

    raise ValueError('file appears to be invalid. '
                     'Data start condition in txt file was not met.')


def read_smog_chamber_txt_files_as_dict(filename: str, instrument_folder: str):
    """Read a tab-delimited instrument text file into a dictionary.

    The file is expected to start with free-form header lines, followed by a
    column-title row and then the data table.

    Parameters
    ----------
    filename
        Path of the text file to read.
    instrument_folder
        Either ``'smps'`` or ``'gas'``; selects the column-title row to look
        for and how the timestamp column is built.

    Returns
    -------
    dict
        ``'header_dict'``: header lines as ``{first_field: [other fields]}``.
        ``'data'``: numeric columns as a NumPy array.
        ``'data_column_names'``: names of those columns, UTF-8 encoded bytes.
        ``'categ_data_dict'``: non-numeric columns as ``{name: list}``. For
        ``'smps'`` the ``'1_Date'`` and ``'2_Start Time'`` columns are merged
        into ``'1_Timestamp'``; for ``'gas'`` ``'0_Date_Time'`` is renamed to
        ``'0_Timestamp'``.

    Raises
    ------
    ValueError
        If *instrument_folder* is not supported, or if the column-title row
        is not found in the file.
    """
    if instrument_folder not in ('smps', 'gas'):
        raise ValueError(
            'instrument_folder must be set as either "smps" or "gas"')
    end_of_header = _END_OF_HEADER_MARKERS[instrument_folder]

    header_dict, column_names, header_line_number = _read_header(
        filename, end_of_header)
    print(header_line_number, len(column_names))

    # header_line_number is a physical line index. pandas' ``header`` option
    # ignores blank lines when skip_blank_lines=True, whereas ``skiprows``
    # counts every physical line, so skip up to and including the title row
    # and supply the column names explicitly.
    df = pd.read_csv(filename,
                     delimiter='\t',
                     skiprows=header_line_number + 1,
                     header=None,
                     names=column_names,
                     encoding='latin-1',
                     skip_blank_lines=True)

    df_numerical_attrs = df.select_dtypes(include='number')
    # Explicit copy: a column is added below and must not write into ``df``.
    df_categorical_attrs = df.select_dtypes(exclude='number').copy()

    if instrument_folder == 'smps':
        df_categorical_attrs['1_Timestamp'] = (
            df_categorical_attrs['1_Date']
            + ' '
            + df_categorical_attrs['2_Start Time'])
        df_categorical_attrs = df_categorical_attrs.drop(
            columns=['1_Date', '2_Start Time'])
    elif instrument_folder == 'gas':
        df_categorical_attrs = df_categorical_attrs.rename(
            columns={'0_Date_Time': '0_Timestamp'})

    data_column_names = [item.encode("utf-8")
                         for item in df_numerical_attrs.columns]

    output_dict = {
        'header_dict': header_dict,
        'data': df_numerical_attrs.to_numpy(),
        'data_column_names': data_column_names,
        'categ_data_dict': df_categorical_attrs.to_dict(orient='list'),
    }

    return output_dict


def main():
    """Read one hard-coded example file and return the parsed dictionary."""
    # filename = 'M:\\gas\\20220705_000004_MSC_gases.txt'  (corrupted file)
    # filename = 'M:\\gas\\20220726_101617_MSC_gases.txt'

    root_dir = '\\\\fs03\\Iron_Sulphate'

    # To read the SMPS example instead, use:
    #   instrument_folder = 'smps'
    #   filename_path = os.path.join(root_dir, 'smps\\20220726\\20220726_num.TXT')
    instrument_folder = 'gas'
    filename_path = os.path.join(root_dir, 'gas\\20220726_101617_MSC_gases.txt')

    result = read_smog_chamber_txt_files_as_dict(filename_path, instrument_folder)

    print(':)')

    return result


if __name__ == '__main__':

    output_dict = main()

    # Report the keys the reader actually returns.
    numerical_names = output_dict['data_column_names']
    categorical_names = list(output_dict['categ_data_dict'])
    print(numerical_names, len(numerical_names), '\n')
    print(categorical_names, len(categorical_names))