"""Read smog chamber instrument text files into plain dictionaries.

A file is expected to consist of a free-form, tab-separated header, followed
by a row of column titles and then a tab-separated data table.  Which title
row marks the end of the header depends on the instrument folder the file
comes from ('smps' or 'gas').
"""

import pandas as pd

try:
    # Nothing in this module uses matplotlib; the name is kept available for
    # interactive use, but its absence must not make the reader unusable.
    import matplotlib.pyplot as plt  # noqa: F401
except ImportError:
    plt = None


# Encoding used for BOTH the header scan and the table read, so the two
# passes over the file always agree on how bytes are decoded.
_FILE_ENCODING = 'latin-1'

# Text identifying the column-title row that ends the header, per instrument.
# Markers are compared after collapsing whitespace runs (see
# `_collapse_whitespace`), so they match whether the title row is separated
# by tabs or by spaces.
_END_OF_HEADER_BY_INSTRUMENT = {
    'smps': 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)',
    'gas': 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4',
}


def _collapse_whitespace(text: str) -> str:
    """Return *text* with every run of whitespace replaced by one space."""
    return ' '.join(text.split())


def _scan_header(filename: str, end_of_header: str):
    """Parse the header of *filename* and locate the column-title row.

    Returns:
        A tuple ``(header_dict, column_names, line_number)`` where
        ``header_dict`` maps the first tab-separated field of every header
        line to the list of its remaining fields, ``column_names`` are the
        titles of the title row prefixed with their position, and
        ``line_number`` is the 0-based physical line index of the title row.

    Raises:
        ValueError: If no line contains *end_of_header*.
    """
    marker = _collapse_whitespace(end_of_header)
    header_dict = {}

    with open(filename, 'r', encoding=_FILE_ENCODING) as f:
        for line_number, line in enumerate(f):
            # Drop the line terminator so it does not leak into keys,
            # values or the last column name.
            fields = line.rstrip('\r\n').split('\t')

            if marker in _collapse_whitespace(line):
                # Prefix each title with its position: titles alone are not
                # guaranteed to be unique.
                column_names = [f'{i}_{name}' for i, name in enumerate(fields)]
                return header_dict, column_names, line_number

            if not any(fields):
                # Blank (or tab-only) line: carries no header information.
                continue

            # TODO: update to extract information from lines formed by more
            # than two elements separated by '\t'
            header_dict[fields[0]] = fields[1:]

    raise ValueError('file appears to be invalid. '
                     'Data start condition in txt file was not met.')


def read_smog_chamber_txt_files_as_dict(filename: str, instrument_folder: str) -> dict:
    """Read one smog chamber text file into a dictionary.

    Args:
        filename: Path of the tab-separated text file to read.
        instrument_folder: Either 'smps' or 'gas'; selects the title row
            that marks where the header ends and the data table starts.

    Returns:
        A dictionary with the keys:

        * ``'header_dict'``: header lines, keyed by their first field, each
          mapped to the list of the remaining tab-separated fields.
        * ``'data'``: NumPy array holding the numerical columns of the table.
        * ``'data_column_names'``: UTF-8 encoded names of those columns.
        * ``'categ_data_dict'``: the non-numerical columns, as
          ``{column_name: list_of_values}``.

    Raises:
        ValueError: If *instrument_folder* is not supported, or if the title
            row that ends the header is not found in the file.
    """
    try:
        end_of_header = _END_OF_HEADER_BY_INSTRUMENT[instrument_folder]
    except (KeyError, TypeError):
        raise ValueError(
            'instrument_folder must be set as either "smps" or "gas"'
        ) from None

    header_dict, column_names, line_number = _scan_header(filename, end_of_header)

    print(line_number, len(column_names))

    # `line_number` is a physical line index.  `header=<int>` would ignore
    # blank lines when counting (skip_blank_lines=True) and so point past the
    # title row whenever the header contains blank lines; `skiprows` counts
    # physical lines, so skip everything up to and including the title row.
    df = pd.read_csv(
        filename,
        delimiter='\t',
        skiprows=line_number + 1,
        header=None,
        names=column_names,
        encoding=_FILE_ENCODING,
        skip_blank_lines=True,
    )

    df_numerical_attrs = df.select_dtypes(include='number')
    df_categorical_attrs = df.select_dtypes(exclude='number')

    data_column_names = [name.encode('utf-8') for name in df_numerical_attrs.columns]

    return {
        'header_dict': header_dict,
        'data': df_numerical_attrs.to_numpy(),
        'data_column_names': data_column_names,
        'categ_data_dict': df_categorical_attrs.to_dict(orient='list'),
    }


def main():
    """Read a hard-coded example file and return the resulting dictionary."""
    # Alternative inputs used during development:
    #   'M:\\gas\\20220705_000004_MSC_gases.txt'   (corrupted file)
    #   'M:\\gas\\20220726_101617_MSC_gases.txt'   with instrument_folder = 'gas'
    filename = 'M:\\smps\\20220726\\20220726_num.TXT'
    instrument_folder = 'smps'

    result = read_smog_chamber_txt_files_as_dict(filename, instrument_folder)

    print(':)')

    return result


if __name__ == '__main__':

    output_dict = main()

    # Report the keys the reader actually returns.
    numerical_names = output_dict['data_column_names']
    categorical_names = list(output_dict['categ_data_dict'])

    print(numerical_names, len(numerical_names), '\n')
    print(categorical_names, len(categorical_names))