Implemented a file reader module for smog chamber file formats, smog_chamber_group_reader.py. Its usage is equivalent to that of g5505_file_reader.py; it reads the file formats of the multiphase chemistry group.

This commit is contained in:
2024-02-13 16:44:32 +01:00
parent 03dcc62f9a
commit e082e066b1

View File

@ -0,0 +1,86 @@
import pandas as pd
import matplotlib.pyplot as plt
def read_smog_chamber_txt_files_as_dict(filename : str ,instrument_folder : str):
    """Read a tab-delimited smog chamber text file into a dictionary.

    The file is expected to start with free-form header lines, followed by a
    row of column names (recognised through an instrument-specific marker),
    followed by the data table.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    instrument_folder : str
        Either ``'smps'`` or ``'gas'``; selects the marker used to recognise
        the row of column names.

    Returns
    -------
    dict
        ``'header_dict'``: maps the first tab-separated field of every header
        line to the list of the remaining fields of that line.
        ``'data'``: numpy array built from the numerical columns of the table.
        ``'data_column_names'``: UTF-8 encoded names of those columns.
        ``'categ_data_dict'``: the non-numerical columns, as lists keyed by
        column name.

    Raises
    ------
    ValueError
        If ``instrument_folder`` is not supported, or if the row of column
        names is not found in the file.
    """
    if instrument_folder == 'smps':
        end_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
    elif instrument_folder == 'gas':
        end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
    else:
        raise ValueError('instrument_folder must be set as either "smps" or "gas"')

    # The marker is compared on whitespace-normalised text so that it matches
    # the column-name row regardless of whether names are separated by tabs
    # or by spaces.
    header_marker = ' '.join(end_of_header.split())

    # Read header as a dictionary and detect where data table starts
    header_dict = {}
    column_names = None
    table_header_line = None
    # Use the same encoding as the pandas call below, so both passes over the
    # file decode it identically on every platform.
    with open(filename, 'r', encoding='latin-1') as f:
        for line_number, raw_line in enumerate(f):
            # Drop the line terminator so it does not leak into names/values.
            line = raw_line.rstrip('\r\n')
            if not line.strip():
                continue  # blank lines carry no header information
            fields = line.split('\t')
            if header_marker in ' '.join(line.split()):
                # Prefix each name with its position to keep names unique.
                column_names = [str(i) + '_' + name for i, name in enumerate(fields)]
                table_header_line = line_number
                break
            # TODO: update to extract information from lines formed by more than two elements separated by '\t'
            header_dict[fields[0]] = fields[1:]

    if column_names is None:
        raise ValueError('file appears to be invalid. Data start condition in txt file was not met.')

    # `skiprows` counts raw file lines, whereas `header=<int>` ignores blank
    # lines when skip_blank_lines=True; skipping by raw line count keeps the
    # table start correct even if the header section contains blank lines.
    df = pd.read_csv(filename,
                     delimiter="\t",
                     skiprows=table_header_line + 1,
                     header=None,
                     encoding='latin-1',
                     names=column_names,
                     skip_blank_lines=True)

    df_numerical_attrs = df.select_dtypes(include='number')
    df_categorical_attrs = df.select_dtypes(exclude='number')

    data_column_names = [item.encode("utf-8") for item in df_numerical_attrs.columns]

    output_dict = {'header_dict': header_dict,
                   'data': df_numerical_attrs.to_numpy(),
                   'data_column_names': data_column_names,
                   'categ_data_dict': df_categorical_attrs.to_dict(orient='list')
                   }
    return output_dict
def main():
    """Read one hard-coded sample file and return the resulting dictionary.

    Prints ':)' once the file has been read without raising.
    """
    # 'M:\\gas\\20220705_000004_MSC_gases.txt' is a corrupted file, not used.
    # Sample (file, instrument folder) pairs; the last one is the one read.
    samples = [
        ('M:\\gas\\20220726_101617_MSC_gases.txt', 'gas'),
        ('M:\\smps\\20220726\\20220726_num.TXT', 'smps'),
    ]
    filename, instrument_folder = samples[-1]

    result = read_smog_chamber_txt_files_as_dict(filename, instrument_folder)
    print(':)')
    return result
if __name__ == '__main__':
    # Run the sample read and summarise the columns that were found.
    output_dict = main()

    # The reader returns the numerical column names under 'data_column_names'
    # and the non-numerical columns as a dict under 'categ_data_dict'; there
    # are no 'num_data_df' / 'categ_data_df' entries in its output.
    numerical_names = output_dict['data_column_names']
    categorical_names = list(output_dict['categ_data_dict'])

    print(numerical_names, len(numerical_names), '\n')
    print(categorical_names, len(categorical_names))