Moved smog_chamber_file_reader.py to src folder.
This commit is contained in:
98
src/smog_chamber_file_reader.py
Normal file
98
src/smog_chamber_file_reader.py
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import os
|
||||||
|
|
||||||
|
def read_txt_files_as_dict(filename: str, instrument_folder: str) -> dict:
    """Parse a tab-delimited instrument text file into a plain dictionary.

    The file is read as a free-form preamble, followed by a column-title line
    (recognised by an instrument-specific marker) and then the data table.

    Parameters
    ----------
    filename : str
        Path of the text file to read. The file is decoded as latin-1.
    instrument_folder : str
        Either ``'smps'`` or ``'gas'``. Selects the marker that identifies the
        column-title line and the way timestamp columns are post-processed.

    Returns
    -------
    dict
        ``'header_dict'``
            Maps the first tab-separated field of every non-blank preamble
            line to the list of the remaining fields on that line.
        ``'data'``
            NumPy array holding the numeric columns of the data table.
        ``'data_column_names'``
            Names of those numeric columns as UTF-8 encoded ``bytes``. Every
            column name is the title prefixed with its position (``'3_Name'``).
        ``'categ_data_dict'``
            Non-numeric columns as ``{column name: list of values}``. For
            ``'smps'`` the ``'1_Date'`` and ``'2_Start Time'`` columns are
            merged into ``'1_Timestamp'``; for ``'gas'`` the ``'0_Date_Time'``
            column is renamed to ``'0_Timestamp'``.

    Raises
    ------
    ValueError
        If ``instrument_folder`` is not ``'smps'`` or ``'gas'``, or if no line
        of the file contains the expected column-title marker.
    """
    # Imported locally so the function does not depend on a module-level
    # import that the file does not have.
    import io

    if instrument_folder == 'smps':
        end_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
    elif instrument_folder == 'gas':
        end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
    else:
        raise ValueError('instrument_folder must be set as either "smps" or "gas"')

    # NOTE(review): the marker is matched as a plain substring of the title
    # line, while the fields of that line are split on tabs. Confirm that the
    # whitespace inside the markers matches the instrument files exactly.

    header_dict = {}
    column_names = None
    table_text = ''

    # The same encoding is used for the preamble and for the table, so bytes
    # outside ASCII cannot make the preamble scan fail.
    with open(filename, 'r', encoding='latin-1') as f:
        for line in f:
            if end_of_header in line:
                # Strip only the line terminator: the number of titles must
                # stay equal to the number of tab-separated fields.
                titles = line.rstrip('\r\n').split('\t')
                column_names = [str(i) + '_' + name for i, name in enumerate(titles)]
                # Everything after the title line is the data table. Handing
                # this text to pandas avoids translating a physical line
                # number into pandas' blank-line-skipping header index.
                table_text = f.read()
                break

            if not line.strip():
                continue

            # TODO: update to extract information from lines formed by more
            # than two elements separated by '\t'
            fields = line.rstrip('\r\n').split('\t')
            header_dict[fields[0]] = fields[1:]

    if column_names is None:
        raise ValueError('file appears to be invalid. Data start condition in txt file was not met.')

    if table_text.strip():
        df = pd.read_csv(io.StringIO(table_text),
                         delimiter='\t',
                         header=None,
                         names=column_names,
                         skip_blank_lines=True)
    else:
        # Title line present but no rows: return empty containers instead of
        # letting pandas raise on empty input.
        df = pd.DataFrame(columns=column_names)

    df_numerical_attrs = df.select_dtypes(include='number')
    # Explicit copy: a column is assigned below and must not write through to
    # (or warn about) the parent frame.
    df_categorical_attrs = df.select_dtypes(exclude='number').copy()

    if instrument_folder == 'smps':
        df_categorical_attrs['1_Timestamp'] = (
            df_categorical_attrs['1_Date'] + ' ' + df_categorical_attrs['2_Start Time']
        )
        df_categorical_attrs = df_categorical_attrs.drop(columns=['1_Date', '2_Start Time'])
    elif instrument_folder == 'gas':
        df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Date_Time': '0_Timestamp'})

    data_column_names = [item.encode("utf-8") for item in df_numerical_attrs.columns]

    output_dict = {
        'header_dict': header_dict,
        'data': df_numerical_attrs.to_numpy(),
        'data_column_names': data_column_names,
        'categ_data_dict': df_categorical_attrs.to_dict(orient='list'),
    }

    return output_dict
|
||||||
|
|
||||||
|
def main():
    """Read one hard-coded sample file and return the parsed dictionary.

    The gas-analyser sample is the one currently selected; the SMPS sample is
    kept in the lookup table so it can be chosen by changing
    ``instrument_folder``.
    """
    # Earlier samples, kept for reference:
    #   'M:\\gas\\20220705_000004_MSC_gases.txt'  (corrupted file)
    #   'M:\\gas\\20220726_101617_MSC_gases.txt'
    root_dir = '\\\\fs03\\Iron_Sulphate'

    # Relative location of the sample file for each supported instrument.
    sample_files = {
        'smps': 'smps\\20220726\\20220726_num.TXT',
        'gas': 'gas\\20220726_101617_MSC_gases.txt',
    }

    instrument_folder = 'gas'
    filename_path = os.path.join(root_dir, sample_files[instrument_folder])

    result = read_txt_files_as_dict(filename_path, instrument_folder)

    print(':)')

    return result
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':

    output_dict = main()

    # The reader returns plain containers under the keys 'data',
    # 'data_column_names' and 'categ_data_dict' (no DataFrames), so report
    # the column names and counts from those.
    numeric_names = output_dict['data_column_names']
    categorical_names = list(output_dict['categ_data_dict'])

    print(numeric_names, len(numeric_names), '\n')
    print(categorical_names, len(categorical_names))
|
Reference in New Issue
Block a user