Implemented a file reader module for smog chamber file formats, smog_chamber_group_reader.py. Its use is equivalent to that of g5505_file_reader.py, which is used to read the file formats of the multiphase chemistry group.

2024-02-13 16:44:32 +01:00
parent 03dcc62f9a
commit e082e066b1
1 changed files with 86 additions and 0 deletions
--- a/smog_chamber_group_reader.py
+++ b/smog_chamber_group_reader.py
@@ -0,0 +1,86 @@
+
+import pandas as pd
+import matplotlib.pyplot as plt
+
+
+def read_smog_chamber_txt_files_as_dict(filename: str, instrument_folder: str) -> dict:
+    """Read a tab-separated smog chamber txt file into a dictionary.
+
+    The file is scanned line by line until the column-title row expected for
+    the given instrument is found. Lines before that row become header
+    entries; the lines after it are parsed as the data table.
+
+    Args:
+        filename: Path of the txt file to read.
+        instrument_folder: 'smps' or 'gas'; selects the expected title row.
+
+    Returns:
+        A dict with keys 'header_dict' (first field of each header line ->
+        list of its remaining fields), 'data' (numeric columns as a numpy
+        array), 'data_column_names' (their '<position>_<title>' names, utf-8
+        encoded) and 'categ_data_dict' (non-numeric columns, lists by name).
+
+    Raises:
+        ValueError: Unknown instrument_folder, or title row not found.
+    """
+    end_of_header_by_instrument = {
+        'smps': 'Sample #	Date	Start Time	Sample Temp (C)	Sample Pressure (kPa)',
+        'gas': 'Date_Time	HoribaNO	HoribaNOy	Thermo42C_NO	Thermo42C_NOx	APHA370 CH4',
+    }
+    if instrument_folder not in end_of_header_by_instrument:
+        raise ValueError('instrument_folder must be either "smps" or "gas"')
+    end_of_header = end_of_header_by_instrument[instrument_folder]
+
+    # Read header as a dictionary and detect where the data table starts.
+    header_dict = {}
+    column_names = None
+    # Same encoding as read_csv below, so both passes decode the file alike.
+    with open(filename, 'r', encoding='latin-1') as f:
+        for line_number, line in enumerate(f):
+            fields = line.rstrip('\r\n').split('\t')  # terminator must not leak into fields
+            if end_of_header in line:
+                column_names = [str(i) + '_' + name for i, name in enumerate(fields)]
+                break
+            # TODO: update to extract information from lines formed by more than two elements separated by '\t'
+            if line.strip():  # blank lines carry no header information
+                header_dict[fields[0]] = fields[1:]
+
+    if column_names is None:
+        raise ValueError('file appears to be invalid. Data start condition in txt file was not met.')
+
+    # skiprows counts physical lines, whereas header=<int> ignores blank lines
+    # and would point past the title row when the header block contains any.
+    df = pd.read_csv(filename, delimiter='\t', header=None, names=column_names,
+                     skiprows=line_number + 1, encoding='latin-1', skip_blank_lines=True)
+    df_numerical_attrs = df.select_dtypes(include='number')
+    df_categorical_attrs = df.select_dtypes(exclude='number')
+
+    return {'header_dict': header_dict,
+            'data': df_numerical_attrs.to_numpy(),
+            'data_column_names': [c.encode('utf-8') for c in df_numerical_attrs.columns],
+            'categ_data_dict': df_categorical_attrs.to_dict(orient='list')}
+
+def main():
+    """Run the reader on a hard-coded sample file and return the parsed dict.
+
+    The path points at the ``M:`` drive, so this only works where it exists.
+    """
+    # Other files used during development:
+    #   'M:\\gas\\20220705_000004_MSC_gases.txt' (corrupted file)
+    #   'M:\\gas\\20220726_101617_MSC_gases.txt' with instrument_folder 'gas'
+    filename = 'M:\\smps\\20220726\\20220726_num.TXT'
+    instrument_folder = 'smps'
+
+    result = read_smog_chamber_txt_files_as_dict(filename, instrument_folder)
+
+    print(':)')
+
+    return result
+
+
+if __name__ == '__main__':
+    # Summarise the parsed result using the keys the reader actually returns.
+    output_dict = main()
+    categ_data = output_dict['categ_data_dict']
+    print(output_dict['data_column_names'], len(output_dict['data_column_names']), '\n')
+    print(list(categ_data), len(categ_data))