Refactored the code to read .dat and .txt files in binary mode ('rb') first and then decode each line using the prespecified encoding. This gives more control over the decoding process and makes it easier to spot possible encoding errors.

2024-04-02 18:35:04 +02:00
parent f351f102b7
commit 9071120e50
1 changed files with 12 additions and 11 deletions
--- a/src/g5505_file_reader.py
+++ b/src/g5505_file_reader.py
@ -93,10 +93,12 @@ def read_txt_files_as_dict(filename : str ):
        file_encoding = 'latin-1'
    elif 'ICAD' in filename and 'HONO' in filename:
        table_header = 'Start Date/Time (UTC)	Duration (s)	NO2 (ppb)	NO2 Uncertainty (ppb)	H2O (ppb)	H2O Uncertainty (ppb)	CHOCHO (ppb)	CHOCHO Uncertainty (ppb)	File Number	Light Intensity	#ICEDOAS iter.	Cell Pressure	Ambient Pressure	Cell Temp	Spec Temp	Lat	Lon	Height	Speed	GPSQuality	0-Air Ref. Time	0-Air Ref. Duration	0-Air Ref. File Number	0-Air Ref. Intensity	0-Air Ref. Rel Intensity	0-Air Ref. Intensity valid	MeasMode	SampleSource'
-        separator = '\t'        
+        separator = '\t' 
+        file_encoding = 'latin-1'       
    elif 'ICAD' in filename and 'NO2' in filename:
        table_header = 'Start Date/Time (UTC)	Duration (s)	NO2 (ppb)	NO2 Uncertainty (ppb)	H2O (ppb)	H2O Uncertainty (ppb)	CHOCHO (ppb)	CHOCHO Uncertainty (ppb)	File Number	Light Intensity	#ICEDOAS iter.	Cell Pressure	Ambient Pressure	Cell Temp	Spec Temp	Lat	Lon	Height	Speed	GPSQuality	0-Air Ref. Time	0-Air Ref. Duration	0-Air Ref. File Number	0-Air Ref. Intensity	0-Air Ref. Rel Intensity	0-Air Ref. Intensity valid	MeasMode	SampleSource'
        separator = '\t'
+        file_encoding = 'latin-1'
    else:
        return {}
        #raise ValueError('intrument_folder must be set as a either "RGA" or "Pressure"')
@ -107,33 +109,32 @@ def read_txt_files_as_dict(filename : str ):
    # Work with copy of the file for safety
    tmp_filename = utils.make_file_copy(source_file_path=filename)

-    with open(tmp_filename,'r',encoding=file_encoding,errors='ignore') as f:
-        #file_encoding = f.encoding
-        #table_preamble = ""
+    #with open(tmp_filename,'rb',encoding=file_encoding,errors='ignore') as f:
+    with open(tmp_filename,'rb') as f:
        table_preamble = []
        for line_number, line in enumerate(f):        
            
-            if table_header in line:   
-                list_of_substrings = line.split(separator)             
+            if table_header in line.decode(file_encoding):   
+                list_of_substrings = line.decode(file_encoding).split(separator)             
                data_start = True  
                column_names = []
                for i, name in enumerate(list_of_substrings):
                    column_names.append(str(i)+'_'+name) 

-                print(line_number, len(column_names ))
+                #print(line_number, len(column_names ),'\n')
                break
            # Subdivide line into words, and join them by single space. 
            # I asumme this can produce a cleaner line that contains no weird separator characters \t \r or extra spaces and so on.
-            list_of_substrings = line.split()
+            list_of_substrings = line.decode(file_encoding).split()
            # TODO: ideally we should use a multilinear string but the yalm parser is not recognizing \n as special character
            #line = ' '.join(list_of_substrings+['\n'])
-            line = ' '.join(list_of_substrings)     
-            table_preamble.append(line)# += new_line  
+            #line = ' '.join(list_of_substrings)     
+            table_preamble.append(' '.join(list_of_substrings))# += new_line  

        header_dict["table_preamble"] = table_preamble

   
-    # TODO: it does not work with separater as none :(. fix for RGA
+    # TODO: it does not work with separator as none :(. fix for RGA
    try:
        df = pd.read_csv(tmp_filename, 
                        delimiter = separator,