Refactored the code to read .dat and .txt files in binary mode ('rb') first; the prespecified encoding is then used to decode each line. This gives more control over the decoding process and makes it easier to spot possible encoding errors.
This commit is contained in:
@ -93,10 +93,12 @@ def read_txt_files_as_dict(filename : str ):
|
||||
file_encoding = 'latin-1'
|
||||
elif 'ICAD' in filename and 'HONO' in filename:
|
||||
table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource'
|
||||
separator = '\t'
|
||||
separator = '\t'
|
||||
file_encoding = 'latin-1'
|
||||
elif 'ICAD' in filename and 'NO2' in filename:
|
||||
table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource'
|
||||
separator = '\t'
|
||||
file_encoding = 'latin-1'
|
||||
else:
|
||||
return {}
|
||||
#raise ValueError('intrument_folder must be set as a either "RGA" or "Pressure"')
|
||||
@ -107,33 +109,32 @@ def read_txt_files_as_dict(filename : str ):
|
||||
# Work with copy of the file for safety
|
||||
tmp_filename = utils.make_file_copy(source_file_path=filename)
|
||||
|
||||
with open(tmp_filename,'r',encoding=file_encoding,errors='ignore') as f:
|
||||
#file_encoding = f.encoding
|
||||
#table_preamble = ""
|
||||
#with open(tmp_filename,'rb',encoding=file_encoding,errors='ignore') as f:
|
||||
with open(tmp_filename,'rb') as f:
|
||||
table_preamble = []
|
||||
for line_number, line in enumerate(f):
|
||||
|
||||
if table_header in line:
|
||||
list_of_substrings = line.split(separator)
|
||||
if table_header in line.decode(file_encoding):
|
||||
list_of_substrings = line.decode(file_encoding).split(separator)
|
||||
data_start = True
|
||||
column_names = []
|
||||
for i, name in enumerate(list_of_substrings):
|
||||
column_names.append(str(i)+'_'+name)
|
||||
|
||||
print(line_number, len(column_names ))
|
||||
#print(line_number, len(column_names ),'\n')
|
||||
break
|
||||
# Subdivide line into words, and join them by single space.
|
||||
# I assume this produces a cleaner line that contains no stray separator characters (\t, \r), extra spaces, and so on.
|
||||
list_of_substrings = line.split()
|
||||
list_of_substrings = line.decode(file_encoding).split()
|
||||
# TODO: ideally we should use a multiline string, but the YAML parser is not recognizing \n as a special character
|
||||
#line = ' '.join(list_of_substrings+['\n'])
|
||||
line = ' '.join(list_of_substrings)
|
||||
table_preamble.append(line)# += new_line
|
||||
#line = ' '.join(list_of_substrings)
|
||||
table_preamble.append(' '.join(list_of_substrings))# += new_line
|
||||
|
||||
header_dict["table_preamble"] = table_preamble
|
||||
|
||||
|
||||
# TODO: it does not work with separater as none :(. fix for RGA
|
||||
# TODO: it does not work with separator as none :(. fix for RGA
|
||||
try:
|
||||
df = pd.read_csv(tmp_filename,
|
||||
delimiter = separator,
|
||||
|
Reference in New Issue
Block a user