Replaced the attributes previously extracted from the table preamble in .txt and .dat files with a single dataset attribute, called table_preamble, that contains the whole table preamble.

2024-03-19 11:40:35 +01:00
parent b886066133
commit 7fe254755f
2 changed files with 57 additions and 35 deletions
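The core of the change is in read_txt_files_as_dict: instead of splitting each preamble line into key/value attributes, every non-empty line before the table header is now concatenated into a single table_preamble string and stored under header_dict["table_preamble"]. Below is a minimal, self-contained sketch of that logic; the file name 'example.dat' and the table_header value are placeholders for illustration, not files or strings from this repository.

# Minimal sketch of the new preamble handling (placeholder file name and header string)
table_header = 'Time(s) Channel#1 Channel#2'
separator = '\t'

header_dict = {}
data_start = False
table_preamble = ""

with open('example.dat', 'r') as f:
    for line_number, line in enumerate(f):
        list_of_substrings = line.split(separator)
        # Accumulate every non-empty line until the table header is found
        if not line == '\n':
            table_preamble += line.strip()
        if table_header in line:
            data_start = True
            column_names = [str(i) + '_' + name for i, name in enumerate(list_of_substrings)]
            break

# The whole preamble is stored as one attribute instead of one entry per preamble key
header_dict["table_preamble"] = table_preamble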

View File

@@ -63,6 +63,22 @@ def read_xps_ibw_file_as_dict(filename):
    return file_dict

+def make_file_copy(source_file_path):
+
+    pathtail, filename = os.path.split(source_file_path)
+    backup_filename = 'backup_'+ filename
+    # Path
+    ROOT_DIR = os.path.abspath(os.curdir)
+    tmp_dirpath = os.path.join(ROOT_DIR,'tmp_files')
+    if not os.path.exists(tmp_dirpath):
+        os.mkdir(tmp_dirpath)
+    tmp_file_path = os.path.join(tmp_dirpath,backup_filename)
+    shutil.copy(source_file_path, tmp_file_path)
+
+    return tmp_file_path
+
def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_name):

    # Create copy of original file to avoid possible file corruption and work with it.
@@ -87,10 +103,9 @@ def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_n
def read_txt_files_as_dict(filename : str ):

    #if instrument_folder == 'smps':
    # Infer from filename whether txt file comes from smps or gas folder
    #TODO: this may be prone to error if assumed folder structure is non compliant
-    if 'RGA' in filename:
+    if 'RGA' in filename: #TODO: it does not work with separator as none :(. fix for RGA
        #end_of_header = 'Channel, Mass(amu), Name, Cal Factor, Noise Floor, CEM Status',
        table_header = 'Time(s) Channel#1 Channel#2 Channel#3 Channel#4 Channel#5 Channel#6 Channel#7 Channel#8'
        separator = None
@@ -99,6 +114,15 @@ def read_txt_files_as_dict(filename : str ):
        separator = '\t'
    #elif 'gas' in filename:
    # end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
+    elif 'Humidity_Sensors' in filename:
+        table_header = 'Date Time RH1[%] RH2[%] RH3[%] RH4[%] RH5[%] RH6[%] RH7[%] RH8[%] T1[°C] T2[°C] T3[°C] T4[°C] T5[°C] T6[°C] T7[°C] T8[°C] DP1[°C] DP2[°C] DP3[°C] DP4[°C] DP5[°C] DP6[°C] DP7[°C] DP8[°C]'
+        separator = '\t'
+    elif 'ICAD' in filename and 'HONO' in filename:
+        table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource'
+        separator = '\t'
+    elif 'ICAD' in filename and 'NO2' in filename:
+        table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource'
+        separator = '\t'
    else:
        return {}
        #raise ValueError('intrument_folder must be set as a either "RGA" or "Pressure"')
@@ -106,43 +130,31 @@ def read_txt_files_as_dict(filename : str ):
    # Read header as a dictionary and detect where data table starts
    header_dict = {}
    data_start = False
+    # Work with copy of the file for safety
+    tmp_filename = make_file_copy(filename)
    with open(filename,'r') as f:
        file_encoding = f.encoding
+        table_preamble = ""
        for line_number, line in enumerate(f):
            list_of_substrings = line.split(separator)
-            if table_header in line:
+            if not (line == '\n'):
+                table_preamble += line.strip() #+ "\n"
+            if table_header in line:
                data_start = True
                column_names = []
-                #for i, name in enumerate(line.split('\t')):
                for i, name in enumerate(list_of_substrings):
                    column_names.append(str(i)+'_'+name)
                print(line_number, len(column_names ))
                break
-            else:
-                # TODO: update to extract information from lines formed by more than two elements separaed by '\t'
-                if list_of_substrings:
-                    key, value = list_of_substrings[0], list_of_substrings[1::]
-                    header_dict[key] = value
-        #if len(end_of_header) > 1 and any([item in line for item in end_of_header]):
-        #    line_numbers.append(line_number)
-        #break
+        header_dict["table_preamble"] = table_preamble

+    if not data_start:
+        raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')

-    #if len(end_of_header) > 0:
    # TODO: it does not work with separater as none :(. fix for RGA
    try:
-        df = pd.read_csv(filename,
+        df = pd.read_csv(tmp_filename,
                         delimiter = separator,
                         header=line_number,
                         #encoding='latin-1',
@@ -155,7 +167,7 @@ def read_txt_files_as_dict(filename : str ):
    numerical_variables = [item for item in df_numerical_attrs.columns]

    # TODO:
-    if 'Pressure' in filename:
+    if 'Pressure' in tmp_filename:
        df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
        df_categorical_attrs = df_categorical_attrs.drop(columns=['0_Date','1_Time'])
@@ -166,7 +178,7 @@ def read_txt_files_as_dict(filename : str ):
    ###
    file_dict = {}
-    path_tail, path_head = os.path.split(filename)
+    path_tail, path_head = os.path.split(tmp_filename)
    file_dict['name'] = path_head
    # TODO: review this header dictionary, it may not be the best way to represent header data
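Taken together, the modified reader now works on a throwaway copy of the input file: make_file_copy duplicates the source into a local tmp_files directory, and pandas then parses the copy rather than the original. A hedged sketch of that flow follows; the input path is a placeholder, and the delimiter and header line would normally come from the header scan shown above.

import os
import shutil
import pandas as pd

def make_file_copy(source_file_path):
    # Copy the source file into ./tmp_files/backup_<name> and return the copy's path
    pathtail, filename = os.path.split(source_file_path)
    backup_filename = 'backup_' + filename
    tmp_dirpath = os.path.join(os.path.abspath(os.curdir), 'tmp_files')
    if not os.path.exists(tmp_dirpath):
        os.mkdir(tmp_dirpath)
    tmp_file_path = os.path.join(tmp_dirpath, backup_filename)
    shutil.copy(source_file_path, tmp_file_path)
    return tmp_file_path

# Illustrative usage: parse the backup copy, not the original file (placeholder path)
tmp_filename = make_file_copy('data/RGA_example.txt')
df = pd.read_csv(tmp_filename, delimiter='\t', header=0)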

View File

@@ -11,10 +11,10 @@ def read_txt_files_as_dict(filename : str ):
    # Infer from filename whether txt file comes from smps or gas folder
    #TODO: this may be prone to error if assumed folder structure is non compliant
    if 'smps' in filename:
-        end_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
+        table_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
        separator = '\t'
    elif 'gas' in filename:
-        end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
+        table_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
        separator = '\t'
    else:
        raise ValueError('intrument_folder must be set as a either "smps" or "gas"')
@@ -24,9 +24,12 @@ def read_txt_files_as_dict(filename : str ):
    data_start = False
    with open(filename,'r') as f:
        file_encoding = f.encoding
+        table_preamble = ""
        for line_number, line in enumerate(f):
            list_of_substrings = line.split(separator)
-            if end_of_header in line:
+            if not (line == '\n'):
+                table_preamble += line.strip() #+ "\n"
+            if table_of_header in line:
                data_start = True
                column_names = []
                for i, name in enumerate(list_of_substrings):
@@ -34,11 +37,8 @@ def read_txt_files_as_dict(filename : str ):
                print(line_number, len(column_names ))
                break
-            # TODO: update to extract information from lines formed by more than two elements separaed by '\t'
-            if list_of_substrings:
-                key, value = list_of_substrings[0], list_of_substrings[1::]
-                header_dict[key] = value
+        header_dict["table_preamble"] = table_preamble

    if not data_start:
        raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')
@@ -84,13 +84,23 @@ def read_txt_files_as_dict(filename : str ):
    file_dict['datasets'].append(dataset)

    rows,cols = dataset['shape']

+    # This lines were added to test the structured array functionality
+    tmp = [tuple(dataset['data'][i,:]) for i in range(dataset['shape'][0])]
+    dtype_tmp = [(numerical_variables[i],'f4') for i in range(dataset['shape'][1])]
+    data = np.array(tmp, dtype=dtype_tmp)
+    dataset['data'] = data
+    dataset['shape'] = dataset['data'].shape

    dataset = {}
    numerical_variables= [item.encode("utf-8") for item in numerical_variables]
    dataset['name'] = 'numerical_variable_names'
    dataset['data'] = np.array(numerical_variables).reshape((1,cols))
    dataset['shape'] = dataset['data'].shape
    dataset['dtype'] = type(dataset['data'])

    file_dict['datasets'].append(dataset)

    if 'timestamps' in categorical_variables:
        dataset = {}