Replaced the individual attributes previously extracted from the table preamble of .txt and .dat files with a single dataset attribute called table_preamble, which contains the whole table preamble.

This commit is contained in:
2024-03-19 11:40:35 +01:00
parent b886066133
commit 7fe254755f
2 changed files with 57 additions and 35 deletions

View File

@ -63,6 +63,22 @@ def read_xps_ibw_file_as_dict(filename):
return file_dict
def make_file_copy(source_file_path):
    """Copy a file into a 'tmp_files' folder under the current working directory.

    The copy is named 'backup_<original file name>' so callers can work on the
    copy instead of the original file.

    Parameters
    ----------
    source_file_path : str
        Path of the file to copy.

    Returns
    -------
    str
        Absolute path of the copy inside the 'tmp_files' folder.

    Raises
    ------
    OSError
        If the 'tmp_files' folder cannot be created or the file cannot be copied.
    """
    backup_filename = 'backup_' + os.path.basename(source_file_path)

    # The temporary folder is anchored at the current working directory.
    root_dir = os.path.abspath(os.curdir)
    tmp_dirpath = os.path.join(root_dir, 'tmp_files')

    # exist_ok=True avoids the check-then-create race of exists() + mkdir():
    # the call succeeds even if the folder appears between check and creation.
    os.makedirs(tmp_dirpath, exist_ok=True)

    tmp_file_path = os.path.join(tmp_dirpath, backup_filename)
    shutil.copy(source_file_path, tmp_file_path)

    return tmp_file_path
def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_name):
# Create copy of original file to avoid possible file corruption and work with it.
@ -87,10 +103,9 @@ def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_n
def read_txt_files_as_dict(filename : str ):
#if instrument_folder == 'smps':
# Infer from filename whether txt file comes from smps or gas folder
#TODO: this may be prone to error if assumed folder structure is non compliant
if 'RGA' in filename:
if 'RGA' in filename: #TODO: it does not work with separator as none :(. fix for RGA
#end_of_header = 'Channel, Mass(amu), Name, Cal Factor, Noise Floor, CEM Status',
table_header = 'Time(s) Channel#1 Channel#2 Channel#3 Channel#4 Channel#5 Channel#6 Channel#7 Channel#8'
separator = None
@ -99,6 +114,15 @@ def read_txt_files_as_dict(filename : str ):
separator = '\t'
#elif 'gas' in filename:
# end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
elif 'Humidity_Sensors' in filename:
table_header = 'Date Time RH1[%] RH2[%] RH3[%] RH4[%] RH5[%] RH6[%] RH7[%] RH8[%] T1[°C] T2[°C] T3[°C] T4[°C] T5[°C] T6[°C] T7[°C] T8[°C] DP1[°C] DP2[°C] DP3[°C] DP4[°C] DP5[°C] DP6[°C] DP7[°C] DP8[°C]'
separator = '\t'
elif 'ICAD' in filename and 'HONO' in filename:
table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource'
separator = '\t'
elif 'ICAD' in filename and 'NO2' in filename:
table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource'
separator = '\t'
else:
return {}
#raise ValueError('intrument_folder must be set as a either "RGA" or "Pressure"')
@ -106,43 +130,31 @@ def read_txt_files_as_dict(filename : str ):
# Read header as a dictionary and detect where data table starts
header_dict = {}
data_start = False
# Work with copy of the file for safety
tmp_filename = make_file_copy(filename)
with open(filename,'r') as f:
file_encoding = f.encoding
table_preamble = ""
for line_number, line in enumerate(f):
list_of_substrings = line.split(separator)
if table_header in line:
if not (line == '\n'):
table_preamble += line.strip() #+ "\n"
if table_header in line:
data_start = True
column_names = []
#for i, name in enumerate(line.split('\t')):
for i, name in enumerate(list_of_substrings):
column_names.append(str(i)+'_'+name)
print(line_number, len(column_names ))
break
else:
# TODO: update to extract information from lines formed by more than two elements separated by '\t'
if list_of_substrings:
key, value = list_of_substrings[0], list_of_substrings[1::]
header_dict[key] = value
#if len(end_of_header) > 1 and any([item in line for item in end_of_header]):
# line_numbers.append(line_number)
#break
header_dict["table_preamble"] = table_preamble
if not data_start:
raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')
#if len(end_of_header) > 0:
# TODO: it does not work with separater as none :(. fix for RGA
try:
df = pd.read_csv(filename,
df = pd.read_csv(tmp_filename,
delimiter = separator,
header=line_number,
#encoding='latin-1',
@ -155,7 +167,7 @@ def read_txt_files_as_dict(filename : str ):
numerical_variables = [item for item in df_numerical_attrs.columns]
# TODO:
if 'Pressure' in filename:
if 'Pressure' in tmp_filename:
df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
df_categorical_attrs = df_categorical_attrs.drop(columns=['0_Date','1_Time'])
@ -166,7 +178,7 @@ def read_txt_files_as_dict(filename : str ):
###
file_dict = {}
path_tail, path_head = os.path.split(filename)
path_tail, path_head = os.path.split(tmp_filename)
file_dict['name'] = path_head
# TODO: review this header dictionary, it may not be the best way to represent header data

View File

@ -11,10 +11,10 @@ def read_txt_files_as_dict(filename : str ):
# Infer from filename whether txt file comes from smps or gas folder
#TODO: this may be prone to error if assumed folder structure is non compliant
if 'smps' in filename:
end_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
table_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
separator = '\t'
elif 'gas' in filename:
end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
table_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
separator = '\t'
else:
raise ValueError('intrument_folder must be set as a either "smps" or "gas"')
@ -24,9 +24,12 @@ def read_txt_files_as_dict(filename : str ):
data_start = False
with open(filename,'r') as f:
file_encoding = f.encoding
table_preamble = ""
for line_number, line in enumerate(f):
list_of_substrings = line.split(separator)
if end_of_header in line:
if not (line == '\n'):
table_preamble += line.strip() #+ "\n"
if table_of_header in line:
data_start = True
column_names = []
for i, name in enumerate(list_of_substrings):
@ -34,11 +37,8 @@ def read_txt_files_as_dict(filename : str ):
print(line_number, len(column_names ))
break
# TODO: update to extract information from lines formed by more than two elements separated by '\t'
if list_of_substrings:
key, value = list_of_substrings[0], list_of_substrings[1::]
header_dict[key] = value
header_dict["table_preamble"] = table_preamble
if not data_start:
raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')
@ -84,13 +84,23 @@ def read_txt_files_as_dict(filename : str ):
file_dict['datasets'].append(dataset)
rows,cols = dataset['shape']
# These lines were added to test the structured array functionality
tmp = [tuple(dataset['data'][i,:]) for i in range(dataset['shape'][0])]
dtype_tmp = [(numerical_variables[i],'f4') for i in range(dataset['shape'][1])]
data = np.array(tmp, dtype=dtype_tmp)
dataset['data'] = data
dataset['shape'] = dataset['data'].shape
dataset = {}
numerical_variables= [item.encode("utf-8") for item in numerical_variables]
dataset['name'] = 'numerical_variable_names'
dataset['data'] = np.array(numerical_variables).reshape((1,cols))
dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data'])
file_dict['datasets'].append(dataset)
file_dict['datasets'].append(dataset)
if 'timestamps' in categorical_variables:
dataset = {}