Replaced the attributes previously extracted from the table preamble in .txt and .dat files with a single dataset attribute called table_preamble that contains the whole table preamble.
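In short, the readers now accumulate the raw header lines into one string and attach it to the header dictionary under a single key, table_preamble, instead of splitting the preamble into many separate attributes. A minimal sketch of that pattern (standalone, with an illustrative table_header argument; not the exact repository code) looks like this:

def collect_preamble(path, table_header, separator=None):
    # Accumulate every header line into one string until the table header is found.
    header_dict = {}
    table_preamble = ""
    with open(path, 'r') as f:
        for line_number, line in enumerate(f):
            if table_header in line:
                # Store the whole preamble as a single attribute and report
                # the line where the data table starts.
                header_dict["table_preamble"] = table_preamble
                return header_dict, line_number
            if line != '\n':
                table_preamble += line.strip()
    raise ValueError('Table header not found; cannot extract table data.')
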
@@ -63,6 +63,22 @@ def read_xps_ibw_file_as_dict(filename):

    return file_dict

def make_file_copy(source_file_path):

    pathtail, filename = os.path.split(source_file_path)
    backup_filename = 'backup_'+ filename
    # Path
    ROOT_DIR = os.path.abspath(os.curdir)

    tmp_dirpath = os.path.join(ROOT_DIR,'tmp_files')
    if not os.path.exists(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    tmp_file_path = os.path.join(tmp_dirpath,backup_filename)
    shutil.copy(source_file_path, tmp_file_path)

    return tmp_file_path

def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_name):
    # Create copy of original file to avoid possible file corruption and work with it.

@@ -87,10 +103,9 @@ def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_n

def read_txt_files_as_dict(filename : str ):

    #if instrument_folder == 'smps':
    # Infer from filename whether txt file comes from smps or gas folder

    #TODO: this may be prone to error if assumed folder structure is non compliant
    if 'RGA' in filename:
    if 'RGA' in filename: #TODO: it does not work with separator as none :(. fix for RGA
        #end_of_header = 'Channel, Mass(amu), Name, Cal Factor, Noise Floor, CEM Status',
        table_header = 'Time(s) Channel#1 Channel#2 Channel#3 Channel#4 Channel#5 Channel#6 Channel#7 Channel#8'
        separator = None
@@ -99,6 +114,15 @@ def read_txt_files_as_dict(filename : str ):
        separator = '\t'
    #elif 'gas' in filename:
    # end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
    elif 'Humidity_Sensors' in filename:
        table_header = 'Date Time RH1[%] RH2[%] RH3[%] RH4[%] RH5[%] RH6[%] RH7[%] RH8[%] T1[°C] T2[°C] T3[°C] T4[°C] T5[°C] T6[°C] T7[°C] T8[°C] DP1[°C] DP2[°C] DP3[°C] DP4[°C] DP5[°C] DP6[°C] DP7[°C] DP8[°C]'
        separator = '\t'
    elif 'ICAD' in filename and 'HONO' in filename:
        table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource'
        separator = '\t'
    elif 'ICAD' in filename and 'NO2' in filename:
        table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource'
        separator = '\t'
    else:
        return {}
        #raise ValueError('intrument_folder must be set as a either "RGA" or "Pressure"')
@@ -106,43 +130,31 @@ def read_txt_files_as_dict(filename : str ):
    # Read header as a dictionary and detect where data table starts
    header_dict = {}
    data_start = False
    # Work with copy of the file for safety
    tmp_filename = make_file_copy(filename)

    with open(filename,'r') as f:
        file_encoding = f.encoding
        table_preamble = ""
        for line_number, line in enumerate(f):
            list_of_substrings = line.split(separator)
            if table_header in line:
            if not (line == '\n'):
                table_preamble += line.strip() #+ "\n"
            if table_header in line:
                data_start = True
                column_names = []
                #for i, name in enumerate(line.split('\t')):
                for i, name in enumerate(list_of_substrings):
                    column_names.append(str(i)+'_'+name)

                print(line_number, len(column_names ))

                break
            else:
                # TODO: update to extract information from lines formed by more than two elements separaed by '\t'
                if list_of_substrings:
                    key, value = list_of_substrings[0], list_of_substrings[1::]
                    header_dict[key] = value

            #if len(end_of_header) > 1 and any([item in line for item in end_of_header]):
            # line_numbers.append(line_number)
            #break
        header_dict["table_preamble"] = table_preamble

    if not data_start:
        raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')

    #if len(end_of_header) > 0:

    # TODO: it does not work with separater as none :(. fix for RGA
    try:
        df = pd.read_csv(filename,
        df = pd.read_csv(tmp_filename,
                         delimiter = separator,
                         header=line_number,
                         #encoding='latin-1',
@@ -155,7 +167,7 @@ def read_txt_files_as_dict(filename : str ):
    numerical_variables = [item for item in df_numerical_attrs.columns]

    # TODO:
    if 'Pressure' in filename:
    if 'Pressure' in tmp_filename:
        df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
        df_categorical_attrs = df_categorical_attrs.drop(columns=['0_Date','1_Time'])

@@ -166,7 +178,7 @@ def read_txt_files_as_dict(filename : str ):

    ###
    file_dict = {}
    path_tail, path_head = os.path.split(filename)
    path_tail, path_head = os.path.split(tmp_filename)

    file_dict['name'] = path_head
    # TODO: review this header dictionary, it may not be the best way to represent header data

@@ -11,10 +11,10 @@ def read_txt_files_as_dict(filename : str ):
    # Infer from filename whether txt file comes from smps or gas folder
    #TODO: this may be prone to error if assumed folder structure is non compliant
    if 'smps' in filename:
        end_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
        table_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
        separator = '\t'
    elif 'gas' in filename:
        end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
        table_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
        separator = '\t'
    else:
        raise ValueError('intrument_folder must be set as a either "smps" or "gas"')
@@ -24,9 +24,12 @@ def read_txt_files_as_dict(filename : str ):
    data_start = False
    with open(filename,'r') as f:
        file_encoding = f.encoding
        table_preamble = ""
        for line_number, line in enumerate(f):
            list_of_substrings = line.split(separator)
            if end_of_header in line:
            if not (line == '\n'):
                table_preamble += line.strip() #+ "\n"
            if table_of_header in line:
                data_start = True
                column_names = []
                for i, name in enumerate(list_of_substrings):
@@ -34,11 +37,8 @@ def read_txt_files_as_dict(filename : str ):

                print(line_number, len(column_names ))
                break

            # TODO: update to extract information from lines formed by more than two elements separaed by '\t'
            if list_of_substrings:
                key, value = list_of_substrings[0], list_of_substrings[1::]
                header_dict[key] = value

        header_dict["table_preamble"] = table_preamble

    if not data_start:
        raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')
@@ -84,13 +84,23 @@ def read_txt_files_as_dict(filename : str ):
    file_dict['datasets'].append(dataset)
    rows,cols = dataset['shape']

    # This lines were added to test the structured array functionality
    tmp = [tuple(dataset['data'][i,:]) for i in range(dataset['shape'][0])]
    dtype_tmp = [(numerical_variables[i],'f4') for i in range(dataset['shape'][1])]

    data = np.array(tmp, dtype=dtype_tmp)
    dataset['data'] = data
    dataset['shape'] = dataset['data'].shape

    dataset = {}
    numerical_variables= [item.encode("utf-8") for item in numerical_variables]
    dataset['name'] = 'numerical_variable_names'
    dataset['data'] = np.array(numerical_variables).reshape((1,cols))
    dataset['shape'] = dataset['data'].shape
    dataset['dtype'] = type(dataset['data'])
    file_dict['datasets'].append(dataset)
    file_dict['datasets'].append(dataset)

    if 'timestamps' in categorical_variables:
        dataset = {}
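The last hunk also shows the numeric block of the parsed table being repacked as a NumPy structured array before it is appended as a dataset. A small self-contained sketch of that conversion, using made-up column names and values rather than the repository's DataFrame, is:

import numpy as np

# Hypothetical numeric table: two columns, two rows.
numerical_variables = ['0_Pressure', '1_Temperature']
data = np.array([[1.0, 20.5],
                 [1.2, 21.0]])

# One tuple per row plus a per-column dtype yields a structured array
# whose fields can be addressed by column name.
tmp = [tuple(data[i, :]) for i in range(data.shape[0])]
dtype_tmp = [(name, 'f4') for name in numerical_variables]
structured = np.array(tmp, dtype=dtype_tmp)

print(structured['0_Pressure'])  # [1.  1.2]
print(structured.shape)          # (2,)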