Replaced the attributes previously extracted from the table_preamble in .txt and .dat files with a single dataset attribute, called table_preamble, that contains the whole table preamble.
This commit is contained in:
@ -63,6 +63,22 @@ def read_xps_ibw_file_as_dict(filename):
|
|||||||
|
|
||||||
return file_dict
|
return file_dict
|
||||||
|
|
||||||
|
def make_file_copy(source_file_path):
    """Copy a file into a ``tmp_files`` folder under the current working directory.

    The copy is named ``backup_<original file name>``. Only the base name of
    ``source_file_path`` is used to build the backup name, so two source files
    with the same base name share (and overwrite) the same backup file.

    Parameters
    ----------
    source_file_path
        Path of the file to copy.

    Returns
    -------
    str
        Absolute path of the backup copy inside ``tmp_files``.
    """
    # Only the file name is needed; the directory part of the path is discarded.
    _, filename = os.path.split(source_file_path)
    backup_filename = 'backup_' + filename

    # The backup folder is resolved against the working directory at call time.
    root_dir = os.path.abspath(os.curdir)
    tmp_dirpath = os.path.join(root_dir, 'tmp_files')

    # exist_ok=True replaces the former exists()/mkdir() pair, which could fail
    # if the folder was created between the check and the mkdir call.
    os.makedirs(tmp_dirpath, exist_ok=True)

    tmp_file_path = os.path.join(tmp_dirpath, backup_filename)
    shutil.copy(source_file_path, tmp_file_path)

    return tmp_file_path
|
||||||
|
|
||||||
def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_name):
|
def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_name):
|
||||||
# Create copy of original file to avoid possible file corruption and work with it.
|
# Create copy of original file to avoid possible file corruption and work with it.
|
||||||
|
|
||||||
@ -87,10 +103,9 @@ def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_n
|
|||||||
|
|
||||||
def read_txt_files_as_dict(filename : str ):
|
def read_txt_files_as_dict(filename : str ):
|
||||||
|
|
||||||
#if instrument_folder == 'smps':
|
|
||||||
# Infer from filename whether txt file comes from smps or gas folder
|
|
||||||
#TODO: this may be prone to error if assumed folder structure is non compliant
|
#TODO: this may be prone to error if assumed folder structure is non compliant
|
||||||
if 'RGA' in filename:
|
if 'RGA' in filename: #TODO: it does not work with separator as none :(. fix for RGA
|
||||||
#end_of_header = 'Channel, Mass(amu), Name, Cal Factor, Noise Floor, CEM Status',
|
#end_of_header = 'Channel, Mass(amu), Name, Cal Factor, Noise Floor, CEM Status',
|
||||||
table_header = 'Time(s) Channel#1 Channel#2 Channel#3 Channel#4 Channel#5 Channel#6 Channel#7 Channel#8'
|
table_header = 'Time(s) Channel#1 Channel#2 Channel#3 Channel#4 Channel#5 Channel#6 Channel#7 Channel#8'
|
||||||
separator = None
|
separator = None
|
||||||
@ -99,6 +114,15 @@ def read_txt_files_as_dict(filename : str ):
|
|||||||
separator = '\t'
|
separator = '\t'
|
||||||
#elif 'gas' in filename:
|
#elif 'gas' in filename:
|
||||||
# end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
|
# end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
|
||||||
|
elif 'Humidity_Sensors' in filename:
|
||||||
|
table_header = 'Date Time RH1[%] RH2[%] RH3[%] RH4[%] RH5[%] RH6[%] RH7[%] RH8[%] T1[°C] T2[°C] T3[°C] T4[°C] T5[°C] T6[°C] T7[°C] T8[°C] DP1[°C] DP2[°C] DP3[°C] DP4[°C] DP5[°C] DP6[°C] DP7[°C] DP8[°C]'
|
||||||
|
separator = '\t'
|
||||||
|
elif 'ICAD' in filename and 'HONO' in filename:
|
||||||
|
table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource'
|
||||||
|
separator = '\t'
|
||||||
|
elif 'ICAD' in filename and 'NO2' in filename:
|
||||||
|
table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource'
|
||||||
|
separator = '\t'
|
||||||
else:
|
else:
|
||||||
return {}
|
return {}
|
||||||
#raise ValueError('intrument_folder must be set as a either "RGA" or "Pressure"')
|
#raise ValueError('intrument_folder must be set as a either "RGA" or "Pressure"')
|
||||||
@ -106,43 +130,31 @@ def read_txt_files_as_dict(filename : str ):
|
|||||||
# Read header as a dictionary and detect where data table starts
|
# Read header as a dictionary and detect where data table starts
|
||||||
header_dict = {}
|
header_dict = {}
|
||||||
data_start = False
|
data_start = False
|
||||||
|
# Work with copy of the file for safety
|
||||||
|
tmp_filename = make_file_copy(filename)
|
||||||
|
|
||||||
with open(filename,'r') as f:
|
with open(filename,'r') as f:
|
||||||
file_encoding = f.encoding
|
file_encoding = f.encoding
|
||||||
|
table_preamble = ""
|
||||||
for line_number, line in enumerate(f):
|
for line_number, line in enumerate(f):
|
||||||
list_of_substrings = line.split(separator)
|
list_of_substrings = line.split(separator)
|
||||||
if table_header in line:
|
if not (line == '\n'):
|
||||||
|
table_preamble += line.strip() #+ "\n"
|
||||||
|
if table_header in line:
|
||||||
data_start = True
|
data_start = True
|
||||||
column_names = []
|
column_names = []
|
||||||
#for i, name in enumerate(line.split('\t')):
|
|
||||||
for i, name in enumerate(list_of_substrings):
|
for i, name in enumerate(list_of_substrings):
|
||||||
column_names.append(str(i)+'_'+name)
|
column_names.append(str(i)+'_'+name)
|
||||||
|
|
||||||
print(line_number, len(column_names ))
|
print(line_number, len(column_names ))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
break
|
break
|
||||||
else:
|
|
||||||
# TODO: update to extract information from lines formed by more than two elements separaed by '\t'
|
|
||||||
if list_of_substrings:
|
|
||||||
key, value = list_of_substrings[0], list_of_substrings[1::]
|
|
||||||
header_dict[key] = value
|
|
||||||
|
|
||||||
#if len(end_of_header) > 1 and any([item in line for item in end_of_header]):
|
header_dict["table_preamble"] = table_preamble
|
||||||
# line_numbers.append(line_number)
|
|
||||||
#break
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if not data_start:
|
|
||||||
raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')
|
|
||||||
|
|
||||||
#if len(end_of_header) > 0:
|
|
||||||
|
|
||||||
# TODO: it does not work with separater as none :(. fix for RGA
|
# TODO: it does not work with separater as none :(. fix for RGA
|
||||||
try:
|
try:
|
||||||
df = pd.read_csv(filename,
|
df = pd.read_csv(tmp_filename,
|
||||||
delimiter = separator,
|
delimiter = separator,
|
||||||
header=line_number,
|
header=line_number,
|
||||||
#encoding='latin-1',
|
#encoding='latin-1',
|
||||||
@ -155,7 +167,7 @@ def read_txt_files_as_dict(filename : str ):
|
|||||||
numerical_variables = [item for item in df_numerical_attrs.columns]
|
numerical_variables = [item for item in df_numerical_attrs.columns]
|
||||||
|
|
||||||
# TODO:
|
# TODO:
|
||||||
if 'Pressure' in filename:
|
if 'Pressure' in tmp_filename:
|
||||||
df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
|
df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
|
||||||
df_categorical_attrs = df_categorical_attrs.drop(columns=['0_Date','1_Time'])
|
df_categorical_attrs = df_categorical_attrs.drop(columns=['0_Date','1_Time'])
|
||||||
|
|
||||||
@ -166,7 +178,7 @@ def read_txt_files_as_dict(filename : str ):
|
|||||||
|
|
||||||
###
|
###
|
||||||
file_dict = {}
|
file_dict = {}
|
||||||
path_tail, path_head = os.path.split(filename)
|
path_tail, path_head = os.path.split(tmp_filename)
|
||||||
|
|
||||||
file_dict['name'] = path_head
|
file_dict['name'] = path_head
|
||||||
# TODO: review this header dictionary, it may not be the best way to represent header data
|
# TODO: review this header dictionary, it may not be the best way to represent header data
|
||||||
|
@ -11,10 +11,10 @@ def read_txt_files_as_dict(filename : str ):
|
|||||||
# Infer from filename whether txt file comes from smps or gas folder
|
# Infer from filename whether txt file comes from smps or gas folder
|
||||||
#TODO: this may be prone to error if assumed folder structure is non compliant
|
#TODO: this may be prone to error if assumed folder structure is non compliant
|
||||||
if 'smps' in filename:
|
if 'smps' in filename:
|
||||||
end_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
|
table_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
|
||||||
separator = '\t'
|
separator = '\t'
|
||||||
elif 'gas' in filename:
|
elif 'gas' in filename:
|
||||||
end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
|
table_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
|
||||||
separator = '\t'
|
separator = '\t'
|
||||||
else:
|
else:
|
||||||
raise ValueError('intrument_folder must be set as a either "smps" or "gas"')
|
raise ValueError('intrument_folder must be set as a either "smps" or "gas"')
|
||||||
@ -24,9 +24,12 @@ def read_txt_files_as_dict(filename : str ):
|
|||||||
data_start = False
|
data_start = False
|
||||||
with open(filename,'r') as f:
|
with open(filename,'r') as f:
|
||||||
file_encoding = f.encoding
|
file_encoding = f.encoding
|
||||||
|
table_preamble = ""
|
||||||
for line_number, line in enumerate(f):
|
for line_number, line in enumerate(f):
|
||||||
list_of_substrings = line.split(separator)
|
list_of_substrings = line.split(separator)
|
||||||
if end_of_header in line:
|
if not (line == '\n'):
|
||||||
|
table_preamble += line.strip() #+ "\n"
|
||||||
|
if table_of_header in line:
|
||||||
data_start = True
|
data_start = True
|
||||||
column_names = []
|
column_names = []
|
||||||
for i, name in enumerate(list_of_substrings):
|
for i, name in enumerate(list_of_substrings):
|
||||||
@ -34,11 +37,8 @@ def read_txt_files_as_dict(filename : str ):
|
|||||||
|
|
||||||
print(line_number, len(column_names ))
|
print(line_number, len(column_names ))
|
||||||
break
|
break
|
||||||
|
|
||||||
# TODO: update to extract information from lines formed by more than two elements separaed by '\t'
|
header_dict["table_preamble"] = table_preamble
|
||||||
if list_of_substrings:
|
|
||||||
key, value = list_of_substrings[0], list_of_substrings[1::]
|
|
||||||
header_dict[key] = value
|
|
||||||
|
|
||||||
if not data_start:
|
if not data_start:
|
||||||
raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')
|
raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')
|
||||||
@ -84,13 +84,23 @@ def read_txt_files_as_dict(filename : str ):
|
|||||||
file_dict['datasets'].append(dataset)
|
file_dict['datasets'].append(dataset)
|
||||||
rows,cols = dataset['shape']
|
rows,cols = dataset['shape']
|
||||||
|
|
||||||
|
# This lines were added to test the structured array functionality
|
||||||
|
tmp = [tuple(dataset['data'][i,:]) for i in range(dataset['shape'][0])]
|
||||||
|
dtype_tmp = [(numerical_variables[i],'f4') for i in range(dataset['shape'][1])]
|
||||||
|
|
||||||
|
data = np.array(tmp, dtype=dtype_tmp)
|
||||||
|
dataset['data'] = data
|
||||||
|
dataset['shape'] = dataset['data'].shape
|
||||||
|
|
||||||
dataset = {}
|
dataset = {}
|
||||||
numerical_variables= [item.encode("utf-8") for item in numerical_variables]
|
numerical_variables= [item.encode("utf-8") for item in numerical_variables]
|
||||||
dataset['name'] = 'numerical_variable_names'
|
dataset['name'] = 'numerical_variable_names'
|
||||||
dataset['data'] = np.array(numerical_variables).reshape((1,cols))
|
dataset['data'] = np.array(numerical_variables).reshape((1,cols))
|
||||||
dataset['shape'] = dataset['data'].shape
|
dataset['shape'] = dataset['data'].shape
|
||||||
dataset['dtype'] = type(dataset['data'])
|
dataset['dtype'] = type(dataset['data'])
|
||||||
file_dict['datasets'].append(dataset)
|
file_dict['datasets'].append(dataset)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if 'timestamps' in categorical_variables:
|
if 'timestamps' in categorical_variables:
|
||||||
dataset = {}
|
dataset = {}
|
||||||
|
Reference in New Issue
Block a user