Renamed variables, rearranged code, and added data validation checks.

This commit is contained in:
2024-02-21 10:41:57 +01:00
parent 1a4294e0c2
commit 219435511b
2 changed files with 238 additions and 34 deletions

View File

@ -1,5 +1,5 @@
import numpy as np
import pandas as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
@ -26,12 +26,9 @@ def read_xps_ibw_file_as_dict(filename):
file_dict = {}
path_tail, path_head = os.path.split(filename)
file_dict['name'] = path_head
file_dict['data'] = file_obj['wave']['wData']
file_dict['data_units'] = file_obj['wave']['data_units']
file_dict['shape'] = file_dict['data'].shape
file_dict['dtype'] = type(file_dict['data'])
# Group name and attributes
file_dict['name'] = path_head
file_dict['attributes_dict'] = {}
# Convert notes of bytes class to string class and split string into a list of elements separated by '\r'.
@ -47,6 +44,21 @@ def read_xps_ibw_file_as_dict(filename):
# TODO: talk to Thorsten to see if there is an easier way to access the below attributes
dimension_labels = file_obj['wave']['dimension_units'].decode("utf-8").split(']')
file_dict['attributes_dict']['dimension_units'] = [item+']' for item in dimension_labels[0:len(dimension_labels)-1]]
# Datasets and their attributes
file_dict['datasets'] = []
dataset = {}
dataset['name'] = 'spectrum'
dataset['data'] = file_obj['wave']['wData']
dataset['data_units'] = file_obj['wave']['data_units']
dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data'])
# TODO: include energy axis dataset
file_dict['datasets'].append(dataset)
return file_dict
@ -63,13 +75,152 @@ def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_n
if not os.path.exists(tmp_dirpath):
os.mkdir(tmp_dirpath)
shutil.copy(source_file_path, os.path.join(tmp_dirpath,backup_filename))
tmp_file_path = os.path.join(tmp_dirpath,backup_filename)
shutil.copy(source_file_path, tmp_file_path)
# Open the backup h5 file and copy the complete filesystem directory onto a group in the h5 file
with h5py.File(os.path.join(tmp_dirpath,backup_filename),'r') as src_file:
dest_file_obj.copy(source= src_file['/'], dest= dest_group_name +'/'+filename)
dest_file_obj.copy(source= src_file['/'], dest= dest_group_name)
if 'tmp_files' in tmp_file_path:
os.remove(tmp_file_path)
def read_txt_files_as_dict(filename : str ):
#if instrument_folder == 'smps':
# Infer from filename whether txt file comes from smps or gas folder
#TODO: this may be prone to error if assumed folder structure is non compliant
if 'RGA' in filename:
#end_of_header = 'Channel, Mass(amu), Name, Cal Factor, Noise Floor, CEM Status',
table_header = 'Time(s) Channel#1 Channel#2 Channel#3 Channel#4 Channel#5 Channel#6 Channel#7 Channel#8'
separator = None
elif 'Pressure' in filename:
table_header = 'Date Time Vapore-Pressure 1 in Vapore-Pressure 2 in Baratron 1 in Baratron 2 in Baratron 3 in Baratron 4 in Temp. Ice-Sample in Temp. Heated-Sample in Temp. Cooler 1 in Temp. Cooler 2 in Flow Gas 1 in Pressure Chamber in X in Y in Z in None in Temp. Sealing in Flow Ice-Sample in'
separator = '\t'
#elif 'gas' in filename:
# end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
else:
return {}
#raise ValueError('intrument_folder must be set as a either "RGA" or "Pressure"')
# Read header as a dictionary and detect where data table starts
header_dict = {}
data_start = False
with open(filename,'r') as f:
file_encoding = f.encoding
for line_number, line in enumerate(f):
list_of_substrings = line.split(separator)
if table_header in line:
data_start = True
column_names = []
#for i, name in enumerate(line.split('\t')):
for i, name in enumerate(list_of_substrings):
column_names.append(str(i)+'_'+name)
print(line_number, len(column_names ))
break
else:
# TODO: update to extract information from lines formed by more than two elements separaed by '\t'
if list_of_substrings:
key, value = list_of_substrings[0], list_of_substrings[1::]
header_dict[key] = value
#if len(end_of_header) > 1 and any([item in line for item in end_of_header]):
# line_numbers.append(line_number)
#break
if not data_start:
raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')
#if len(end_of_header) > 0:
# TODO: it does not work with separater as none :(. fix for RGA
try:
df = pd.read_csv(filename,
delimiter = separator,
header=line_number,
#encoding='latin-1',
encoding = file_encoding,
names=column_names,
skip_blank_lines=True)
df_numerical_attrs = df.select_dtypes(include ='number')
df_categorical_attrs = df.select_dtypes(exclude='number')
numerical_variables = [item for item in df_numerical_attrs.columns]
# TODO:
if 'Pressure' in filename:
df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
df_categorical_attrs = df_categorical_attrs.drop(columns=['0_Date','1_Time'])
categorical_variables = [item for item in df_categorical_attrs.columns]
####
#elif 'RGA' in filename:
# df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Time(s)' : 'timestamps'})
###
file_dict = {}
path_tail, path_head = os.path.split(filename)
file_dict['name'] = path_head
# TODO: review this header dictionary, it may not be the best way to represent header data
file_dict['attributes_dict'] = header_dict
file_dict['datasets'] = []
####
if numerical_variables:
dataset = {}
dataset['name'] = 'numerical_variables'
dataset['data'] = df_numerical_attrs.to_numpy()
dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data'])
#dataset['data_units'] = file_obj['wave']['data_units']
file_dict['datasets'].append(dataset)
rows,cols = dataset['shape']
dataset = {}
numerical_variables= [item.encode("utf-8") for item in numerical_variables]
dataset['name'] = 'numerical_variable_names'
dataset['data'] = np.array(numerical_variables).reshape((1,cols))
dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data'])
file_dict['datasets'].append(dataset)
if 'timestamps' in categorical_variables:
dataset = {}
dataset['name'] = 'timestamps'
dataset['data'] = df_categorical_attrs['timestamps'].to_numpy().reshape((rows,1))
dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data'])
file_dict['datasets'].append(dataset)
categorical_variables.remove('timestamps')
if categorical_variables:
dataset = {}
dataset['name'] = 'categorical_variables'
dataset['data'] = df_categorical_attrs.loc[:,categorical_variables].to_numpy()
dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data'])
file_dict['datasets'].append(dataset)
dataset = {}
categorical_variables = [item.encode("utf-8") for item in categorical_variables]
dataset['name'] = 'categorial_variable_names'
dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables)))
dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data'])
file_dict['datasets'].append(dataset)
except:
return {}
return file_dict
def main():

View File

@ -1,16 +1,21 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
def read_txt_files_as_dict(filename : str ):
    """Read an smps or gas text file into a standardized dict.

    The instrument type is inferred from the filename ('smps' or 'gas').

    Returns
    -------
    dict
        Keys: 'name' (basename of the file), 'attributes_dict' (header
        key/value pairs) and 'datasets' (list of dicts with 'name', 'data',
        'shape', 'dtype').

    Raises
    ------
    ValueError
        If the filename matches neither instrument, or if the expected
        table header line is not found in the file.
    """
    # Infer from filename whether the txt file comes from smps or gas folder.
    # TODO: this may be prone to error if assumed folder structure is non compliant
    if 'smps' in filename:
        end_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
        separator = '\t'
    elif 'gas' in filename:
        end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
        separator = '\t'
    else:
        # Message updated: the instrument_folder parameter was removed, the
        # instrument is now inferred from the filename.
        raise ValueError('filename must contain either "smps" or "gas"')

    # Read the file header into a dict and detect where the data table starts.
    header_dict = {}
    data_start = False
    with open(filename, 'r') as f:
        file_encoding = f.encoding
        for line_number, line in enumerate(f):
            list_of_substrings = line.split(separator)
            if end_of_header in line:
                data_start = True
                column_names = [str(i) + '_' + name
                                for i, name in enumerate(list_of_substrings)]
                break
            # TODO: update to extract information from lines formed by more
            # than two elements separated by '\t'
            if list_of_substrings:
                key, value = list_of_substrings[0], list_of_substrings[1:]
                header_dict[key] = value
    if not data_start:
        raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')

    df = pd.read_csv(filename,
                     delimiter=separator,
                     header=line_number,
                     encoding=file_encoding,
                     names=column_names,
                     skip_blank_lines=True)

    df_numerical_attrs = df.select_dtypes(include='number')
    df_categorical_attrs = df.select_dtypes(exclude='number')

    # Number of table rows; bound up front because the timestamps dataset
    # below needs it even when there are no numerical columns (previously
    # rows was only set inside the numerical branch).
    rows = df.shape[0]

    if 'smps' in filename:
        # Merge the separate Date and Start Time columns into one timestamp column.
        df_categorical_attrs['timestamps'] = [
            df_categorical_attrs.loc[i, '1_Date'] + ' ' + df_categorical_attrs.loc[i, '2_Start Time']
            for i in df.index]
        df_categorical_attrs = df_categorical_attrs.drop(columns=['1_Date', '2_Start Time'])
    elif 'gas' in filename:
        df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Date_Time' : 'timestamps'})

    numerical_variables = list(df_numerical_attrs.columns)
    categorical_variables = list(df_categorical_attrs.columns)

    file_dict = {}
    path_tail, path_head = os.path.split(filename)
    file_dict['name'] = path_head
    # TODO: review this header dictionary, it may not be the best way to represent header data
    file_dict['attributes_dict'] = header_dict
    file_dict['datasets'] = []

    if numerical_variables:
        dataset = {}
        dataset['name'] = 'numerical_variables'
        dataset['data'] = df_numerical_attrs.to_numpy()
        dataset['shape'] = dataset['data'].shape
        # NOTE: this records the Python container type (numpy.ndarray),
        # not the array's element dtype.
        dataset['dtype'] = type(dataset['data'])
        file_dict['datasets'].append(dataset)

        cols = dataset['shape'][1]
        dataset = {}
        dataset['name'] = 'numerical_variable_names'
        dataset['data'] = np.array(
            [item.encode("utf-8") for item in numerical_variables]).reshape((1, cols))
        dataset['shape'] = dataset['data'].shape
        dataset['dtype'] = type(dataset['data'])
        file_dict['datasets'].append(dataset)

    if 'timestamps' in categorical_variables:
        dataset = {}
        dataset['name'] = 'timestamps'
        dataset['data'] = df_categorical_attrs['timestamps'].to_numpy().reshape((rows, 1))
        dataset['shape'] = dataset['data'].shape
        dataset['dtype'] = type(dataset['data'])
        file_dict['datasets'].append(dataset)
        categorical_variables.remove('timestamps')

    if categorical_variables:
        dataset = {}
        dataset['name'] = 'categorical_variables'
        dataset['data'] = df_categorical_attrs.loc[:, categorical_variables].to_numpy()
        dataset['shape'] = dataset['data'].shape
        dataset['dtype'] = type(dataset['data'])
        file_dict['datasets'].append(dataset)

        dataset = {}
        encoded_names = [item.encode("utf-8") for item in categorical_variables]
        # NOTE: 'categorial' spelling kept for backward compatibility with
        # existing consumers of this dict.
        dataset['name'] = 'categorial_variable_names'
        dataset['data'] = np.array(encoded_names).reshape((1, len(encoded_names)))
        dataset['shape'] = dataset['data'].shape
        dataset['dtype'] = type(dataset['data'])
        file_dict['datasets'].append(dataset)

    return file_dict
def main():