Changed variable names, rearranged pieces of code, and set up data checks.
@@ -1,5 +1,5 @@
 import numpy as np
-import pandas as np
+import pandas as pd
 import matplotlib.pyplot as plt
 import plotly.express as px
 import plotly.graph_objects as go
@@ -26,12 +26,9 @@ def read_xps_ibw_file_as_dict(filename):
 
     file_dict = {}
     path_tail, path_head = os.path.split(filename)
-    file_dict['name'] = path_head
-    file_dict['data'] = file_obj['wave']['wData']
-    file_dict['data_units'] = file_obj['wave']['data_units']
-    file_dict['shape'] = file_dict['data'].shape
-    file_dict['dtype'] = type(file_dict['data'])
 
+    # Group name and attributes
+    file_dict['name'] = path_head
     file_dict['attributes_dict'] = {}
 
     # Convert notes of bytes class to string class and split string into a list of elements separated by '\r'.
@@ -48,6 +45,21 @@ def read_xps_ibw_file_as_dict(filename):
     dimension_labels = file_obj['wave']['dimension_units'].decode("utf-8").split(']')
     file_dict['attributes_dict']['dimension_units'] = [item+']' for item in dimension_labels[0:len(dimension_labels)-1]]
 
+    # Datasets and their attributes
+
+    file_dict['datasets'] = []
+
+    dataset = {}
+    dataset['name'] = 'spectrum'
+    dataset['data'] = file_obj['wave']['wData']
+    dataset['data_units'] = file_obj['wave']['data_units']
+    dataset['shape'] = dataset['data'].shape
+    dataset['dtype'] = type(dataset['data'])
+
+    # TODO: include energy axis dataset
+
+    file_dict['datasets'].append(dataset)
+
 
     return file_dict
 
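For orientation: after this hunk, read_xps_ibw_file_as_dict returns group-level metadata plus a list of dataset dicts instead of top-level 'data' keys. A minimal sketch of how a caller might persist that structure with h5py (the filenames and the loader behind file_obj are assumptions, not shown in this commit):

    import h5py

    file_dict = read_xps_ibw_file_as_dict('example.ibw')  # hypothetical input file
    with h5py.File('output.h5', 'w') as h5_file:
        group = h5_file.create_group(file_dict['name'])
        # dimension_units and the parsed notes become group attributes
        group.attrs.update(file_dict['attributes_dict'])
        for dataset in file_dict['datasets']:
            group.create_dataset(dataset['name'], data=dataset['data'])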
@@ -63,13 +75,152 @@ def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_n
     if not os.path.exists(tmp_dirpath):
         os.mkdir(tmp_dirpath)
 
-    shutil.copy(source_file_path, os.path.join(tmp_dirpath,backup_filename))
+    tmp_file_path = os.path.join(tmp_dirpath,backup_filename)
+    shutil.copy(source_file_path, tmp_file_path)
     # Open backup h5 file and copy complet filesystem directory onto a group in h5file
     with h5py.File(os.path.join(tmp_dirpath,backup_filename),'r') as src_file:
-        dest_file_obj.copy(source= src_file['/'], dest= dest_group_name +'/'+filename)
+        dest_file_obj.copy(source= src_file['/'], dest= dest_group_name)
 
+    if 'tmp_files' in tmp_file_path:
+        os.remove(tmp_file_path)
+
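The new guard only deletes the temporary copy when its path contains 'tmp_files', and the copy leaks if h5py raises before the cleanup runs. A sketch of the same copy-then-mount pattern using the standard tempfile module, which cleans up unconditionally (an alternative, not what the commit does):

    import os
    import shutil
    import tempfile

    import h5py

    def copy_file_in_group_via_tempdir(source_file_path, dest_file_obj, dest_group_name):
        # The directory and the backup copy inside it are removed automatically
        # when the with-block exits, even if an exception is raised.
        with tempfile.TemporaryDirectory() as tmp_dirpath:
            tmp_file_path = os.path.join(tmp_dirpath, os.path.basename(source_file_path))
            shutil.copy(source_file_path, tmp_file_path)
            with h5py.File(tmp_file_path, 'r') as src_file:
                dest_file_obj.copy(source=src_file['/'], dest=dest_group_name)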
+def read_txt_files_as_dict(filename : str ):
+
+    #if instrument_folder == 'smps':
+    # Infer from filename whether txt file comes from smps or gas folder
+    #TODO: this may be prone to error if assumed folder structure is non compliant
+    if 'RGA' in filename:
+        #end_of_header = 'Channel, Mass(amu), Name, Cal Factor, Noise Floor, CEM Status',
+        table_header = 'Time(s) Channel#1 Channel#2 Channel#3 Channel#4 Channel#5 Channel#6 Channel#7 Channel#8'
+        separator = None
+    elif 'Pressure' in filename:
+        table_header = 'Date Time Vapore-Pressure 1 in Vapore-Pressure 2 in Baratron 1 in Baratron 2 in Baratron 3 in Baratron 4 in Temp. Ice-Sample in Temp. Heated-Sample in Temp. Cooler 1 in Temp. Cooler 2 in Flow Gas 1 in Pressure Chamber in X in Y in Z in None in Temp. Sealing in Flow Ice-Sample in'
+        separator = '\t'
+    #elif 'gas' in filename:
+    #    end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
+    else:
+        return {}
+        #raise ValueError('intrument_folder must be set as a either "RGA" or "Pressure"')
+
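Since both branches only bind table_header and separator, the filename dispatch could be table-driven, making the next instrument a one-entry change. A hypothetical refactor, not part of the commit:

    # Keyword -> (table_header, separator); values mirror the branches above.
    TABLE_FORMATS = {
        'RGA': ('Time(s) Channel#1 Channel#2 Channel#3 Channel#4 Channel#5 Channel#6 Channel#7 Channel#8', None),
        'Pressure': ('Date Time Vapore-Pressure 1 in ...', '\t'),  # full header elided here
    }

    def detect_table_format(filename):
        for keyword, (table_header, separator) in TABLE_FORMATS.items():
            if keyword in filename:
                return table_header, separator
        return None  # caller returns {} for unrecognized files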
+    # Read header as a dictionary and detect where data table starts
+    header_dict = {}
+    data_start = False
+
+    with open(filename,'r') as f:
+        file_encoding = f.encoding
+        for line_number, line in enumerate(f):
+            list_of_substrings = line.split(separator)
+            if table_header in line:
+                data_start = True
+                column_names = []
+                #for i, name in enumerate(line.split('\t')):
+                for i, name in enumerate(list_of_substrings):
+                    column_names.append(str(i)+'_'+name)
+
+                print(line_number, len(column_names ))
+
+                break
+            else:
+                # TODO: update to extract information from lines formed by more than two elements separaed by '\t'
+                if list_of_substrings:
+                    key, value = list_of_substrings[0], list_of_substrings[1::]
+                    header_dict[key] = value
+
+                #if len(end_of_header) > 1 and any([item in line for item in end_of_header]):
+                #    line_numbers.append(line_number)
+                #break
+
+    if not data_start:
+        raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')
+
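A note on the separator = None case: str.split(None) splits on runs of arbitrary whitespace and returns an empty list for blank lines, which is what makes the new "if list_of_substrings:" guard work; split('\t') never yields an empty list. For example:

    'a  b\tc\n'.split(None)   # ['a', 'b', 'c']   whitespace runs collapsed
    'a  b\tc\n'.split('\t')   # ['a  b', 'c\n']
    '\n'.split(None)          # []                falsy: blank header lines are skipped
    '\n'.split('\t')          # ['\n']            truthy: would create a bogus header key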
+    #if len(end_of_header) > 0:
+
+    # TODO: it does not work with separater as none :(. fix for RGA
+    try:
+        df = pd.read_csv(filename,
+                         delimiter = separator,
+                         header=line_number,
+                         #encoding='latin-1',
+                         encoding = file_encoding,
+                         names=column_names,
+                         skip_blank_lines=True)
+
+        df_numerical_attrs = df.select_dtypes(include ='number')
+        df_categorical_attrs = df.select_dtypes(exclude='number')
+        numerical_variables = [item for item in df_numerical_attrs.columns]
+
+        # TODO:
+        if 'Pressure' in filename:
+            df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
+            df_categorical_attrs = df_categorical_attrs.drop(columns=['0_Date','1_Time'])
+
+        categorical_variables = [item for item in df_categorical_attrs.columns]
+        ####
+        #elif 'RGA' in filename:
+        #    df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Time(s)' : 'timestamps'})
+
+        ###
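On the TODO above: pandas interprets sep/delimiter None as "sniff the delimiter with the python engine", not as whitespace splitting the way str.split(None) does, which is why the RGA case misbehaves. One plausible fix, assuming RGA tables are whitespace-delimited (an assumption, not verified in this commit):

    # Hypothetical RGA fix: r'\s+' matches any run of whitespace, mimicking
    # str.split(None); pandas selects the python engine for regex separators.
    df = pd.read_csv(filename,
                     sep=r'\s+',
                     header=line_number,
                     encoding=file_encoding,
                     names=column_names,
                     skip_blank_lines=True)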
+        file_dict = {}
+        path_tail, path_head = os.path.split(filename)
+
+        file_dict['name'] = path_head
+        # TODO: review this header dictionary, it may not be the best way to represent header data
+        file_dict['attributes_dict'] = header_dict
+        file_dict['datasets'] = []
+        ####
+
+        if numerical_variables:
+            dataset = {}
+            dataset['name'] = 'numerical_variables'
+            dataset['data'] = df_numerical_attrs.to_numpy()
+            dataset['shape'] = dataset['data'].shape
+            dataset['dtype'] = type(dataset['data'])
+            #dataset['data_units'] = file_obj['wave']['data_units']
+            file_dict['datasets'].append(dataset)
+            rows,cols = dataset['shape']
+
+            dataset = {}
+            numerical_variables= [item.encode("utf-8") for item in numerical_variables]
+            dataset['name'] = 'numerical_variable_names'
+            dataset['data'] = np.array(numerical_variables).reshape((1,cols))
+            dataset['shape'] = dataset['data'].shape
+            dataset['dtype'] = type(dataset['data'])
+            file_dict['datasets'].append(dataset)
+
+        if 'timestamps' in categorical_variables:
+            dataset = {}
+            dataset['name'] = 'timestamps'
+            dataset['data'] = df_categorical_attrs['timestamps'].to_numpy().reshape((rows,1))
+            dataset['shape'] = dataset['data'].shape
+            dataset['dtype'] = type(dataset['data'])
+            file_dict['datasets'].append(dataset)
+            categorical_variables.remove('timestamps')
+
+        if categorical_variables:
+            dataset = {}
+            dataset['name'] = 'categorical_variables'
+            dataset['data'] = df_categorical_attrs.loc[:,categorical_variables].to_numpy()
+            dataset['shape'] = dataset['data'].shape
+            dataset['dtype'] = type(dataset['data'])
+            file_dict['datasets'].append(dataset)
+
+            dataset = {}
+            categorical_variables = [item.encode("utf-8") for item in categorical_variables]
+            dataset['name'] = 'categorial_variable_names'
+            dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables)))
+            dataset['shape'] = dataset['data'].shape
+            dataset['dtype'] = type(dataset['data'])
+            file_dict['datasets'].append(dataset)
+
+    except:
+        return {}
+
+    return file_dict
+
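The .encode("utf-8") passes before building the *_variable_names arrays exist because NumPy unicode ('U') arrays are not directly writable by h5py, while byte-string ('S') arrays map to fixed-length HDF5 strings. A quick illustration (assuming h5py is the eventual sink, as in copy_file_in_group):

    import numpy as np

    names = np.array([b'0_Date', b'1_Time']).reshape((1, 2))
    print(names.dtype)  # |S6: fixed-length bytes, accepted by h5py as-is
    # np.array(['0_Date', '1_Time']) would have dtype '<U6', which h5py
    # rejects unless it is converted to a string dtype first.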
 def main():
 
@@ -1,16 +1,21 @@
 
 import pandas as pd
+import numpy as np
 import matplotlib.pyplot as plt
 import os
 
-def read_txt_files_as_dict(filename : str ,instrument_folder : str):
-    if instrument_folder == 'smps':
+#def read_txt_files_as_dict(filename : str ,instrument_folder : str):
+def read_txt_files_as_dict(filename : str ):
+
+    #if instrument_folder == 'smps':
+    # Infer from filename whether txt file comes from smps or gas folder
+    #TODO: this may be prone to error if assumed folder structure is non compliant
+    if 'smps' in filename:
         end_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
-    elif instrument_folder == 'gas':
+        separator = '\t'
+    elif 'gas' in filename:
         end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
+        separator = '\t'
     else:
         raise ValueError('intrument_folder must be set as a either "smps" or "gas"')
 
@@ -18,53 +23,101 @@ def read_txt_files_as_dict(filename : str ,instrument_folder : str):
     header_dict = {}
     data_start = False
     with open(filename,'r') as f:
+        file_encoding = f.encoding
         for line_number, line in enumerate(f):
-            list = line.split('\t')
+            list_of_substrings = line.split(separator)
             if end_of_header in line:
                 data_start = True
                 column_names = []
-                for i, name in enumerate(line.split('\t')):
+                for i, name in enumerate(list_of_substrings):
                     column_names.append(str(i)+'_'+name)
 
                 print(line_number, len(column_names ))
                 break
 
             # TODO: update to extract information from lines formed by more than two elements separaed by '\t'
-            key, value = list[0], list[1::]
-            header_dict[key] = value
+            if list_of_substrings:
+                key, value = list_of_substrings[0], list_of_substrings[1::]
+                header_dict[key] = value
 
     if not data_start:
-        raise ValueError('file appears to be invalid. Data start condition in txt file was not met.')
+        raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')
 
     df = pd.read_csv(filename,
-                     delimiter = "\t",
+                     delimiter = separator,
                      header=line_number,
                      #encoding='latin-1',
-                     encoding='latin-1',
+                     encoding= file_encoding,
                      names=column_names,
                      skip_blank_lines=True)
 
     df_numerical_attrs = df.select_dtypes(include ='number')
     df_categorical_attrs = df.select_dtypes(exclude='number')
 
-    if instrument_folder == 'smps':
-        df_categorical_attrs['1_Timestamp'] = [ df_categorical_attrs.loc[i,'1_Date']+' '+df_categorical_attrs.loc[i,'2_Start Time'] for i in df.index]
+    if 'smps' in filename:
+        df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'1_Date']+' '+df_categorical_attrs.loc[i,'2_Start Time'] for i in df.index]
         df_categorical_attrs = df_categorical_attrs.drop(columns=['1_Date','2_Start Time'])
-    elif instrument_folder == 'gas':
-        df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Date_Time' : '0_Timestamp'})
+    elif 'gas' in filename:
+        df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Date_Time' : 'timestamps'})
 
-    data_column_names = [item.encode("utf-8") for item in df_numerical_attrs.columns]
-    output_dict = { 'header_dict':header_dict,
-                    'data': df_numerical_attrs.to_numpy(),
-                    'data_column_names':data_column_names,
-                    'categ_data_dict':df_categorical_attrs.to_dict(orient='list')
-                  }
+    #data_column_names = [item.encode("utf-8") for item in df_numerical_attrs.columns]
+    numerical_variables = [item for item in df_numerical_attrs.columns]
+    categorical_variables = [item for item in df_categorical_attrs.columns]
 
-    #output_dict = {'header_dict':header_dict,
-    # 'num_data_df':df_numerical_attrs.to_numpy(),
-    # 'categ_data_df':df_categorical_attrs.to_dict(orient='list')}
+    ###
+    file_dict = {}
+    path_tail, path_head = os.path.split(filename)
 
-    return output_dict
+    file_dict['name'] = path_head
+    # TODO: review this header dictionary, it may not be the best way to represent header data
+    file_dict['attributes_dict'] = header_dict
+    file_dict['datasets'] = []
+    ####
+
+    if numerical_variables:
+        dataset = {}
+        dataset['name'] = 'numerical_variables'
+        dataset['data'] = df_numerical_attrs.to_numpy()
+        dataset['shape'] = dataset['data'].shape
+        dataset['dtype'] = type(dataset['data'])
+        #dataset['data_units'] = file_obj['wave']['data_units']
+        file_dict['datasets'].append(dataset)
+        rows,cols = dataset['shape']
+
+        dataset = {}
+        numerical_variables= [item.encode("utf-8") for item in numerical_variables]
+        dataset['name'] = 'numerical_variable_names'
+        dataset['data'] = np.array(numerical_variables).reshape((1,cols))
+        dataset['shape'] = dataset['data'].shape
+        dataset['dtype'] = type(dataset['data'])
+        file_dict['datasets'].append(dataset)
+
+    if 'timestamps' in categorical_variables:
+        dataset = {}
+        dataset['name'] = 'timestamps'
+        dataset['data'] = df_categorical_attrs['timestamps'].to_numpy().reshape((rows,1))
+        dataset['shape'] = dataset['data'].shape
+        dataset['dtype'] = type(dataset['data'])
+        file_dict['datasets'].append(dataset)
+        categorical_variables.remove('timestamps')
+
+    if categorical_variables:
+        dataset = {}
+        dataset['name'] = 'categorical_variables'
+        dataset['data'] = df_categorical_attrs.loc[:,categorical_variables].to_numpy()
+        dataset['shape'] = dataset['data'].shape
+        dataset['dtype'] = type(dataset['data'])
+        file_dict['datasets'].append(dataset)
+
+        dataset = {}
+        categorical_variables = [item.encode("utf-8") for item in categorical_variables]
+        dataset['name'] = 'categorial_variable_names'
+        dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables)))
+        dataset['shape'] = dataset['data'].shape
+        dataset['dtype'] = type(dataset['data'])
+        file_dict['datasets'].append(dataset)
+
+    return file_dict
 
 def main():
 
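Both readers now return the same file_dict shape (name, attributes_dict, datasets), so downstream code can consume txt and ibw files uniformly. A usage sketch (the path is hypothetical, and the dataset names depend on which columns the detected table header yields):

    file_dict = read_txt_files_as_dict('smps/example.txt')  # hypothetical path
    if file_dict:  # the RGA/Pressure variant returns {} for unrecognized files
        print(file_dict['name'])
        for dataset in file_dict['datasets']:
            # e.g. 'numerical_variables', 'numerical_variable_names',
            # 'timestamps', 'categorical_variables', ...
            print(dataset['name'], dataset['shape'])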