Synch with remote repo
@@ -1,223 +1,223 @@

import sys
import os
import pandas as pd
import collections
import yaml

#root_dir = os.path.abspath(os.curdir)
#sys.path.append(root_dir)
import utils.g5505_utils as utils


def read_acsm_files_as_dict(filename: str, instruments_dir: str = None, work_with_copy: bool = True):
    # If instruments_dir is not provided, use the default path relative to the module directory
    if not instruments_dir:
        # Assuming the instruments folder is one level up from the source module directory
        module_dir = os.path.dirname(__file__)
        instruments_dir = os.path.join(module_dir, '..')

    # Normalize the path (resolves any '..' in the path)
    instrument_configs_path = os.path.abspath(os.path.join(instruments_dir, 'dictionaries', 'ACSM_TOFWARE.yaml'))

    with open(instrument_configs_path, 'r') as stream:
        try:
            config_dict = yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            print(exc)
    # Verify whether the file can be read by the available instrument configurations.
    #if not any(key in filename.replace(os.sep,'/') for key in config_dict.keys()):
    #    return {}

    # TODO: this may be prone to error if the assumed folder structure is not compliant

    description_dict = config_dict.get('table_header', {})

    file_encoding = config_dict['config_text_reader'].get('file_encoding', 'utf-8')
    separator = config_dict['config_text_reader'].get('separator', None)
    table_header = config_dict['config_text_reader'].get('table_header', None)
    timestamp_variables = config_dict['config_text_reader'].get('timestamp', [])
    datetime_format = config_dict['config_text_reader'].get('datetime_format', [])

    # Read the header as a dictionary and detect where the data table starts
    header_dict = {}
    data_start = False
    # Work with a copy of the file for safety
    if work_with_copy:
        tmp_filename = utils.make_file_copy(source_file_path=filename)
    else:
        tmp_filename = filename

    if not isinstance(table_header, list):
        table_header = [table_header]
        file_encoding = [file_encoding]
        separator = [separator]

    with open(tmp_filename, 'rb') as f:
        table_preamble = []
        for line_number, line in enumerate(f):

            for tb_idx, tb in enumerate(table_header):
                if tb in line.decode(file_encoding[tb_idx]):
                    break

            if tb in line.decode(file_encoding[tb_idx]):
                list_of_substrings = line.decode(file_encoding[tb_idx]).split(separator[tb_idx].replace('\\t', '\t'))

                # Count occurrences of each substring
                substring_counts = collections.Counter(list_of_substrings)
                data_start = True
                # Generate column names, appending an index only for repeated substrings
                column_names = [f"{i}_{name.strip()}" if substring_counts[name] > 1 else name.strip() for i, name in enumerate(list_of_substrings)]

                #column_names = [str(i)+'_'+name.strip() for i, name in enumerate(list_of_substrings)]
                #column_names = []
                #for i, name in enumerate(list_of_substrings):
                #    column_names.append(str(i)+'_'+name)

                #print(line_number, len(column_names), '\n')
                break
            # Subdivide the line into words and join them with a single space.
            # I assume this produces a cleaner line that contains no stray separator characters (\t, \r) or extra spaces.
            list_of_substrings = line.decode(file_encoding[tb_idx]).split()
            # TODO: ideally we should use a multiline string, but the YAML parser does not recognize \n as a special character
            #line = ' '.join(list_of_substrings+['\n'])
            #line = ' '.join(list_of_substrings)
            table_preamble.append(' '.join([item for item in list_of_substrings]))  # += new_line

    # TODO: it does not work with separator as None; fix for RGA
    try:
        df = pd.read_csv(tmp_filename,
                         delimiter=separator[tb_idx].replace('\\t', '\t'),
                         header=line_number,
                         #encoding='latin-1',
                         encoding=file_encoding[tb_idx],
                         names=column_names,
                         skip_blank_lines=True)

        df_numerical_attrs = df.select_dtypes(include='number')
        df_categorical_attrs = df.select_dtypes(exclude='number')
        numerical_variables = [item for item in df_numerical_attrs.columns]

        # Consolidate the separate 'date' and 'time' columns specified in text_data_source.yaml into a single timestamp column
        if timestamp_variables:
            #df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index]
            #df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]

            #df_categorical_attrs['timestamps'] = df_categorical_attrs[timestamp_variables].astype(str).agg(' '.join, axis=1)
            timestamps_name = ' '.join(timestamp_variables)
            df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamp_variables].astype(str).agg(' '.join, axis=1)

            valid_indices = []
            if datetime_format:
                df_categorical_attrs[timestamps_name] = pd.to_datetime(df_categorical_attrs[timestamps_name], format=datetime_format, errors='coerce')
                valid_indices = df_categorical_attrs.dropna(subset=[timestamps_name]).index
                df_categorical_attrs = df_categorical_attrs.loc[valid_indices, :]
                df_numerical_attrs = df_numerical_attrs.loc[valid_indices, :]

                df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamps_name].dt.strftime(config_dict['default']['desired_format'])
                startdate = df_categorical_attrs[timestamps_name].min()
                enddate = df_categorical_attrs[timestamps_name].max()

                df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamps_name].astype(str)
                #header_dict.update({'startdate':startdate,'enddate':enddate})
                header_dict['startdate'] = str(startdate)
                header_dict['enddate'] = str(enddate)

            if len(timestamp_variables) > 1:
                df_categorical_attrs = df_categorical_attrs.drop(columns=timestamp_variables)

        #df_categorical_attrs.reindex(drop=True)
        #df_numerical_attrs.reindex(drop=True)

        categorical_variables = [item for item in df_categorical_attrs.columns]
        ####
        #elif 'RGA' in filename:
        #    df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Time(s)' : 'timestamps'})

        ###
        file_dict = {}
        path_tail, path_head = os.path.split(tmp_filename)

        file_dict['name'] = path_head
        # TODO: review this header dictionary; it may not be the best way to represent header data
        file_dict['attributes_dict'] = header_dict
        file_dict['datasets'] = []
        ####

        df = pd.concat((df_categorical_attrs, df_numerical_attrs), axis=1)

        #if numerical_variables:
        dataset = {}
        dataset['name'] = 'data_table'  #_numerical_variables'
        dataset['data'] = utils.convert_dataframe_to_np_structured_array(df)  #df_numerical_attrs.to_numpy()
        dataset['shape'] = dataset['data'].shape
        dataset['dtype'] = type(dataset['data'])
        #dataset['data_units'] = file_obj['wave']['data_units']
        #
        # Create attribute descriptions based on description_dict
        dataset['attributes'] = {}

        # Annotate column headers if description_dict is non-empty
        if description_dict:
            for column_name in df.columns:
                column_attr_dict = description_dict.get(column_name,
                                                        {'note': 'there was no description available. Review instrument files.'})
                dataset['attributes'].update({column_name: utils.convert_attrdict_to_np_structured_array(column_attr_dict)})

        #try:
        #    dataset['attributes'] = description_dict['table_header'].copy()
        #    for key in description_dict['table_header'].keys():
        #        if not key in numerical_variables:
        #            dataset['attributes'].pop(key)  # delete key
        #        else:
        #            dataset['attributes'][key] = utils.parse_attribute(dataset['attributes'][key])
        #    if timestamps_name in categorical_variables:
        #        dataset['attributes'][timestamps_name] = utils.parse_attribute({'unit':'YYYY-MM-DD HH:MM:SS.ffffff'})
        #except ValueError as err:
        #    print(err)

        # Represent string values as fixed-length strings in the HDF5 file; these need to be
        # decoded back to str when read. This gives better control than variable-length strings,
        # at the expense of flexibility.
        # https://docs.h5py.org/en/stable/strings.html

        if table_preamble:
            #header_dict["table_preamble"] = utils.convert_string_to_bytes(table_preamble)
            tp_dataset = {}
            tp_dataset['name'] = "table_preamble"
            tp_dataset['data'] = utils.convert_string_to_bytes(table_preamble)
            tp_dataset['shape'] = tp_dataset['data'].shape
            tp_dataset['dtype'] = type(tp_dataset['data'])
            tp_dataset['attributes'] = {}
            file_dict['datasets'].append(tp_dataset)

        file_dict['datasets'].append(dataset)

        #if categorical_variables:
        #    dataset = {}
        #    dataset['name'] = 'table_categorical_variables'
        #    dataset['data'] = dataframe_to_np_structured_array(df_categorical_attrs)  #df_categorical_attrs.loc[:,categorical_variables].to_numpy()
        #    dataset['shape'] = dataset['data'].shape
        #    dataset['dtype'] = type(dataset['data'])
        #    if timestamps_name in categorical_variables:
        #        dataset['attributes'] = {timestamps_name: utils.parse_attribute({'unit':'YYYY-MM-DD HH:MM:SS.ffffff'})}
        #    file_dict['datasets'].append(dataset)
    except:
        return {}

    return file_dict
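For orientation, here is a minimal usage sketch of read_acsm_files_as_dict. The data path and file name are hypothetical, and instruments_dir is left at its default so the configuration is resolved relative to the module:

from instruments.readers.acsm_tofware_reader import read_acsm_files_as_dict

# Hypothetical ACSM TOFWARE export; the path is illustrative only.
file_dict = read_acsm_files_as_dict('data/ACSM_TOFWARE/2024/ACSM_JFJ_2024_timeseries.txt')
if file_dict:  # an empty dict means the file could not be parsed
    print(file_dict['name'])             # file name without its directory part
    print(file_dict['attributes_dict'])  # e.g. {'startdate': '...', 'enddate': '...'}
    for ds in file_dict['datasets']:     # 'table_preamble' (if present) and 'data_table'
        print(ds['name'], ds['shape'])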
@@ -1,112 +1,112 @@

default:
  file_encoding : 'utf-8'
  separator : 'None'
  table_header : 'None'
  desired_format: '%Y-%m-%d %H:%M:%S.%f'

RGA:
  table_header : 'Time(s) Channel#1 Channel#2 Channel#3 Channel#4 Channel#5 Channel#6 Channel#7 Channel#8'
  separator : '\t'
  link_to_description: 'dictionaries/RGA.yaml'

Pressure:
  table_header : 'Date Time Vapore-Pressure 1 in Vapore-Pressure 2 in Baratron 1 in Baratron 2 in Baratron 3 in Baratron 4 in Temp. Ice-Sample in Temp. Heated-Sample in Temp. Cooler 1 in Temp. Cooler 2 in Flow Gas 1 in Pressure Chamber in X in Y in Z in None in Temp. Sealing in Flow Ice-Sample in'
  separator : '\t'
  timestamp: ['Date','Time']
  datetime_format: '%d.%m.%Y %H:%M:%S'
  link_to_description: 'dictionaries/Preassure.yaml'

Humidity_Sensors:
  table_header : 'Date Time RH1[%] RH2[%] RH3[%] RH4[%] RH5[%] RH6[%] RH7[%] RH8[%] T1[°C] T2[°C] T3[°C] T4[°C] T5[°C] T6[°C] T7[°C] T8[°C] DP1[°C] DP2[°C] DP3[°C] DP4[°C] DP5[°C] DP6[°C] DP7[°C] DP8[°C]'
  separator : '\t'
  file_encoding : 'latin-1'
  timestamp: ['Date','Time']
  datetime_format: '%d.%m.%Y %H:%M:%S'
  link_to_description: 'dictionaries/Humidity_Sensors.yaml'

HONO: #ICAD/HONO:
  table_header : 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) HONO (ppb) HONO Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) O4 (ppb) O4 Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource'
  separator : '\t'
  file_encoding : 'latin-1'
  timestamp: ['Start Date/Time (UTC)']
  datetime_format: '%Y-%m-%d %H:%M:%S.%f'
  link_to_description: 'dictionaries/ICAD_HONO.yaml'

NO2: #ICAD/NO2:
  table_header : 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource'
  separator : '\t'
  file_encoding : 'latin-1'
  timestamp: ['Start Date/Time (UTC)']
  datetime_format: '%Y-%m-%d %H:%M:%S.%f'
  link_to_description: 'dictionaries/ICAD_NO2.yaml'

Lopap:
  #table_header : 'Date;Time;Ch1;490.1;500.2;510.0;520.0;530.1;540.0;550.7;603.2;700.3;800.0;Ch2;500.5;510.3;520.5;530.7;540.8;550.5;550.8;560.9;570.9;581.2;586.2;591.2;596.1;601.1;606.4;611.3;'
  table_header : 'Date;Time;Ch1;'
  separator : ';'
  file_encoding : 'latin-1'
  timestamp: ['Date','Time']
  datetime_format: '%d.%m.%Y %H:%M:%S'
  link_to_description: 'dictionaries/Lopap.yaml'

T200_NOx:
  table_header : 'Date Time NO NO2 NOx'
  separator : '\t'
  file_encoding : 'latin-1'
  timestamp: ['Date','Time']
  datetime_format: '%d.%m.%Y %H:%M:%S'
  link_to_description: 'dictionaries/T200_NOx.yaml'

T360U_CO2:
  table_header : 'Date Time CO2'
  separator : '\t'
  file_encoding : 'latin-1'
  timestamp: ['Date','Time']
  datetime_format: '%d.%m.%Y %H:%M:%S'
  link_to_description: 'dictionaries/T360U_CO2.yaml'

smps:
  table_header: 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa) Relative Humidity (%) Mean Free Path (m) Gas Viscosity (Pa*s) Diameter Midpoint (nm) 15.7 16.3 16.8 17.5 18.1 18.8 19.5 20.2 20.9 21.7 22.5 23.3 24.1 25.0 25.9 26.9 27.9 28.9 30.0 31.1 32.2 33.4 34.6 35.9 37.2 38.5 40.0 41.4 42.9 44.5 46.1 47.8 49.6 51.4 53.3 55.2 57.3 59.4 61.5 63.8 66.1 68.5 71.0 73.7 76.4 79.1 82.0 85.1 88.2 91.4 94.7 98.2 101.8 105.5 109.4 113.4 117.6 121.9 126.3 131.0 135.8 140.7 145.9 151.2 156.8 162.5 168.5 174.7 181.1 187.7 194.6 201.7 209.1 216.7 224.7 232.9 241.4 250.3 259.5 269.0 278.8 289.0 299.6 310.6 322.0 333.8 346.0 358.7 371.8 385.4 399.5 414.2 429.4 445.1 461.4 478.3 495.8 514.0 532.8 552.3 572.5 593.5 615.3 637.8 Scan Time (s) Retrace Time (s) Scan Resolution (Hz) Scans Per Sample Sheath Flow (L/min) Aerosol Flow (L/min) Bypass Flow (L/min) Low Voltage (V) High Voltage (V) Lower Size (nm) Upper Size (nm) Density (g/cm³) td + 0.5 (s) tf (s) D50 (nm) Neutralizer'
  separator : '\t'
  file_encoding : 'latin-1'
  timestamp: ['Date','Start Time']
  datetime_format: '%d/%m/%Y %H:%M:%S'
  link_to_description: 'dictionaries/smps.yaml'

gas:
  table_header : 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4 APHA370THC HygroclipRH HygroclipT ML9850SO2 ozone49c PAMrh PAMt xxxal xxxal xxxal xxxal ThermoCouple0 ThermoCouple1 ThermoCouple2 ThermoCouple3 xxxTC xxxTC xxxTC xxxTC xxxTC xxxTC xxxTC xxxTC xxxTC xxxTC xxxTC xxxTC CPC xxx LicorH2Odelta LicorCO2delta xxx 2BO2 xxx xxx HoribaCO xxx'
  separator : '\t'
  file_encoding : 'utf-8'
  timestamp: ['Date_Time']
  datetime_format: '%Y.%m.%d %H:%M:%S'
  link_to_description: 'dictionaries/gas.yaml'

ACSM_TOFWARE:
  table_header:
  #txt:
  - 't_base VaporizerTemp_C HeaterBias_V FlowRefWave FlowRate_mb FlowRate_ccs FilamentEmission_mA Detector_V AnalogInput06_V ABRefWave ABsamp ABCorrFact'
  - 't_start_Buf,Chl_11000,NH4_11000,SO4_11000,NO3_11000,Org_11000,SO4_48_11000,SO4_62_11000,SO4_82_11000,SO4_81_11000,SO4_98_11000,NO3_30_11000,Org_60_11000,Org_43_11000,Org_44_11000'
  #csv:
  - "X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 X32 X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49 X50 X51 X52 X53 X54 X55 X56 X57 X58 X59 X60 X61 X62 X63 X64 X65 X66 X67 X68 X69 X70 X71 X72 X73 X74 X75 X76 X77 X78 X79 X80 X81 X82 X83 X84 X85 X86 X87 X88 X89 X90 X91 X92 X93 X94 X95 X96 X97 X98 X99 X100 X101 X102 X103 X104 X105 X106 X107 X108 X109 X110 X111 X112 X113 X114 X115 X116 X117 X118 X119 X120 X121 X122 X123 X124 X125 X126 X127 X128 X129 X130 X131 X132 X133 X134 X135 X136 X137 X138 X139 X140 X141 X142 X143 X144 X145 X146 X147 X148 X149 X150 X151 X152 X153 X154 X155 X156 X157 X158 X159 X160 X161 X162 X163 X164 X165 X166 X167 X168 X169 X170 X171 X172 X173 X174 X175 X176 X177 X178 X179 X180 X181 X182 X183 X184 X185 X186 X187 X188 X189 X190 X191 X192 X193 X194 X195 X196 X197 X198 X199 X200 X201 X202 X203 X204 X205 X206 X207 X208 X209 X210 X211 X212 X213 X214 X215 X216 X217 X218 X219"
  - "X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 X32 X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49 X50 X51 X52 X53 X54 X55 X56 X57 X58 X59 X60 X61 X62 X63 X64 X65 X66 X67 X68 X69 X70 X71 X72 X73 X74 X75 X76 X77 X78 X79 X80 X81 X82 X83 X84 X85 X86 X87 X88 X89 X90 X91 X92 X93 X94 X95 X96 X97 X98 X99 X100 X101 X102 X103 X104 X105 X106 X107 X108 X109 X110 X111 X112 X113 X114 X115 X116 X117 X118 X119 X120 X121 X122 X123 X124 X125 X126 X127 X128 X129 X130 X131 X132 X133 X134 X135 X136 X137 X138 X139 X140 X141 X142 X143 X144 X145 X146 X147 X148 X149 X150 X151 X152 X153 X154 X155 X156 X157 X158 X159 X160 X161 X162 X163 X164 X165 X166 X167 X168 X169 X170 X171 X172 X173 X174 X175 X176 X177 X178 X179 X180 X181 X182 X183 X184 X185 X186 X187 X188 X189 X190 X191 X192 X193 X194 X195 X196 X197 X198 X199 X200 X201 X202 X203 X204 X205 X206 X207 X208 X209 X210 X211 X212 X213 X214 X215 X216 X217 X218 X219"
  - 'MSS_base'
  - 'tseries'
  separator:
  #txt:
  - "\t"
  - ","
  #csv:
  - "\t"
  - "\t"
  - "None"
  - "None"
  file_encoding:
  #txt:
  - "utf-8"
  - "utf-8"
  #csv:
  - "utf-8"
  - "utf-8"
  - "utf-8"
  - "utf-8"
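In the ACSM_TOFWARE entry above, table_header, separator and file_encoding are parallel lists: position i of each list describes one file variant, and the readers use the index of the matched header line (tb_idx) to pick the matching separator and encoding. A small sketch of that lookup, assuming this is the config_text_reader.yaml that read_txt_files_as_dict loads (the literal path below is an assumption):

import yaml

with open('instruments/readers/config_text_reader.yaml', 'r') as stream:  # assumed location of this config file
    config_dict = yaml.load(stream, Loader=yaml.FullLoader)

acsm_cfg = config_dict['ACSM_TOFWARE']
for tb_idx, tb in enumerate(acsm_cfg['table_header']):
    sep = acsm_cfg['separator'][tb_idx]       # '\t', ',' or 'None', aligned with the header entry
    enc = acsm_cfg['file_encoding'][tb_idx]   # all 'utf-8' for ACSM_TOFWARE
    print(tb_idx, repr(sep), enc, tb[:40])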
@@ -1,80 +1,80 @@

import os
import sys
#root_dir = os.path.abspath(os.curdir)
#sys.path.append(root_dir)

from instruments.readers.xps_ibw_reader import read_xps_ibw_file_as_dict
from instruments.readers.g5505_text_reader import read_txt_files_as_dict


file_extensions = ['.ibw','.txt','.dat','.h5','.TXT','.csv','.pkl','.json','.yaml']

# Define the instruments directory (modify this as needed or set to None)
default_instruments_dir = None  # or provide an absolute path

file_readers = {
    'ibw': lambda a1: read_xps_ibw_file_as_dict(a1),
    'txt': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
    'TXT': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
    'dat': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
    #'ACSM_TOFWARE_txt': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
    #'ACSM_TOFWARE_csv': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False)
}

# Add new "instrument reader (Data flagging app data)"

from instruments.readers.acsm_tofware_reader import read_acsm_files_as_dict
file_extensions.append('.txt')
file_readers.update({'ACSM_TOFWARE_txt' : lambda x: read_acsm_files_as_dict(x, instruments_dir=default_instruments_dir, work_with_copy=False)})

file_extensions.append('.csv')
file_readers.update({'ACSM_TOFWARE_csv' : lambda x: read_acsm_files_as_dict(x, instruments_dir=default_instruments_dir, work_with_copy=False)})

from instruments.readers.flag_reader import read_jsonflag_as_dict
file_extensions.append('.json')
file_readers.update({'ACSM_TOFWARE_flags_json' : lambda x: read_jsonflag_as_dict(x)})


def compute_filereader_key_from_path(hdf5_file_path):
    """Constructs the key 'instrumentname_ext' based on hdf5_file_path, structured as
    /instrumentname/to/filename.ext, which accesses the file reader that should be used to read such a file.

    Parameters
    ----------
    hdf5_file_path : str
        _description_

    Returns
    -------
    _type_
        _description_
    """

    parts = hdf5_file_path.strip('/').split('/')

    # Extract the filename and its extension
    filename, file_extension = os.path.splitext(parts[-1])

    # Extract the first directory directly under the root directory '/' in the hdf5 file
    subfolder_name = parts[0] if len(parts) > 1 else ""

    # Remove the leading dot from the file extension
    file_extension = file_extension.lstrip('.')

    # Construct the resulting string
    full_string = f"{subfolder_name}_{file_extension}"

    return full_string, file_extension


def select_file_reader(path):
    full_string, extension = compute_filereader_key_from_path(path)

    # First, try to match the full string
    if full_string in file_readers:
        return file_readers[full_string]

    # If no match, try to match the reader using only the extension
    if extension in file_readers:
        return file_readers[extension]

    # Default case if no reader is found
    return lambda x : None
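With the registry functions above in scope, reader selection works as in this short, hypothetical example (the group and file names are made up):

# The first path component names the instrument; the extension picks the variant.
key, ext = compute_filereader_key_from_path('/ACSM_TOFWARE/2024/ACSM_JFJ_meta.txt')
print(key, ext)   # -> 'ACSM_TOFWARE_txt', 'txt'

reader = select_file_reader('/ACSM_TOFWARE/2024/ACSM_JFJ_meta.txt')  # matches the 'ACSM_TOFWARE_txt' entry
file_dict = reader('path/to/source/ACSM_JFJ_meta.txt')
# Keys without a dedicated entry fall back to the bare extension ('txt', 'ibw', ...);
# if nothing matches, a lambda returning None is used.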
@@ -1,39 +1,39 @@

import os
import json

#root_dir = os.path.abspath(os.curdir)
#sys.path.append(root_dir)
#print(__file__)

#from instruments.readers import set_dima_path as configpath
#configpath.set_dima_path()

from utils import g5505_utils


def read_jsonflag_as_dict(path_to_file):

    file_dict = {}
    path_tail, path_head = os.path.split(path_to_file)

    file_dict['name'] = path_head
    # TODO: review this header dictionary, it may not be the best way to represent header data
    file_dict['attributes_dict'] = {}
    file_dict['datasets'] = []

    try:
        with open(path_to_file, 'r') as stream:
            flag = json.load(stream)  #, Loader=json.FullLoader)
    except (FileNotFoundError, json.JSONDecodeError) as exc:
        print(exc)

    dataset = {}
    dataset['name'] = 'data_table'  #_numerical_variables'
    dataset['data'] = g5505_utils.convert_attrdict_to_np_structured_array(flag)  #df_numerical_attrs.to_numpy()
    dataset['shape'] = dataset['data'].shape
    dataset['dtype'] = type(dataset['data'])

    file_dict['datasets'].append(dataset)

    return file_dict
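A minimal sketch of the dictionary returned by read_jsonflag_as_dict; the flag file name is hypothetical:

from instruments.readers.flag_reader import read_jsonflag_as_dict

file_dict = read_jsonflag_as_dict('flags/ACSM_TOFWARE_flag_001.json')  # hypothetical path
print(file_dict['name'])             # 'ACSM_TOFWARE_flag_001.json'
print(file_dict['attributes_dict'])  # {} (flag files carry no header attributes)
ds = file_dict['datasets'][0]
print(ds['name'], ds['shape'])       # 'data_table' and the shape of the structured array built from the JSON dict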
@@ -1,239 +1,239 @@

import sys
import os
import pandas as pd
import collections
import yaml

# Import project modules
root_dir = os.path.abspath(os.curdir)
sys.path.append(root_dir)

import utils.g5505_utils as utils


def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with_copy: bool = True):
    # If instruments_dir is not provided, use the default path relative to the module directory
    if not instruments_dir:
        # Assuming the instruments folder is one level up from the source module directory
        module_dir = os.path.dirname(__file__)
        instruments_dir = os.path.join(module_dir, '..')

    # Normalize the path (resolves any '..' in the path)
    instrument_configs_path = os.path.abspath(os.path.join(instruments_dir, 'readers', 'config_text_reader.yaml'))

    with open(instrument_configs_path, 'r') as stream:
        try:
            config_dict = yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            print(exc)
    # Verify whether the file can be read by the available instrument configurations.
    #if not any(key in filename.replace(os.sep,'/') for key in config_dict.keys()):
    #    return {}

    # TODO: this may be prone to error if the assumed folder structure is not compliant
    file_encoding = config_dict['default']['file_encoding']  #'utf-8'
    separator = config_dict['default']['separator']
    table_header = config_dict['default']['table_header']

    for key in config_dict.keys():
        if key.replace('/', os.sep) in filename:
            file_encoding = config_dict[key].get('file_encoding', file_encoding)
            separator = config_dict[key].get('separator', separator)
            table_header = config_dict[key].get('table_header', table_header)
            timestamp_variables = config_dict[key].get('timestamp', [])
            datetime_format = config_dict[key].get('datetime_format', [])

            description_dict = {}
            link_to_description = config_dict[key].get('link_to_description', '').replace('/', os.sep)

            if link_to_description:
                path = os.path.join(instruments_dir, link_to_description)
                try:
                    with open(path, 'r') as stream:
                        description_dict = yaml.load(stream, Loader=yaml.FullLoader)
                except (FileNotFoundError, yaml.YAMLError) as exc:
                    print(exc)
    #if 'None' in table_header:
    #    return {}

    # Read the header as a dictionary and detect where the data table starts
    header_dict = {}
    data_start = False
    # Work with a copy of the file for safety
    if work_with_copy:
        tmp_filename = utils.make_file_copy(source_file_path=filename)
    else:
        tmp_filename = filename

    #with open(tmp_filename,'rb',encoding=file_encoding,errors='ignore') as f:

    if not isinstance(table_header, list):
        table_header = [table_header]
        file_encoding = [file_encoding]
        separator = [separator]

    with open(tmp_filename, 'rb') as f:
        table_preamble = []
        for line_number, line in enumerate(f):

            for tb_idx, tb in enumerate(table_header):
                if tb in line.decode(file_encoding[tb_idx]):
                    break

            if tb in line.decode(file_encoding[tb_idx]):
                list_of_substrings = line.decode(file_encoding[tb_idx]).split(separator[tb_idx].replace('\\t', '\t'))

                # Count occurrences of each substring
                substring_counts = collections.Counter(list_of_substrings)
                data_start = True
                # Generate column names, appending an index only for repeated substrings
                column_names = [f"{i}_{name.strip()}" if substring_counts[name] > 1 else name.strip() for i, name in enumerate(list_of_substrings)]

                #column_names = [str(i)+'_'+name.strip() for i, name in enumerate(list_of_substrings)]
                #column_names = []
                #for i, name in enumerate(list_of_substrings):
                #    column_names.append(str(i)+'_'+name)

                #print(line_number, len(column_names), '\n')
                break
            # Subdivide the line into words and join them with a single space.
            # I assume this produces a cleaner line that contains no stray separator characters (\t, \r) or extra spaces.
            list_of_substrings = line.decode(file_encoding[tb_idx]).split()
            # TODO: ideally we should use a multiline string, but the YAML parser does not recognize \n as a special character
            #line = ' '.join(list_of_substrings+['\n'])
            #line = ' '.join(list_of_substrings)
            table_preamble.append(' '.join([item for item in list_of_substrings]))  # += new_line

    # TODO: it does not work with separator as None; fix for RGA
    try:
        df = pd.read_csv(tmp_filename,
                         delimiter=separator[tb_idx].replace('\\t', '\t'),
                         header=line_number,
                         #encoding='latin-1',
                         encoding=file_encoding[tb_idx],
                         names=column_names,
                         skip_blank_lines=True)

        df_numerical_attrs = df.select_dtypes(include='number')
        df_categorical_attrs = df.select_dtypes(exclude='number')
        numerical_variables = [item for item in df_numerical_attrs.columns]

        # Consolidate the separate 'date' and 'time' columns specified in text_data_source.yaml into a single timestamp column
        if timestamp_variables:
            #df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index]
            #df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]

            #df_categorical_attrs['timestamps'] = df_categorical_attrs[timestamp_variables].astype(str).agg(' '.join, axis=1)
            timestamps_name = ' '.join(timestamp_variables)
            df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamp_variables].astype(str).agg(' '.join, axis=1)

            valid_indices = []
            if datetime_format:
                df_categorical_attrs[timestamps_name] = pd.to_datetime(df_categorical_attrs[timestamps_name], format=datetime_format, errors='coerce')
                valid_indices = df_categorical_attrs.dropna(subset=[timestamps_name]).index
                df_categorical_attrs = df_categorical_attrs.loc[valid_indices, :]
                df_numerical_attrs = df_numerical_attrs.loc[valid_indices, :]

                df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamps_name].dt.strftime(config_dict['default']['desired_format'])
                startdate = df_categorical_attrs[timestamps_name].min()
                enddate = df_categorical_attrs[timestamps_name].max()

                df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamps_name].astype(str)
                #header_dict.update({'startdate':startdate,'enddate':enddate})
                header_dict['startdate'] = str(startdate)
                header_dict['enddate'] = str(enddate)

            if len(timestamp_variables) > 1:
                df_categorical_attrs = df_categorical_attrs.drop(columns=timestamp_variables)

        #df_categorical_attrs.reindex(drop=True)
        #df_numerical_attrs.reindex(drop=True)

        categorical_variables = [item for item in df_categorical_attrs.columns]
        ####
        #elif 'RGA' in filename:
        #    df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Time(s)' : 'timestamps'})

        ###
        file_dict = {}
        path_tail, path_head = os.path.split(tmp_filename)

        file_dict['name'] = path_head
        # TODO: review this header dictionary; it may not be the best way to represent header data
        file_dict['attributes_dict'] = header_dict
        file_dict['datasets'] = []
        ####

        df = pd.concat((df_categorical_attrs, df_numerical_attrs), axis=1)

        #if numerical_variables:
        dataset = {}
        dataset['name'] = 'data_table'  #_numerical_variables'
        dataset['data'] = utils.convert_dataframe_to_np_structured_array(df)  #df_numerical_attrs.to_numpy()
        dataset['shape'] = dataset['data'].shape
        dataset['dtype'] = type(dataset['data'])
        #dataset['data_units'] = file_obj['wave']['data_units']
        #
        # Create attribute descriptions based on description_dict
        dataset['attributes'] = {}

        # Annotate column headers if description_dict is non-empty
        if description_dict:
            for column_name in df.columns:
                column_attr_dict = description_dict['table_header'].get(column_name,
                                                                        {'note': 'there was no description available. Review instrument files.'})
                dataset['attributes'].update({column_name: utils.convert_attrdict_to_np_structured_array(column_attr_dict)})

        #try:
        #    dataset['attributes'] = description_dict['table_header'].copy()
        #    for key in description_dict['table_header'].keys():
        #        if not key in numerical_variables:
        #            dataset['attributes'].pop(key)  # delete key
        #        else:
        #            dataset['attributes'][key] = utils.parse_attribute(dataset['attributes'][key])
        #    if timestamps_name in categorical_variables:
        #        dataset['attributes'][timestamps_name] = utils.parse_attribute({'unit':'YYYY-MM-DD HH:MM:SS.ffffff'})
        #except ValueError as err:
        #    print(err)

        # Represent string values as fixed-length strings in the HDF5 file; these need to be
        # decoded back to str when read. This gives better control than variable-length strings,
        # at the expense of flexibility.
        # https://docs.h5py.org/en/stable/strings.html

        if table_preamble:
            #header_dict["table_preamble"] = utils.convert_string_to_bytes(table_preamble)
            tp_dataset = {}
            tp_dataset['name'] = "table_preamble"
            tp_dataset['data'] = utils.convert_string_to_bytes(table_preamble)
            tp_dataset['shape'] = tp_dataset['data'].shape
            tp_dataset['dtype'] = type(tp_dataset['data'])
            tp_dataset['attributes'] = {}
            file_dict['datasets'].append(tp_dataset)

        file_dict['datasets'].append(dataset)

        #if categorical_variables:
        #    dataset = {}
        #    dataset['name'] = 'table_categorical_variables'
        #    dataset['data'] = dataframe_to_np_structured_array(df_categorical_attrs)  #df_categorical_attrs.loc[:,categorical_variables].to_numpy()
        #    dataset['shape'] = dataset['data'].shape
        #    dataset['dtype'] = type(dataset['data'])
        #    if timestamps_name in categorical_variables:
        #        dataset['attributes'] = {timestamps_name: utils.parse_attribute({'unit':'YYYY-MM-DD HH:MM:SS.ffffff'})}
        #    file_dict['datasets'].append(dataset)
    except:
        return {}

    return file_dict
import sys
import os
import pandas as pd
import collections
import yaml

# Import project modules
root_dir = os.path.abspath(os.curdir)
sys.path.append(root_dir)

import utils.g5505_utils as utils


def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with_copy: bool = True):
    # If instruments_dir is not provided, use the default path relative to the module directory
    if not instruments_dir:
        # Assuming the instruments folder is one level up from the source module directory
        module_dir = os.path.dirname(__file__)
        instruments_dir = os.path.join(module_dir, '..')

    # Normalize the path (resolves any '..' in the path)
    instrument_configs_path = os.path.abspath(os.path.join(instruments_dir, 'readers', 'config_text_reader.yaml'))

    with open(instrument_configs_path, 'r') as stream:
        try:
            config_dict = yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            print(exc)
    # Verify whether the file can be read by the available instrument configurations.
    #if not any(key in filename.replace(os.sep,'/') for key in config_dict.keys()):
    #    return {}

    # TODO: this may be prone to error if the assumed folder structure is not compliant
    file_encoding = config_dict['default']['file_encoding'] #'utf-8'
    separator = config_dict['default']['separator']
    table_header = config_dict['default']['table_header']

    # Initialize per-file settings so they are defined even if no instrument-specific section matches the filename below.
    timestamp_variables = []
    datetime_format = []
    description_dict = {}

    for key in config_dict.keys():
        if key.replace('/', os.sep) in filename:
            file_encoding = config_dict[key].get('file_encoding', file_encoding)
            separator = config_dict[key].get('separator', separator)
            table_header = config_dict[key].get('table_header', table_header)
            timestamp_variables = config_dict[key].get('timestamp', [])
            datetime_format = config_dict[key].get('datetime_format', [])

            description_dict = {}
            link_to_description = config_dict[key].get('link_to_description', '').replace('/', os.sep)

            if link_to_description:
                path = os.path.join(instruments_dir, link_to_description)
                try:
                    with open(path, 'r') as stream:
                        description_dict = yaml.load(stream, Loader=yaml.FullLoader)
                except (FileNotFoundError, yaml.YAMLError) as exc:
                    print(exc)
    #if 'None' in table_header:
    #    return {}
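
    # Illustrative note (an assumption about the YAML layout, inferred from how config_dict is used in this
    # function, not taken verbatim from config_text_reader.yaml): the loaded dictionary is expected to resemble
    #   {'default': {'file_encoding': 'utf-8', 'separator': ',', 'table_header': '...', 'desired_format': '...'},
    #    '<instrument folder>': {'table_header': '...', 'timestamp': ['Date', 'Time'],
    #                            'datetime_format': '%Y-%m-%d %H:%M:%S', 'link_to_description': '...'}}
    # where a key matching part of the file path overrides the defaults.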
    # Read the header as a dictionary and detect where the data table starts
    header_dict = {}
    data_start = False
    # Work with a copy of the file for safety
    if work_with_copy:
        tmp_filename = utils.make_file_copy(source_file_path=filename)
    else:
        tmp_filename = filename

    #with open(tmp_filename,'rb',encoding=file_encoding,errors='ignore') as f:

    if not isinstance(table_header, list):
        table_header = [table_header]
        file_encoding = [file_encoding]
        separator = [separator]

    with open(tmp_filename, 'rb') as f:
        table_preamble = []
        for line_number, line in enumerate(f):

            for tb_idx, tb in enumerate(table_header):
                if tb in line.decode(file_encoding[tb_idx]):
                    break

            if tb in line.decode(file_encoding[tb_idx]):
                list_of_substrings = line.decode(file_encoding[tb_idx]).split(separator[tb_idx].replace('\\t', '\t'))

                # Count occurrences of each substring
                substring_counts = collections.Counter(list_of_substrings)
                data_start = True
                # Generate column names with an appended index only for repeated substrings
                column_names = [f"{i}_{name.strip()}" if substring_counts[name] > 1 else name.strip() for i, name in enumerate(list_of_substrings)]

                #column_names = [str(i)+'_'+name.strip() for i, name in enumerate(list_of_substrings)]
                #column_names = []
                #for i, name in enumerate(list_of_substrings):
                #    column_names.append(str(i)+'_'+name)

                #print(line_number, len(column_names), '\n')
                break
            # Subdivide the line into words and join them by a single space.
            # I assume this produces a cleaner line without stray separator characters (\t, \r) or extra spaces.
            list_of_substrings = line.decode(file_encoding[tb_idx]).split()
            # TODO: ideally we should use a multiline string, but the yaml parser does not recognize \n as a special character
            #line = ' '.join(list_of_substrings+['\n'])
            #line = ' '.join(list_of_substrings)
            table_preamble.append(' '.join([item for item in list_of_substrings])) # += new_line

    # TODO: it does not work with separator as None :(. Fix for RGA.
    try:
        df = pd.read_csv(tmp_filename,
                         delimiter=separator[tb_idx].replace('\\t', '\t'),
                         header=line_number,
                         #encoding='latin-1',
                         encoding=file_encoding[tb_idx],
                         names=column_names,
                         skip_blank_lines=True)

        df_numerical_attrs = df.select_dtypes(include='number')
        df_categorical_attrs = df.select_dtypes(exclude='number')
        numerical_variables = [item for item in df_numerical_attrs.columns]

        # Consolidate the separate date and time columns specified in the instrument configuration (config_text_reader.yaml) into a single timestamp column
        if timestamp_variables:
            #df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index]
            #df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]

            #df_categorical_attrs['timestamps'] = df_categorical_attrs[timestamp_variables].astype(str).agg(' '.join, axis=1)
            timestamps_name = ' '.join(timestamp_variables)
            df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamp_variables].astype(str).agg(' '.join, axis=1)
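            # Illustrative note (not in the original source): for timestamp_variables = ['Date', 'Time'],
            # a row with Date '2022-01-31' and Time '12:00:00' is joined into a single 'Date Time'
            # column holding '2022-01-31 12:00:00', which is parsed into a datetime below.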

            valid_indices = []
            if datetime_format:
                df_categorical_attrs[timestamps_name] = pd.to_datetime(df_categorical_attrs[timestamps_name], format=datetime_format, errors='coerce')
                valid_indices = df_categorical_attrs.dropna(subset=[timestamps_name]).index
                df_categorical_attrs = df_categorical_attrs.loc[valid_indices, :]
                df_numerical_attrs = df_numerical_attrs.loc[valid_indices, :]

                df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamps_name].dt.strftime(config_dict['default']['desired_format'])
                startdate = df_categorical_attrs[timestamps_name].min()
                enddate = df_categorical_attrs[timestamps_name].max()

                df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamps_name].astype(str)
                #header_dict.update({'stastrrtdate':startdate,'enddate':enddate})
                header_dict['startdate'] = str(startdate)
                header_dict['enddate'] = str(enddate)

            if len(timestamp_variables) > 1:
                df_categorical_attrs = df_categorical_attrs.drop(columns=timestamp_variables)

            #df_categorical_attrs.reindex(drop=True)
            #df_numerical_attrs.reindex(drop=True)

        categorical_variables = [item for item in df_categorical_attrs.columns]
        ####
        #elif 'RGA' in filename:
        #    df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Time(s)' : 'timestamps'})

        ###
        file_dict = {}
        path_tail, path_head = os.path.split(tmp_filename)

        file_dict['name'] = path_head
        # TODO: review this header dictionary, it may not be the best way to represent header data
        file_dict['attributes_dict'] = header_dict
        file_dict['datasets'] = []
        ####

        df = pd.concat((df_categorical_attrs, df_numerical_attrs), axis=1)

        #if numerical_variables:
        dataset = {}
        dataset['name'] = 'data_table' #_numerical_variables'
        dataset['data'] = utils.convert_dataframe_to_np_structured_array(df) #df_numerical_attrs.to_numpy()
        dataset['shape'] = dataset['data'].shape
        dataset['dtype'] = type(dataset['data'])
        #dataset['data_units'] = file_obj['wave']['data_units']
        #
        # Create attribute descriptions based on description_dict
        dataset['attributes'] = {}

        # Annotate column headers if description_dict is non empty
        if description_dict:
            for column_name in df.columns:
                column_attr_dict = description_dict['table_header'].get(column_name,
                                                                        {'note': 'there was no description available. Review instrument files.'})
                dataset['attributes'].update({column_name: utils.convert_attrdict_to_np_structured_array(column_attr_dict)})

        #try:
        #    dataset['attributes'] = description_dict['table_header'].copy()
        #    for key in description_dict['table_header'].keys():
        #        if not key in numerical_variables:
        #            dataset['attributes'].pop(key) # delete key
        #        else:
        #            dataset['attributes'][key] = utils.parse_attribute(dataset['attributes'][key])
        #    if timestamps_name in categorical_variables:
        #        dataset['attributes'][timestamps_name] = utils.parse_attribute({'unit':'YYYY-MM-DD HH:MM:SS.ffffff'})
        #except ValueError as err:
        #    print(err)

        # Represent string values as fixed-length strings in the HDF5 file; these need to be decoded
        # back to str when read. Fixed-length strings provide better control than variable-length
        # strings, at the expense of flexibility.
        # https://docs.h5py.org/en/stable/strings.html

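        # Illustrative sketch (an assumption, not part of this module): utils.convert_dataframe_to_np_structured_array
        # is expected to map object/str columns to fixed-length byte fields, roughly like
        #   max_len = df[col].astype(str).str.len().max()
        #   field = (col, f'S{max_len}')   # numpy fixed-length byte-string dtype
        # so that h5py writes them as fixed-length strings (see the link above).
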
        if table_preamble:
            #header_dict["table_preamble"] = utils.convert_string_to_bytes(table_preamble)
            tp_dataset = {}
            tp_dataset['name'] = "table_preamble"
            tp_dataset['data'] = utils.convert_string_to_bytes(table_preamble)
            tp_dataset['shape'] = tp_dataset['data'].shape
            tp_dataset['dtype'] = type(tp_dataset['data'])
            tp_dataset['attributes'] = {}
            file_dict['datasets'].append(tp_dataset)

        file_dict['datasets'].append(dataset)

        #if categorical_variables:
        #    dataset = {}
        #    dataset['name'] = 'table_categorical_variables'
        #    dataset['data'] = dataframe_to_np_structured_array(df_categorical_attrs) #df_categorical_attrs.loc[:,categorical_variables].to_numpy()
        #    dataset['shape'] = dataset['data'].shape
        #    dataset['dtype'] = type(dataset['data'])
        #    if timestamps_name in categorical_variables:
        #        dataset['attributes'] = {timestamps_name: utils.parse_attribute({'unit':'YYYY-MM-DD HH:MM:SS.ffffff'})}
        #    file_dict['datasets'].append(dataset)
    except Exception as exc:
        # Report the error instead of silently swallowing it, then fall back to an empty dict.
        print(exc)
        return {}

    return file_dict
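

# Minimal usage sketch (illustrative; the path below is hypothetical and not part of the original module):
if __name__ == '__main__':
    example_dict = read_txt_files_as_dict('data/RGA/example_measurement.txt', work_with_copy=False)
    if example_dict:
        print(example_dict['name'], [ds['name'] for ds in example_dict['datasets']])
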
@ -1,79 +1,79 @@
import os
from igor2.binarywave import load as loadibw

def read_xps_ibw_file_as_dict(filename):
    """
    Reads IBW files from the Multiphase Chemistry Group, which contain XPS spectra and acquisition settings,
    and formats the data into a dictionary with the structure {datasets: list of datasets}. Each dataset in the
    list has the following structure:

    {
        'name': 'name',
        'data': data_array,
        'data_units': 'units',
        'shape': data_shape,
        'dtype': data_type
    }

    Parameters
    ----------
    filename : str
        The IBW filename from the Multiphase Chemistry Group beamline.

    Returns
    -------
    file_dict : dict
        A dictionary containing the datasets from the IBW file.

    Raises
    ------
    ValueError
        If the input IBW file is not a valid IBW file.

    """

    file_obj = loadibw(filename)

    required_keys = ['wData','data_units','dimension_units','note']
    if sum([item in required_keys for item in file_obj['wave'].keys()]) < len(required_keys):
        raise ValueError('This is not a valid xps ibw file. It does not satisfy minimum adimissibility criteria.')

    file_dict = {}
    path_tail, path_head = os.path.split(filename)

    # Group name and attributes
    file_dict['name'] = path_head
    file_dict['attributes_dict'] = {}

    # Convert notes of bytes class to string class and split string into a list of elements separated by '\r'.
    notes_list = file_obj['wave']['note'].decode("utf-8").split('\r')
    exclude_list = ['Excitation Energy']
    for item in notes_list:
        if '=' in item:
            key, value = tuple(item.split('='))
            # TODO: check if value can be converted into a numeric type. Now all values are string type
            if not key in exclude_list:
                file_dict['attributes_dict'][key] = value

    # TODO: talk to Thorsten to see if there is an easier way to access the below attributes
    dimension_labels = file_obj['wave']['dimension_units'].decode("utf-8").split(']')
    file_dict['attributes_dict']['dimension_units'] = [item+']' for item in dimension_labels[0:len(dimension_labels)-1]]

    # Datasets and their attributes

    file_dict['datasets'] = []

    dataset = {}
    dataset['name'] = 'spectrum'
    dataset['data'] = file_obj['wave']['wData']
    dataset['data_units'] = file_obj['wave']['data_units']
    dataset['shape'] = dataset['data'].shape
    dataset['dtype'] = type(dataset['data'])

    # TODO: include energy axis dataset

    file_dict['datasets'].append(dataset)


import os
from igor2.binarywave import load as loadibw

def read_xps_ibw_file_as_dict(filename):
    """
    Reads IBW files from the Multiphase Chemistry Group, which contain XPS spectra and acquisition settings,
    and formats the data into a dictionary with the structure {datasets: list of datasets}. Each dataset in the
    list has the following structure:

    {
        'name': 'name',
        'data': data_array,
        'data_units': 'units',
        'shape': data_shape,
        'dtype': data_type
    }

    Parameters
    ----------
    filename : str
        The IBW filename from the Multiphase Chemistry Group beamline.

    Returns
    -------
    file_dict : dict
        A dictionary containing the datasets from the IBW file.

    Raises
    ------
    ValueError
        If the input IBW file is not a valid IBW file.

    """

    file_obj = loadibw(filename)

    required_keys = ['wData', 'data_units', 'dimension_units', 'note']
    if sum([item in required_keys for item in file_obj['wave'].keys()]) < len(required_keys):
        raise ValueError('This is not a valid XPS IBW file. It does not satisfy the minimum admissibility criteria.')

    file_dict = {}
    path_tail, path_head = os.path.split(filename)

    # Group name and attributes
    file_dict['name'] = path_head
    file_dict['attributes_dict'] = {}

    # Decode the notes from bytes to str and split them into a list of entries separated by '\r'.
    notes_list = file_obj['wave']['note'].decode("utf-8").split('\r')
    exclude_list = ['Excitation Energy']
    for item in notes_list:
        if '=' in item:
            key, value = tuple(item.split('='))
            # TODO: check whether value can be converted into a numeric type. For now all values are kept as strings.
            if key not in exclude_list:
                file_dict['attributes_dict'][key] = value
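
    # Illustrative note (not in the original source): an entry such as "Pass Energy=20" is split at '='
    # into key 'Pass Energy' and value '20'; values stay as strings (see the TODO above), and entries
    # whose key appears in exclude_list are skipped.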

    # TODO: talk to Thorsten to see if there is an easier way to access the below attributes
    dimension_labels = file_obj['wave']['dimension_units'].decode("utf-8").split(']')
    file_dict['attributes_dict']['dimension_units'] = [item+']' for item in dimension_labels[0:len(dimension_labels)-1]]
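    # Illustrative note (not in the original source): a decoded dimension_units string such as '[eV][counts]'
    # splits on ']' into ['[eV', '[counts', ''], and re-appending ']' to all but the last element
    # recovers ['[eV]', '[counts]'].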

    # Datasets and their attributes

    file_dict['datasets'] = []

    dataset = {}
    dataset['name'] = 'spectrum'
    dataset['data'] = file_obj['wave']['wData']
    dataset['data_units'] = file_obj['wave']['data_units']
    dataset['shape'] = dataset['data'].shape
    dataset['dtype'] = type(dataset['data'])

    # TODO: include energy axis dataset

    file_dict['datasets'].append(dataset)

    return file_dict
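

# Minimal usage sketch (illustrative; the .ibw path is hypothetical and not part of the original module):
if __name__ == '__main__':
    xps_dict = read_xps_ibw_file_as_dict('example_spectrum.ibw')
    for ds in xps_dict['datasets']:
        print(ds['name'], ds['shape'], ds['data_units'])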