Modify instruments/readers/g5505_text_reader.py to include new instrument CEDOAS, which produces multi-format files. File dependencies were updated accordingly.

This commit is contained in:
2025-06-25 12:00:55 +02:00
parent 334335387e
commit e6df345578
5 changed files with 257 additions and 153 deletions

View File

@ -0,0 +1,42 @@
table_header:
w_CenterTime:
description: time between start and stop of the measurement
units: YYYY/MM/DD HH:MM:SS
rename_as: center_time
w_StartTime:
description: Start time of the measurement
units: YYYY/MM/DD HH:MM:SS
rename_as: start_time
w_StopTime:
description: Stop time of the measurement
units: YYYY/MM/DD HH:MM:SS
rename_as: stop_time
w_I2_molec_cm3:
description: I2 concentration
units: molec cm^-3 #? was 'cm^-1'; field name w_I2_molec_cm3 suggests a number density — confirm
rename_as: i2_concentration
w_I2_SlCol:
description: I2 concentration sl #?
units: ppb #?
rename_as: i2_sl
w_I2_SlErr:
description: Uncertainty in I2 concentration sl #?
units: ppb #?
rename_as: i2_sl_uncertainty
w_I2_VMR:
description: I2 concentration vmr #?
units: ppb #?
rename_as: i2_vmr
w_I2_VMRErr:
description: Uncertainty in I2 concentration vmr
units: ppb #?
rename_as: i2_vmr_uncertainty
w_Rho:
description: Rho #?
units: ppb #?
rename_as: rho
w_RMS:
description: RMS #?
units: ppb #?
rename_as: rms

View File

@ -36,6 +36,8 @@ file_readers = {
'ACSM_TOFWARE_flags_json' : lambda x: read_jsonflag_as_dict(x),
'ACSM_TOFWARE_nas' : lambda x: read_nasa_ames_as_dict(x)}
file_readers.update({'CEDOAS_txt' : lambda x: read_txt_files_as_dict(x, instruments_dir=default_instruments_dir, work_with_copy=False)})
REGISTRY_FILE = "registry.yaml" #os.path.join(os.path.dirname(__file__), "registry.yaml")
def load_registry():

View File

@ -81,32 +81,18 @@ gas:
datetime_format: '%Y.%m.%d %H:%M:%S'
link_to_description: 'dictionaries/gas.yaml'
ACSM_TOFWARE:
table_header:
#txt:
- 't_base VaporizerTemp_C HeaterBias_V FlowRefWave FlowRate_mb FlowRate_ccs FilamentEmission_mA Detector_V AnalogInput06_V ABRefWave ABsamp ABCorrFact'
- 't_start_Buf,Chl_11000,NH4_11000,SO4_11000,NO3_11000,Org_11000,SO4_48_11000,SO4_62_11000,SO4_82_11000,SO4_81_11000,SO4_98_11000,NO3_30_11000,Org_60_11000,Org_43_11000,Org_44_11000'
#csv:
- "X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 X32 X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49 X50 X51 X52 X53 X54 X55 X56 X57 X58 X59 X60 X61 X62 X63 X64 X65 X66 X67 X68 X69 X70 X71 X72 X73 X74 X75 X76 X77 X78 X79 X80 X81 X82 X83 X84 X85 X86 X87 X88 X89 X90 X91 X92 X93 X94 X95 X96 X97 X98 X99 X100 X101 X102 X103 X104 X105 X106 X107 X108 X109 X110 X111 X112 X113 X114 X115 X116 X117 X118 X119 X120 X121 X122 X123 X124 X125 X126 X127 X128 X129 X130 X131 X132 X133 X134 X135 X136 X137 X138 X139 X140 X141 X142 X143 X144 X145 X146 X147 X148 X149 X150 X151 X152 X153 X154 X155 X156 X157 X158 X159 X160 X161 X162 X163 X164 X165 X166 X167 X168 X169 X170 X171 X172 X173 X174 X175 X176 X177 X178 X179 X180 X181 X182 X183 X184 X185 X186 X187 X188 X189 X190 X191 X192 X193 X194 X195 X196 X197 X198 X199 X200 X201 X202 X203 X204 X205 X206 X207 X208 X209 X210 X211 X212 X213 X214 X215 X216 X217 X218 X219"
- "X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 X32 X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49 X50 X51 X52 X53 X54 X55 X56 X57 X58 X59 X60 X61 X62 X63 X64 X65 X66 X67 X68 X69 X70 X71 X72 X73 X74 X75 X76 X77 X78 X79 X80 X81 X82 X83 X84 X85 X86 X87 X88 X89 X90 X91 X92 X93 X94 X95 X96 X97 X98 X99 X100 X101 X102 X103 X104 X105 X106 X107 X108 X109 X110 X111 X112 X113 X114 X115 X116 X117 X118 X119 X120 X121 X122 X123 X124 X125 X126 X127 X128 X129 X130 X131 X132 X133 X134 X135 X136 X137 X138 X139 X140 X141 X142 X143 X144 X145 X146 X147 X148 X149 X150 X151 X152 X153 X154 X155 X156 X157 X158 X159 X160 X161 X162 X163 X164 X165 X166 X167 X168 X169 X170 X171 X172 X173 X174 X175 X176 X177 X178 X179 X180 X181 X182 X183 X184 X185 X186 X187 X188 X189 X190 X191 X192 X193 X194 X195 X196 X197 X198 X199 X200 X201 X202 X203 X204 X205 X206 X207 X208 X209 X210 X211 X212 X213 X214 X215 X216 X217 X218 X219"
- 'MSS_base'
- 'tseries'
separator:
#txt:
- "\t"
- ","
#csv:
- "\t"
- "\t"
- "None"
- "None"
file_encoding:
#txt:
- "utf-8"
- "utf-8"
#csv:
- "utf-8"
- "utf-8"
- "utf-8"
- "utf-8"
CEDOAS: #CE-DOAS/I2:
formats:
- table_header: 'w_CenterTime w_StartTime w_StopTime w_I2_molec_cm3 w_I2_SlCol w_I2_SlErr w_I2_VMR w_I2_VMRErr w_Rho w_RMS'
separator: '\t'
file_encoding: 'utf-8'
timestamp: ['w_CenterTime']
datetime_format: '%Y/%m/%d %H:%M:%S'
- table_header: 'TimeStamp,Seconds_Midnight,Year,Month,Day,Hour,Minute,Second,HK0,HK1,HK2,HK3,HK4,HK5,HK6,HK7,HK8,HK9,HK10,HK11,HK12,HK13,HK14,HK15,RTD0_OO1,RTD1_LED,RTD2,RTD3_CBox,RTD4_Gas1,RTD5,RTD6,RTD7,Temp0,Temp1,Temp2,Temp3,DutyCycle0,DutyCycle1,DutyCycle2,DutyCycle3,Relay4,Relay5,Shutter0,Shutter1,Diode0Threshold,Diode0Hysteresis,Diode1Threshold,Diode1Hysteresis,SWTargetPosition,SWCurrentPosition,ELTargetPosition'
separator: ','
file_encoding: 'utf-8'
#timestamp: []
#datetime_format:
link_to_description: 'dictionaries/CEDOAS.yaml'

View File

@ -23,90 +23,6 @@ import warnings
import utils.g5505_utils as utils
def detect_table_header_line(filepath, table_header_list, encoding_list, separator_list, verbose=False):
    """
    Scan *filepath* line by line for one of the known table-header patterns.

    Args:
        filepath: Path of the text file to inspect.
        table_header_list: Candidate header substrings, one per known format.
        encoding_list: Encodings per format; only the first entry is used to decode.
        separator_list: Column separators per format (the two-character string
            '\\t' stands for a real tab).
        verbose: When True, print the detected header for debugging.

    Returns:
        Tuple of (header_line_idx, column_names, tb_idx, preamble_lines);
        (-1, [], None, preamble_lines) when no pattern matches any line.
    """
    preamble = []
    with open(filepath, 'rb') as fobj:
        for lineno, raw in enumerate(fobj):
            text = raw.decode(encoding_list[0])  # assume consistent encoding initially
            # First header pattern contained in this line, if any.
            match = next((i for i, pat in enumerate(table_header_list) if pat in text), None)
            if match is not None:
                parts = text.split(separator_list[match].replace('\\t', '\t'))
                occurrences = collections.Counter(parts)
                names = []
                for pos, part in enumerate(parts):
                    cleaned = part.strip()
                    # Prefix duplicated raw names with their position to keep them unique.
                    names.append(f"{pos}_{cleaned}" if occurrences[part] > 1 else cleaned)
                if verbose:
                    print(f"[Detected header] Line {lineno}: {names}")
                return lineno, names, match, preamble
            preamble.append(' '.join(text.split()))
    warnings.warn("Table header was not detected using known patterns. Will attempt inference mode.")
    return -1, [], None, preamble
def load_file_reader_parameters(filename: str, instruments_dir: str) -> tuple:
    """
    Resolve text-reader settings for *filename* from config_text_reader.yaml.

    The 'default' section supplies fallbacks; any instrument folder name that
    appears in the file path overrides them (later matches win, no early exit).

    Returns:
        Tuple of (config_dict, file_encoding, separator, table_header,
        timestamp_variables, datetime_format, description_dict).
    """
    config_path = os.path.abspath(os.path.join(instruments_dir, 'readers', 'config_text_reader.yaml'))
    try:
        with open(config_path, 'r') as stream:
            config_dict = yaml.load(stream, Loader=yaml.FullLoader)
    except yaml.YAMLError as exc:
        print(f"[YAML Load Error] {exc}")
        return {}, '', '', '', [], [], {}

    # Global defaults from the 'default' section.
    defaults = config_dict.get('default', {})
    file_encoding = defaults.get('file_encoding', 'utf-8')
    separator = defaults.get('separator', ',')
    table_header = defaults.get('table_header', 'infer')
    timestamp_variables = []
    datetime_format = []
    description_dict = {}

    path_parts = filename.split(os.sep)
    for folder, settings in config_dict.items():
        if folder not in path_parts:
            continue
        # Instrument-specific overrides.
        file_encoding = settings.get('file_encoding', file_encoding)
        separator = settings.get('separator', separator)
        table_header = settings.get('table_header', table_header)
        timestamp_variables = settings.get('timestamp', [])
        datetime_format = settings.get('datetime_format', [])
        link_to_description = settings.get('link_to_description', '').replace('/', os.sep)
        if link_to_description:
            desc_path = os.path.join(instruments_dir, link_to_description)
            try:
                with open(desc_path, 'r') as desc_stream:
                    description_dict = yaml.load(desc_stream, Loader=yaml.FullLoader)
            except (FileNotFoundError, yaml.YAMLError) as exc:
                print(f"[Description Load Error] {exc}")

    return (config_dict, file_encoding, separator, table_header,
            timestamp_variables, datetime_format, description_dict)
def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with_copy: bool = True):
filename = os.path.normpath(filename)
@ -116,13 +32,16 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
module_dir = os.path.dirname(__file__)
instruments_dir = os.path.join(module_dir, '..')
(config_dict,
file_encoding,
separator,
table_header,
timestamp_variables,
datetime_format,
description_dict) = load_file_reader_parameters(filename, instruments_dir)
#(config_dict,
#file_encoding,
#separator,
#table_header,
#timestamp_variables,
#datetime_format,
#description_dict) = load_file_reader_parameters(filename, instruments_dir)
format_variants, description_dict = load_file_reader_parameters(filename, instruments_dir)
# Read header as a dictionary and detect where data table starts
header_dict = {'actris_level': 0, 'processing_date':utils.created_at(), 'processing_script' : os.path.relpath(thisFilePath,dimaPath)}
@ -133,40 +52,36 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
else:
tmp_filename = filename
#with open(tmp_filename,'rb',encoding=file_encoding,errors='ignore') as f:
# Run header detection
header_line_number, column_names, fmt_dict, table_preamble = detect_table_header_line(tmp_filename, format_variants)
if not isinstance(table_header, list):
table_header = [table_header]
file_encoding = [file_encoding]
separator = [separator]
# Unpack validated format info
table_header = fmt_dict['table_header']
separator = fmt_dict['separator']
file_encoding = fmt_dict['file_encoding']
timestamp_variables = fmt_dict.get('timestamp', [])
datetime_format = fmt_dict.get('datetime_format', None)
desired_datetime_fmt = fmt_dict['desired_datetime_format']
table_preamble = []
line_number = 0
if 'infer' not in table_header:
header_line_idx, column_names, tb_idx, table_preamble = detect_table_header_line(
tmp_filename, table_header, file_encoding, separator)
# Ensure separator is valid
if not isinstance(separator, str) or not separator.strip():
raise ValueError(f"Invalid separator found in format: {repr(separator)}")
if header_line_idx == -1:
table_header = ['infer'] # fallback to pandas' inference
# TODO: it does not work with separator as none :(. fix for RGA
# Load DataFrame
try:
if not 'infer' in table_header:
df = pd.read_csv(tmp_filename,
delimiter = separator[tb_idx].replace('\\t','\t'),
header=header_line_idx,
encoding = file_encoding[tb_idx],
if 'infer' not in table_header:
df = pd.read_csv(tmp_filename,
delimiter=separator,
header=header_line_number,
encoding=file_encoding,
names=column_names,
skip_blank_lines=True)
else:
df = pd.read_csv(tmp_filename,
delimiter = separator[tb_idx].replace('\\t','\t'),
header=line_number,
encoding = file_encoding[tb_idx],
skip_blank_lines=True)
df = pd.read_csv(tmp_filename,
delimiter=separator,
header=header_line_number,
encoding=file_encoding,
skip_blank_lines=True)
df_numerical_attrs = df.select_dtypes(include ='number')
df_categorical_attrs = df.select_dtypes(exclude='number')
@ -174,6 +89,10 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
# Consolidate into single timestamp column the separate columns 'date' 'time' specified in text_data_source.yaml
if timestamp_variables:
if not all(col in df_categorical_attrs.columns for col in timestamp_variables):
raise ValueError(f"Invalid timestamp columns: {[col for col in timestamp_variables if col not in df_categorical_attrs.columns]}.")
#df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index]
#df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
@ -189,7 +108,7 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
df_categorical_attrs = df_categorical_attrs.loc[valid_indices,:]
df_numerical_attrs = df_numerical_attrs.loc[valid_indices,:]
df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamps_name].dt.strftime(config_dict['default']['desired_format'])
df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamps_name].dt.strftime(desired_datetime_fmt)
startdate = df_categorical_attrs[timestamps_name].min()
enddate = df_categorical_attrs[timestamps_name].max()
@ -202,12 +121,6 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
df_categorical_attrs = df_categorical_attrs.drop(columns = timestamp_variables)
#df_categorical_attrs.reindex(drop=True)
#df_numerical_attrs.reindex(drop=True)
categorical_variables = [item for item in df_categorical_attrs.columns]
####
#elif 'RGA' in filename:
# df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Time(s)' : 'timestamps'})
@ -282,13 +195,169 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
# if timestamps_name in categorical_variables:
# dataset['attributes'] = {timestamps_name: utils.parse_attribute({'unit':'YYYY-MM-DD HH:MM:SS.ffffff'})}
# file_dict['datasets'].append(dataset)
#except Exception as e:
except Exception as e:
#raise RuntimeError(f"Failed to read file with detected format: {e}")
print(e)
return {}
return file_dict
## Supporting functions
def detect_table_header_line(filepath, format_variants, verbose=False):
    """
    Tries multiple format variants to detect the table header line in the file.

    Args:
        filepath (str): Path to file.
        format_variants (List[Dict]): Each must contain:
            - 'file_encoding' (str)
            - 'separator' (str)
            - 'table_header' (str or list of str)
          and may additionally carry 'timestamp', 'datetime_format',
          'desired_datetime_format'.
        verbose (bool): If True, prints debug info.

    Returns:
        Tuple:
            - header_line_idx (int): line index of the header, -1 if not found
            - column_names (List[str]): de-duplicated column names
            - matched_format (Dict[str, Any]): the matching format dict, or a
              fallback dict with table_header ['infer'] when nothing matched
            - preamble_lines (List[str]): lines preceding the detected header

    Raises:
        ValueError: when a format variant is missing a required key or a key
        has the wrong type.
    """
    import collections
    import warnings

    for idx, fmt in enumerate(format_variants):
        # Validate format dict before attempting to use it.
        if 'file_encoding' not in fmt or not isinstance(fmt['file_encoding'], str):
            raise ValueError(f"[Format {idx}] 'file_encoding' must be a string.")
        if 'separator' not in fmt or not isinstance(fmt['separator'], str):
            raise ValueError(f"[Format {idx}] 'separator' must be a string.")
        if 'table_header' not in fmt or not isinstance(fmt['table_header'], (str, list)):
            raise ValueError(f"[Format {idx}] 'table_header' must be a string or list of strings.")

        encoding = fmt['file_encoding']
        separator = fmt['separator']
        header_patterns = fmt['table_header']
        if isinstance(header_patterns, str):
            header_patterns = [header_patterns]

        preamble_lines = []
        try:
            with open(filepath, 'rb') as f:
                for line_number, line in enumerate(f):
                    try:
                        decoded_line = line.decode(encoding)
                    except UnicodeDecodeError:
                        break  # Try next format
                    for pattern in header_patterns:
                        if pattern in decoded_line:
                            substrings = decoded_line.split(separator.replace('\\t', '\t'))
                            counts = collections.Counter(substrings)
                            # Prefix duplicated raw names with their position to keep them unique.
                            column_names = [
                                f"{i}_{name.strip()}" if counts[name] > 1 else name.strip()
                                for i, name in enumerate(substrings)
                            ]
                            if verbose:
                                print(f"[Detected header] Line {line_number}: {column_names}")
                            return line_number, column_names, fmt, preamble_lines
                    preamble_lines.append(' '.join(decoded_line.split()))
        except Exception as e:
            if verbose:
                print(f"[Format {idx}] Attempt failed: {e}")
            continue

    warnings.warn("Table header was not detected using known patterns. Will attempt inference mode.")
    # Fall back to pandas inference. Fix: actually retain encoding/separator
    # from the first variant (the old code hardcoded utf-8/',' despite the
    # comment) and always include 'desired_datetime_format', which callers
    # index directly and would otherwise hit a KeyError on the fallback path.
    first = format_variants[0] if format_variants else {}
    fallback_fmt = {
        'file_encoding': first.get('file_encoding', 'utf-8'),
        'separator': first.get('separator', ','),
        'table_header': ['infer'],
        'timestamp': [],
        'datetime_format': None,
        'desired_datetime_format': first.get('desired_datetime_format', '%Y-%m-%d %H:%M:%S.%f'),
    }
    return -1, [], fallback_fmt, []
def load_file_reader_parameters(filename: str, instruments_dir: str) -> tuple:
    """
    Load file reader configuration parameters based on the file and instrument directory.

    The instrument is identified by matching folder names in *filename* against
    the top-level keys of config_text_reader.yaml. Both the new multi-format
    style (a 'formats' list per instrument) and the old flat style are supported.

    Returns:
        - format_variants: List of dicts with keys:
            'file_encoding', 'separator', 'table_header', 'timestamp',
            'datetime_format', 'desired_datetime_format'
          (empty when no instrument folder matches the file path)
        - description_dict: Dict loaded from the instrument's description YAML
          (empty when no 'link_to_description' is configured or loading fails)
    """
    config_path = os.path.abspath(os.path.join(instruments_dir, 'readers', 'config_text_reader.yaml'))
    # Fix: test the *path* string — the old code passed the not-yet-defined
    # config_dict to os.path.exists, raising NameError on the fallback branch.
    if not os.path.exists(config_path):
        config_path = os.path.join(dimaPath, 'instruments', 'readers', 'config_text_reader.yaml')
    try:
        with open(config_path, 'r') as stream:
            config_dict = yaml.load(stream, Loader=yaml.FullLoader)
    except yaml.YAMLError as exc:
        print(f"[YAML Load Error] {exc}")
        # Fix: this function returns exactly two values; the old error path
        # returned three ({}, [], {}), breaking unpacking at the call site.
        return [], {}

    default_config = config_dict.get('default', {})
    default_format = {
        'file_encoding': default_config.get('file_encoding', 'utf-8'),
        'separator': default_config.get('separator', ',').replace('\\t', '\t'),
        'table_header': default_config.get('table_header', 'infer'),
        'timestamp': [],
        'datetime_format': default_config.get('datetime_format', '%Y-%m-%d %H:%M:%S.%f'),
        'desired_datetime_format': default_config.get('desired_format', '%Y-%m-%d %H:%M:%S.%f')
    }

    format_variants = []
    description_dict = {}

    # Match instrument key by folder name in file path
    filename = os.path.normpath(filename)
    for instFolder in config_dict.keys():
        if instFolder in filename.split(os.sep):
            inst_config = config_dict[instFolder]

            if 'formats' in inst_config:
                # New style: instrument declares a list of format variants.
                for fmt in inst_config['formats']:
                    format_variants.append({
                        'file_encoding': fmt.get('file_encoding', default_format['file_encoding']),
                        'separator': fmt.get('separator', default_format['separator']),
                        'table_header': fmt.get('table_header', default_format['table_header']),
                        'timestamp': fmt.get('timestamp', []),
                        # Fix: fall back through default_format — default_config
                        # has no 'desired_datetime_format' key (it is called
                        # 'desired_format' there) and may lack 'datetime_format',
                        # so indexing default_config raised KeyError.
                        'datetime_format': fmt.get('datetime_format', default_format['datetime_format']),
                        'desired_datetime_format': default_format['desired_datetime_format']
                    })
            else:
                # Old style: flat, single-format instrument entry.
                format_variants.append({
                    'file_encoding': inst_config.get('file_encoding', default_format['file_encoding']),
                    'separator': inst_config.get('separator', default_format['separator']),
                    'table_header': inst_config.get('table_header', default_format['table_header']),
                    'timestamp': inst_config.get('timestamp', []),
                    'datetime_format': inst_config.get('datetime_format', default_format['datetime_format']),
                    'desired_datetime_format': default_format['desired_datetime_format']
                })

            # Description loading
            link_to_description = inst_config.get('link_to_description', '').replace('/', os.sep)
            if link_to_description:
                desc_path = os.path.join(instruments_dir, link_to_description)
                try:
                    with open(desc_path, 'r') as desc_stream:
                        description_dict = yaml.load(desc_stream, Loader=yaml.FullLoader)
                except (FileNotFoundError, yaml.YAMLError) as exc:
                    print(f"[Description Load Error] {exc}")

            break  # Stop after first match

    # Always return the list of formats + description
    return format_variants, description_dict
if __name__ == "__main__":

View File

@ -83,3 +83,8 @@ instruments:
fileExtension: yaml,yml,json
fileReaderPath: instruments/readers/read_structured_file_as_dict.py
InstrumentDictionaryPath: instruments/dictionaries/EBAS.yaml
- instrumentFolderName: CEDOAS
fileExtension: txt
fileReaderPath: instruments/readers/g5505_text_reader.py
InstrumentDictionaryPath: instruments/dictionaries/CEDOAS.yaml