Modify instruments/readers/g5505_text_reader.py to include new instrument CEDOAS, which produces multi-format files. File dependencies were updated accordingly.

This commit is contained in:
2025-06-25 12:00:55 +02:00
parent 334335387e
commit e6df345578
5 changed files with 257 additions and 153 deletions

View File

@ -0,0 +1,42 @@
table_header:
w_CenterTime:
description: time between start and stop of the measurement
units: YYYY/MM/DD HH:MM:SS
rename_as: center_time
w_StartTime:
description: Start time of the measurement
units: YYYY/MM/DD HH:MM:SS
rename_as: start_time
w_StopTime:
description: Stop time of the measurement
units: YYYY/MM/DD HH:MM:SS
rename_as: stop_time
w_I2_molec_cm3:
description: I2 concentration
units: molec cm^-3 #? was 'cm^-1'; field name w_I2_molec_cm3 suggests a number density — confirm
rename_as: i2_concentration
w_I2_SlCol:
description: I2 concentration sl #?
units: ppb #?
rename_as: i2_sl
w_I2_SlErr:
description: Uncertainty in I2 concentration sl #?
units: ppb #?
rename_as: i2_sl_uncertainty
w_I2_VMR:
description: I2 concentration vmr #?
units: ppb #?
rename_as: i2_vmr
w_I2_VMRErr:
description: Uncertainty in I2 concentration vmr
units: ppb #?
rename_as: i2_vmr_uncertainty
w_Rho:
description: Rho #?
units: ppb #?
rename_as: rho
w_RMS:
description: RMS #?
units: ppb #?
rename_as: rms

View File

@ -36,6 +36,8 @@ file_readers = {
'ACSM_TOFWARE_flags_json' : lambda x: read_jsonflag_as_dict(x),
'ACSM_TOFWARE_nas' : lambda x: read_nasa_ames_as_dict(x)}
file_readers.update({'CEDOAS_txt' : lambda x: read_txt_files_as_dict(x, instruments_dir=default_instruments_dir, work_with_copy=False)})
REGISTRY_FILE = "registry.yaml" #os.path.join(os.path.dirname(__file__), "registry.yaml")
def load_registry():

View File

@ -81,32 +81,18 @@ gas:
datetime_format: '%Y.%m.%d %H:%M:%S'
link_to_description: 'dictionaries/gas.yaml'
ACSM_TOFWARE:
table_header:
#txt:
- 't_base VaporizerTemp_C HeaterBias_V FlowRefWave FlowRate_mb FlowRate_ccs FilamentEmission_mA Detector_V AnalogInput06_V ABRefWave ABsamp ABCorrFact'
- 't_start_Buf,Chl_11000,NH4_11000,SO4_11000,NO3_11000,Org_11000,SO4_48_11000,SO4_62_11000,SO4_82_11000,SO4_81_11000,SO4_98_11000,NO3_30_11000,Org_60_11000,Org_43_11000,Org_44_11000'
#csv:
- "X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 X32 X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49 X50 X51 X52 X53 X54 X55 X56 X57 X58 X59 X60 X61 X62 X63 X64 X65 X66 X67 X68 X69 X70 X71 X72 X73 X74 X75 X76 X77 X78 X79 X80 X81 X82 X83 X84 X85 X86 X87 X88 X89 X90 X91 X92 X93 X94 X95 X96 X97 X98 X99 X100 X101 X102 X103 X104 X105 X106 X107 X108 X109 X110 X111 X112 X113 X114 X115 X116 X117 X118 X119 X120 X121 X122 X123 X124 X125 X126 X127 X128 X129 X130 X131 X132 X133 X134 X135 X136 X137 X138 X139 X140 X141 X142 X143 X144 X145 X146 X147 X148 X149 X150 X151 X152 X153 X154 X155 X156 X157 X158 X159 X160 X161 X162 X163 X164 X165 X166 X167 X168 X169 X170 X171 X172 X173 X174 X175 X176 X177 X178 X179 X180 X181 X182 X183 X184 X185 X186 X187 X188 X189 X190 X191 X192 X193 X194 X195 X196 X197 X198 X199 X200 X201 X202 X203 X204 X205 X206 X207 X208 X209 X210 X211 X212 X213 X214 X215 X216 X217 X218 X219"
- "X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 X32 X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49 X50 X51 X52 X53 X54 X55 X56 X57 X58 X59 X60 X61 X62 X63 X64 X65 X66 X67 X68 X69 X70 X71 X72 X73 X74 X75 X76 X77 X78 X79 X80 X81 X82 X83 X84 X85 X86 X87 X88 X89 X90 X91 X92 X93 X94 X95 X96 X97 X98 X99 X100 X101 X102 X103 X104 X105 X106 X107 X108 X109 X110 X111 X112 X113 X114 X115 X116 X117 X118 X119 X120 X121 X122 X123 X124 X125 X126 X127 X128 X129 X130 X131 X132 X133 X134 X135 X136 X137 X138 X139 X140 X141 X142 X143 X144 X145 X146 X147 X148 X149 X150 X151 X152 X153 X154 X155 X156 X157 X158 X159 X160 X161 X162 X163 X164 X165 X166 X167 X168 X169 X170 X171 X172 X173 X174 X175 X176 X177 X178 X179 X180 X181 X182 X183 X184 X185 X186 X187 X188 X189 X190 X191 X192 X193 X194 X195 X196 X197 X198 X199 X200 X201 X202 X203 X204 X205 X206 X207 X208 X209 X210 X211 X212 X213 X214 X215 X216 X217 X218 X219"
- 'MSS_base'
- 'tseries'
separator:
#txt:
- "\t"
- ","
#csv:
- "\t"
- "\t"
- "None"
- "None"
file_encoding:
#txt:
- "utf-8"
- "utf-8"
#csv:
- "utf-8"
- "utf-8"
- "utf-8"
- "utf-8"
CEDOAS: #CE-DOAS/I2:
formats:
- table_header: 'w_CenterTime w_StartTime w_StopTime w_I2_molec_cm3 w_I2_SlCol w_I2_SlErr w_I2_VMR w_I2_VMRErr w_Rho w_RMS'
separator: '\t'
file_encoding: 'utf-8'
timestamp: ['w_CenterTime']
datetime_format: '%Y/%m/%d %H:%M:%S'
- table_header: 'TimeStamp,Seconds_Midnight,Year,Month,Day,Hour,Minute,Second,HK0,HK1,HK2,HK3,HK4,HK5,HK6,HK7,HK8,HK9,HK10,HK11,HK12,HK13,HK14,HK15,RTD0_OO1,RTD1_LED,RTD2,RTD3_CBox,RTD4_Gas1,RTD5,RTD6,RTD7,Temp0,Temp1,Temp2,Temp3,DutyCycle0,DutyCycle1,DutyCycle2,DutyCycle3,Relay4,Relay5,Shutter0,Shutter1,Diode0Threshold,Diode0Hysteresis,Diode1Threshold,Diode1Hysteresis,SWTargetPosition,SWCurrentPosition,ELTargetPosition'
separator: ','
file_encoding: 'utf-8'
#timestamp: []
#datetime_format:
link_to_description: 'dictionaries/CEDOAS.yaml'

View File

@ -23,90 +23,6 @@ import warnings
import utils.g5505_utils as utils
def detect_table_header_line(filepath, table_header_list, encoding_list, separator_list, verbose=False):
    """
    Scan *filepath* line by line for one of the known table-header patterns.

    Args:
        filepath: Path of the text file to inspect.
        table_header_list: Candidate header substrings, one per known format.
        encoding_list: Encodings per format; only the first entry is used to decode.
        separator_list: Column separators per format (the two-character string
            '\\t' stands for a real tab).
        verbose: When True, print the detected header for debugging.

    Returns:
        Tuple of (header_line_idx, column_names, tb_idx, preamble_lines);
        (-1, [], None, preamble_lines) when no pattern matches any line.
    """
    preamble = []
    with open(filepath, 'rb') as fobj:
        for lineno, raw in enumerate(fobj):
            text = raw.decode(encoding_list[0])  # assume consistent encoding initially
            # First header pattern contained in this line, if any.
            match = next((i for i, pat in enumerate(table_header_list) if pat in text), None)
            if match is not None:
                parts = text.split(separator_list[match].replace('\\t', '\t'))
                occurrences = collections.Counter(parts)
                names = []
                for pos, part in enumerate(parts):
                    cleaned = part.strip()
                    # Prefix duplicated raw names with their position to keep them unique.
                    names.append(f"{pos}_{cleaned}" if occurrences[part] > 1 else cleaned)
                if verbose:
                    print(f"[Detected header] Line {lineno}: {names}")
                return lineno, names, match, preamble
            preamble.append(' '.join(text.split()))
    warnings.warn("Table header was not detected using known patterns. Will attempt inference mode.")
    return -1, [], None, preamble
def load_file_reader_parameters(filename: str, instruments_dir: str) -> tuple:
    """
    Resolve text-reader settings for *filename* from config_text_reader.yaml.

    The 'default' section supplies fallbacks; any instrument folder name that
    appears in the file path overrides them (later matches win, no early exit).

    Returns:
        Tuple of (config_dict, file_encoding, separator, table_header,
        timestamp_variables, datetime_format, description_dict).
    """
    config_path = os.path.abspath(os.path.join(instruments_dir, 'readers', 'config_text_reader.yaml'))
    try:
        with open(config_path, 'r') as stream:
            config_dict = yaml.load(stream, Loader=yaml.FullLoader)
    except yaml.YAMLError as exc:
        print(f"[YAML Load Error] {exc}")
        return {}, '', '', '', [], [], {}

    # Global defaults from the 'default' section.
    defaults = config_dict.get('default', {})
    file_encoding = defaults.get('file_encoding', 'utf-8')
    separator = defaults.get('separator', ',')
    table_header = defaults.get('table_header', 'infer')
    timestamp_variables = []
    datetime_format = []
    description_dict = {}

    path_parts = filename.split(os.sep)
    for folder, settings in config_dict.items():
        if folder not in path_parts:
            continue
        # Instrument-specific overrides.
        file_encoding = settings.get('file_encoding', file_encoding)
        separator = settings.get('separator', separator)
        table_header = settings.get('table_header', table_header)
        timestamp_variables = settings.get('timestamp', [])
        datetime_format = settings.get('datetime_format', [])
        link_to_description = settings.get('link_to_description', '').replace('/', os.sep)
        if link_to_description:
            desc_path = os.path.join(instruments_dir, link_to_description)
            try:
                with open(desc_path, 'r') as desc_stream:
                    description_dict = yaml.load(desc_stream, Loader=yaml.FullLoader)
            except (FileNotFoundError, yaml.YAMLError) as exc:
                print(f"[Description Load Error] {exc}")

    return (config_dict, file_encoding, separator, table_header,
            timestamp_variables, datetime_format, description_dict)
def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with_copy: bool = True):
filename = os.path.normpath(filename)
@ -116,13 +32,16 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
module_dir = os.path.dirname(__file__)
instruments_dir = os.path.join(module_dir, '..')
(config_dict,
file_encoding,
separator,
table_header,
timestamp_variables,
datetime_format,
description_dict) = load_file_reader_parameters(filename, instruments_dir)
#(config_dict,
#file_encoding,
#separator,
#table_header,
#timestamp_variables,
#datetime_format,
#description_dict) = load_file_reader_parameters(filename, instruments_dir)
format_variants, description_dict = load_file_reader_parameters(filename, instruments_dir)
# Read header as a dictionary and detect where data table starts
header_dict = {'actris_level': 0, 'processing_date':utils.created_at(), 'processing_script' : os.path.relpath(thisFilePath,dimaPath)}
@ -133,40 +52,36 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
else:
tmp_filename = filename
#with open(tmp_filename,'rb',encoding=file_encoding,errors='ignore') as f:
# Run header detection
header_line_number, column_names, fmt_dict, table_preamble = detect_table_header_line(tmp_filename, format_variants)
if not isinstance(table_header, list):
table_header = [table_header]
file_encoding = [file_encoding]
separator = [separator]
# Unpack validated format info
table_header = fmt_dict['table_header']
separator = fmt_dict['separator']
file_encoding = fmt_dict['file_encoding']
timestamp_variables = fmt_dict.get('timestamp', [])
datetime_format = fmt_dict.get('datetime_format', None)
desired_datetime_fmt = fmt_dict['desired_datetime_format']
table_preamble = []
line_number = 0
if 'infer' not in table_header:
header_line_idx, column_names, tb_idx, table_preamble = detect_table_header_line(
tmp_filename, table_header, file_encoding, separator)
# Ensure separator is valid
if not isinstance(separator, str) or not separator.strip():
raise ValueError(f"Invalid separator found in format: {repr(separator)}")
if header_line_idx == -1:
table_header = ['infer'] # fallback to pandas' inference
# TODO: it does not work with separator as none :(. fix for RGA
# Load DataFrame
try:
if not 'infer' in table_header:
df = pd.read_csv(tmp_filename,
delimiter = separator[tb_idx].replace('\\t','\t'),
header=header_line_idx,
encoding = file_encoding[tb_idx],
if 'infer' not in table_header:
df = pd.read_csv(tmp_filename,
delimiter=separator,
header=header_line_number,
encoding=file_encoding,
names=column_names,
skip_blank_lines=True)
else:
df = pd.read_csv(tmp_filename,
delimiter = separator[tb_idx].replace('\\t','\t'),
header=line_number,
encoding = file_encoding[tb_idx],
skip_blank_lines=True)
df = pd.read_csv(tmp_filename,
delimiter=separator,
header=header_line_number,
encoding=file_encoding,
skip_blank_lines=True)
df_numerical_attrs = df.select_dtypes(include ='number')
df_categorical_attrs = df.select_dtypes(exclude='number')
@ -174,6 +89,10 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
# Consolidate into single timestamp column the separate columns 'date' 'time' specified in text_data_source.yaml
if timestamp_variables:
if not all(col in df_categorical_attrs.columns for col in timestamp_variables):
raise ValueError(f"Invalid timestamp columns: {[col for col in timestamp_variables if col not in df_categorical_attrs.columns]}.")
#df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index]
#df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
@ -189,7 +108,7 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
df_categorical_attrs = df_categorical_attrs.loc[valid_indices,:]
df_numerical_attrs = df_numerical_attrs.loc[valid_indices,:]
df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamps_name].dt.strftime(config_dict['default']['desired_format'])
df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamps_name].dt.strftime(desired_datetime_fmt)
startdate = df_categorical_attrs[timestamps_name].min()
enddate = df_categorical_attrs[timestamps_name].max()
@ -202,12 +121,6 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
df_categorical_attrs = df_categorical_attrs.drop(columns = timestamp_variables)
#df_categorical_attrs.reindex(drop=True)
#df_numerical_attrs.reindex(drop=True)
categorical_variables = [item for item in df_categorical_attrs.columns]
####
#elif 'RGA' in filename:
# df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Time(s)' : 'timestamps'})
@ -282,13 +195,169 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
# if timestamps_name in categorical_variables:
# dataset['attributes'] = {timestamps_name: utils.parse_attribute({'unit':'YYYY-MM-DD HH:MM:SS.ffffff'})}
# file_dict['datasets'].append(dataset)
#except Exception as e:
except Exception as e:
#raise RuntimeError(f"Failed to read file with detected format: {e}")
print(e)
return {}
return file_dict
## Supporting functions
def detect_table_header_line(filepath, format_variants, verbose=False):
    """
    Tries multiple format variants to detect the table header line in the file.

    Args:
        filepath (str): Path to file.
        format_variants (List[Dict]): Each must contain:
            - 'file_encoding' (str)
            - 'separator' (str)
            - 'table_header' (str or list of str)
          and may additionally carry 'timestamp', 'datetime_format',
          'desired_datetime_format'.
        verbose (bool): If True, prints debug info.

    Returns:
        Tuple:
            - header_line_idx (int): line index of the header, -1 if not found
            - column_names (List[str]): de-duplicated column names
            - matched_format (Dict[str, Any]): the matching format dict, or a
              fallback dict with table_header ['infer'] when nothing matched
            - preamble_lines (List[str]): lines preceding the detected header

    Raises:
        ValueError: when a format variant is missing a required key or a key
        has the wrong type.
    """
    import collections
    import warnings

    for idx, fmt in enumerate(format_variants):
        # Validate format dict before attempting to use it.
        if 'file_encoding' not in fmt or not isinstance(fmt['file_encoding'], str):
            raise ValueError(f"[Format {idx}] 'file_encoding' must be a string.")
        if 'separator' not in fmt or not isinstance(fmt['separator'], str):
            raise ValueError(f"[Format {idx}] 'separator' must be a string.")
        if 'table_header' not in fmt or not isinstance(fmt['table_header'], (str, list)):
            raise ValueError(f"[Format {idx}] 'table_header' must be a string or list of strings.")

        encoding = fmt['file_encoding']
        separator = fmt['separator']
        header_patterns = fmt['table_header']
        if isinstance(header_patterns, str):
            header_patterns = [header_patterns]

        preamble_lines = []
        try:
            with open(filepath, 'rb') as f:
                for line_number, line in enumerate(f):
                    try:
                        decoded_line = line.decode(encoding)
                    except UnicodeDecodeError:
                        break  # Try next format
                    for pattern in header_patterns:
                        if pattern in decoded_line:
                            substrings = decoded_line.split(separator.replace('\\t', '\t'))
                            counts = collections.Counter(substrings)
                            # Prefix duplicated raw names with their position to keep them unique.
                            column_names = [
                                f"{i}_{name.strip()}" if counts[name] > 1 else name.strip()
                                for i, name in enumerate(substrings)
                            ]
                            if verbose:
                                print(f"[Detected header] Line {line_number}: {column_names}")
                            return line_number, column_names, fmt, preamble_lines
                    preamble_lines.append(' '.join(decoded_line.split()))
        except Exception as e:
            if verbose:
                print(f"[Format {idx}] Attempt failed: {e}")
            continue

    warnings.warn("Table header was not detected using known patterns. Will attempt inference mode.")
    # Fall back to pandas inference. Fix: actually retain encoding/separator
    # from the first variant (the old code hardcoded utf-8/',' despite the
    # comment) and always include 'desired_datetime_format', which callers
    # index directly and would otherwise hit a KeyError on the fallback path.
    first = format_variants[0] if format_variants else {}
    fallback_fmt = {
        'file_encoding': first.get('file_encoding', 'utf-8'),
        'separator': first.get('separator', ','),
        'table_header': ['infer'],
        'timestamp': [],
        'datetime_format': None,
        'desired_datetime_format': first.get('desired_datetime_format', '%Y-%m-%d %H:%M:%S.%f'),
    }
    return -1, [], fallback_fmt, []
def load_file_reader_parameters(filename: str, instruments_dir: str) -> tuple:
    """
    Load file reader configuration parameters based on the file and instrument directory.

    The instrument is identified by matching folder names in *filename* against
    the top-level keys of config_text_reader.yaml. Both the new multi-format
    style (a 'formats' list per instrument) and the old flat style are supported.

    Returns:
        - format_variants: List of dicts with keys:
            'file_encoding', 'separator', 'table_header', 'timestamp',
            'datetime_format', 'desired_datetime_format'
          (empty when no instrument folder matches the file path)
        - description_dict: Dict loaded from the instrument's description YAML
          (empty when no 'link_to_description' is configured or loading fails)
    """
    config_path = os.path.abspath(os.path.join(instruments_dir, 'readers', 'config_text_reader.yaml'))
    # Fix: test the *path* string — the old code passed the not-yet-defined
    # config_dict to os.path.exists, raising NameError on the fallback branch.
    if not os.path.exists(config_path):
        config_path = os.path.join(dimaPath, 'instruments', 'readers', 'config_text_reader.yaml')
    try:
        with open(config_path, 'r') as stream:
            config_dict = yaml.load(stream, Loader=yaml.FullLoader)
    except yaml.YAMLError as exc:
        print(f"[YAML Load Error] {exc}")
        # Fix: this function returns exactly two values; the old error path
        # returned three ({}, [], {}), breaking unpacking at the call site.
        return [], {}

    default_config = config_dict.get('default', {})
    default_format = {
        'file_encoding': default_config.get('file_encoding', 'utf-8'),
        'separator': default_config.get('separator', ',').replace('\\t', '\t'),
        'table_header': default_config.get('table_header', 'infer'),
        'timestamp': [],
        'datetime_format': default_config.get('datetime_format', '%Y-%m-%d %H:%M:%S.%f'),
        'desired_datetime_format': default_config.get('desired_format', '%Y-%m-%d %H:%M:%S.%f')
    }

    format_variants = []
    description_dict = {}

    # Match instrument key by folder name in file path
    filename = os.path.normpath(filename)
    for instFolder in config_dict.keys():
        if instFolder in filename.split(os.sep):
            inst_config = config_dict[instFolder]

            if 'formats' in inst_config:
                # New style: instrument declares a list of format variants.
                for fmt in inst_config['formats']:
                    format_variants.append({
                        'file_encoding': fmt.get('file_encoding', default_format['file_encoding']),
                        'separator': fmt.get('separator', default_format['separator']),
                        'table_header': fmt.get('table_header', default_format['table_header']),
                        'timestamp': fmt.get('timestamp', []),
                        # Fix: fall back through default_format — default_config
                        # has no 'desired_datetime_format' key (it is called
                        # 'desired_format' there) and may lack 'datetime_format',
                        # so indexing default_config raised KeyError.
                        'datetime_format': fmt.get('datetime_format', default_format['datetime_format']),
                        'desired_datetime_format': default_format['desired_datetime_format']
                    })
            else:
                # Old style: flat, single-format instrument entry.
                format_variants.append({
                    'file_encoding': inst_config.get('file_encoding', default_format['file_encoding']),
                    'separator': inst_config.get('separator', default_format['separator']),
                    'table_header': inst_config.get('table_header', default_format['table_header']),
                    'timestamp': inst_config.get('timestamp', []),
                    'datetime_format': inst_config.get('datetime_format', default_format['datetime_format']),
                    'desired_datetime_format': default_format['desired_datetime_format']
                })

            # Description loading
            link_to_description = inst_config.get('link_to_description', '').replace('/', os.sep)
            if link_to_description:
                desc_path = os.path.join(instruments_dir, link_to_description)
                try:
                    with open(desc_path, 'r') as desc_stream:
                        description_dict = yaml.load(desc_stream, Loader=yaml.FullLoader)
                except (FileNotFoundError, yaml.YAMLError) as exc:
                    print(f"[Description Load Error] {exc}")

            break  # Stop after first match

    # Always return the list of formats + description
    return format_variants, description_dict
if __name__ == "__main__":

View File

@ -83,3 +83,8 @@ instruments:
fileExtension: yaml,yml,json
fileReaderPath: instruments/readers/read_structured_file_as_dict.py
InstrumentDictionaryPath: instruments/dictionaries/EBAS.yaml
- instrumentFolderName: CEDOAS
fileExtension: txt
fileReaderPath: instruments/readers/g5505_text_reader.py
InstrumentDictionaryPath: instruments/dictionaries/CEDOAS.yaml