Split header into three parts; detect variable names and variable descriptions and add them to the attribute dictionary

2025-05-21 09:19:16 +02:00
parent a22532d08d
commit e4b2a4cd5a
1 changed files with 94 additions and 10 deletions
--- a/instruments/readers/nasa_ames_reader.py
+++ b/instruments/readers/nasa_ames_reader.py
@ -23,6 +23,62 @@ import argparse
 import utils.g5505_utils as utils
 def split_header(header_lines):
    """Split header lines into preamble, variable block and key-value block.

    Parameters
    ----------
    header_lines : list of bytes or str
        Header lines as read from the file. Both ``bytes`` and ``str``
        items are accepted.

    Returns
    -------
    tuple
        ``(part1, part2, part3)`` holding the caller's original, undecoded
        lines:

        * ``part1`` -- every line before the variable description marker.
        * ``part2`` -- the marker line and everything after it, up to (not
          including) the first line that contains ``':'``.
        * ``part3`` -- that first ``':'`` line and all remaining lines.

    Raises
    ------
    ValueError
        If the variable description marker line is not present.
    """
    # Decode and strip once. This text drives every decision below, so bytes
    # and str inputs take the same path and undecodable bytes cannot raise
    # later on while scanning for ':'.
    normalized = [
        line.decode('utf-8', errors='ignore').strip() if isinstance(line, bytes) else line.strip()
        for line in header_lines
    ]
    # Find the index where the variable descriptions start
    var_desc_marker = "Days from the file reference point (start_time)"
    try:
        var_start_idx = normalized.index(var_desc_marker)
    except ValueError as err:
        # ValueError is a subclass of Exception, so existing handlers still match.
        raise ValueError("Expected variable description marker not found.") from err
    # The first line containing ':' at or after the marker is assumed to be
    # the start of the key-value pairs; without one, part3 is empty.
    kv_start_idx = next(
        (idx for idx in range(var_start_idx, len(normalized)) if ':' in normalized[idx]),
        len(normalized),
    )
    # Part 1: everything before the variable description marker
    part1 = header_lines[:var_start_idx]
    # Part 2: marker line plus variable descriptions
    part2 = list(header_lines[var_start_idx:kv_start_idx])
    # Part 3: key-value lines and whatever follows them
    part3 = list(header_lines[kv_start_idx:])
    return part1, part2, part3
 def extract_var_descriptions(part2):
    """Return the variable description lines from the header's variable block.

    Layout read by this function (0-based indices into ``part2``):
    index 1 holds the variable count, index 2 holds whitespace-separated
    numbers that are checked against that count, and the descriptions are
    the ``count`` lines starting at index 4.

    Parameters
    ----------
    part2 : list of bytes or str
        Variable block of the header; both ``bytes`` and ``str`` lines are
        accepted.

    Returns
    -------
    list
        The description lines, unmodified (same type as the input lines).

    Raises
    ------
    RuntimeError
        If the numbers on line index 2 do not sum to the variable count.
    IndexError
        If ``part2`` has fewer lines than the layout above requires.
    """
    def _text(line):
        # Decode bytes; pass str through unchanged.
        return line.decode(encoding='utf-8') if isinstance(line, bytes) else line

    if len(part2) < 3:
        raise IndexError(
            f'Variable block too short: expected at least 3 lines, got {len(part2)}'
        )
    # Decode each line once and reuse it for parsing and for the error message.
    line1 = _text(part2[1])
    line2 = _text(part2[2])
    nvars = int(line1.strip())
    # NOTE(review): this compares the SUM of the values on line index 2 with
    # the variable count, which matches the number of values only when every
    # value equals 1 -- confirm this is the intended consistency check.
    if not sum(float(i) for i in line2.strip().split()) == nvars:
        raise RuntimeError(f'Inconsistent lines. Check lines {line1} and {line2}')
    stop_idx = 4 + nvars
    if len(part2) < stop_idx:
        raise IndexError(
            f'Variable block too short: expected {nvars} description lines '
            f'starting at index 4, but only {len(part2)} lines are available'
        )
    return [part2[line_idx] for line_idx in range(4, stop_idx)]
 def read_nasa_ames_as_dict(filename, instruments_dir: str = None, work_with_copy: bool = True):
    # If instruments_dir is not provided, use the default path relative to the module directory
@ -43,23 +99,48 @@ def read_nasa_ames_as_dict(filename, instruments_dir: str = None, work_with_copy
    description_dict = config_dict.get('table_header',{})
    # Read all lines once
-    with open(filename, 'r') as file:
+    with open(filename, 'rb') as file:
        lines = file.readlines()
    # Extract header length from the first line
    header_length = int(lines[0].split()[0])
    file_header = lines[:header_length]
-    # Extract start date from line 7
+    # Split header in three parts, header preamble, var descriptions, and metadata pairs 
-    date_header = lines[6].split()
+    part1, part2, part3 = split_header(file_header)
-    start_date_str = f"{date_header[0]}-{date_header[1]}-{date_header[2]}"
+
-    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
+    var_descriptions = extract_var_descriptions(part2)
    table_header = part3[len(part3)-1]
    part3.remove(table_header)
    for line in part3:
        if 'Startdate:' in line.decode(encoding = "utf-8"):
            line_parts = line.decode(encoding = "utf-8").split(':',1)
            attribute_name = line_parts[0]
            attribute_value = line_parts[1]
            print(attribute_name,attribute_value)
            #date_header = lines[6].split()
            # Split the string by '.'
            #filename_parts = attribute_value.split('.')
            # Extract the datetime strings
            start_str = attribute_value.strip()
            #end_str = filename_parts[2]
            # Parse into datetime objects
            start_date = datetime.strptime(start_str, "%Y%m%d%H%M%S")
            #end_date = datetime.strptime(end_str, "%Y%m%d%H%M%S")
    #start_date_str = f"{date_header[0]}-{date_header[1]}-{date_header[2]}"
    #start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
    # Extract number of dependent variables from line 10
-    num_dep_vars = int(lines[9].split()[0])
+    #num_dep_vars = int(lines[9].split()[0])
    # Get variable names: start_time + vars from lines 13 to 13+num_dep_vars-1 (zero-indexed: 12 to 12+num_dep_vars)
-    vars_list = ["start_time"] + [lines[i].strip() for i in range(12, 12 + num_dep_vars)]
+    vars_list = table_header.decode(encoding="utf-8").strip().split() #["start_time"] + [lines[i].strip() for i in range(12, 12 + num_dep_vars)]
    # Get the last line of the header (data column names)
    dat_head_line = lines[header_length - 1]
@ -79,10 +160,13 @@ def read_nasa_ames_as_dict(filename, instruments_dir: str = None, work_with_copy
        # Create header metadata dictionary
        header_metadata_dict = {
            'header_length': header_length,
-            'start_date': start_date_str,
+            'start_date': start_str,
-            'num_dep_vars': num_dep_vars,
+            #'num_dep_vars': num_dep_vars,
            'variable_names': vars_list,
-            'raw_header': file_header
+            'variable_descriptions' : var_descriptions,
            'raw_header_part1': part1,
            'raw_header_part2': part2,
            'raw_header_part3': part3
        }