Split header into three parts, detect variables, and add variable descriptions to the attribute dictionary

This commit is contained in:
2025-05-21 09:19:16 +02:00
parent a22532d08d
commit e4b2a4cd5a

View File

@ -23,6 +23,62 @@ import argparse
import utils.g5505_utils as utils import utils.g5505_utils as utils
def split_header(header_lines):
    """Split header lines into preamble, variable descriptions and key-value lines.

    The split points are:

    * the first line whose stripped text equals
      ``"Days from the file reference point (start_time)"`` -- start of part 2;
    * the first line at or after that marker that contains ``':'`` -- start of
      part 3 (assumed to be where the ``key: value`` metadata begins).

    Parameters
    ----------
    header_lines : list of bytes or list of str
        Header lines as read from the file. ``bytes`` lines are decoded as
        UTF-8 (undecodable bytes are ignored) only for the purpose of locating
        the split points.

    Returns
    -------
    tuple of (list, list, list)
        ``(part1, part2, part3)``. The elements are the original, unmodified
        lines from ``header_lines``. ``part3`` is empty when no line at or
        after the marker contains ``':'``.

    Raises
    ------
    Exception
        If the variable description marker line is not present.
    """
    # Decode/strip every line exactly once so that both the marker search and
    # the ':' detection work identically for bytes and str input.
    stripped_lines = [
        line.decode('utf-8', errors='ignore').strip() if isinstance(line, bytes) else line.strip()
        for line in header_lines
    ]

    # Find the index where the variable descriptions start
    var_desc_marker = "Days from the file reference point (start_time)"
    try:
        var_start_idx = stripped_lines.index(var_desc_marker)
    except ValueError as err:
        raise Exception("Expected variable description marker not found.") from err

    # Find the first key-value line (contains ':') at or after the marker;
    # fall back to the end of the header when there is none.
    kv_start_idx = next(
        (idx for idx in range(var_start_idx, len(stripped_lines)) if ':' in stripped_lines[idx]),
        len(stripped_lines),
    )

    # Part 1: everything before the variable description marker
    part1 = header_lines[:var_start_idx]
    # Part 2: variable descriptions, up to (not including) the first key-value line
    part2 = list(header_lines[var_start_idx:kv_start_idx])
    # Part 3: the key-value lines and everything after them
    part3 = list(header_lines[kv_start_idx:])

    return part1, part2, part3
def extract_var_descriptions(part2):
    """Return the variable description lines contained in header part 2.

    The expected layout of ``part2`` is:

    * ``part2[0]`` -- first line of the section (not inspected here);
    * ``part2[1]`` -- integer number of variables ``nvars``;
    * ``part2[2]`` -- whitespace-separated values, one per variable
      (presumably the NASA Ames scale factors -- confirm against the format);
    * ``part2[3]`` -- skipped (presumably the missing-value flags);
    * ``part2[4:4 + nvars]`` -- one description line per variable.

    Parameters
    ----------
    part2 : list of bytes or list of str
        Variable description section of the header.

    Returns
    -------
    list
        The ``nvars`` description lines, unmodified (same type as in ``part2``).

    Raises
    ------
    RuntimeError
        If ``part2`` is too short, or if the number of entries on ``part2[2]``
        does not match the variable count on ``part2[1]``.
    ValueError
        If ``part2[1]`` is not an integer.
    """
    def _to_text(line):
        # Accept both raw bytes lines (file opened in 'rb') and str lines.
        return line.decode('utf-8', errors='ignore') if isinstance(line, bytes) else line

    first_description_idx = 4
    if len(part2) < first_description_idx:
        raise RuntimeError(
            f'Variable description section too short: expected at least '
            f'{first_description_idx} lines, got {len(part2)}'
        )

    line1 = _to_text(part2[1])
    line2 = _to_text(part2[2])
    nvars = int(line1.strip())

    # Consistency check: there must be exactly one entry per variable. The
    # entries are counted rather than summed, so values other than 1 are fine.
    if len(line2.split()) != nvars:
        raise RuntimeError(f'Inconsistent lines. Check lines {line1} and {line2}')

    last_description_idx = first_description_idx + nvars
    if len(part2) < last_description_idx:
        raise RuntimeError(
            f'Variable description section too short: expected {nvars} '
            f'description lines, got {len(part2) - first_description_idx}'
        )

    return list(part2[first_description_idx:last_description_idx])
def read_nasa_ames_as_dict(filename, instruments_dir: str = None, work_with_copy: bool = True): def read_nasa_ames_as_dict(filename, instruments_dir: str = None, work_with_copy: bool = True):
# If instruments_dir is not provided, use the default path relative to the module directory # If instruments_dir is not provided, use the default path relative to the module directory
@ -43,23 +99,48 @@ def read_nasa_ames_as_dict(filename, instruments_dir: str = None, work_with_copy
description_dict = config_dict.get('table_header',{}) description_dict = config_dict.get('table_header',{})
# Read all lines once # Read all lines once
with open(filename, 'r') as file: with open(filename, 'rb') as file:
lines = file.readlines() lines = file.readlines()
# Extract header length from the first line # Extract header length from the first line
header_length = int(lines[0].split()[0]) header_length = int(lines[0].split()[0])
file_header = lines[:header_length] file_header = lines[:header_length]
# Extract start date from line 7 # Split header in three parts, header preamble, var descriptions, and metadata pairs
date_header = lines[6].split() part1, part2, part3 = split_header(file_header)
start_date_str = f"{date_header[0]}-{date_header[1]}-{date_header[2]}"
start_date = datetime.strptime(start_date_str, "%Y-%m-%d") var_descriptions = extract_var_descriptions(part2)
table_header = part3[len(part3)-1]
part3.remove(table_header)
for line in part3:
if 'Startdate:' in line.decode(encoding = "utf-8"):
line_parts = line.decode(encoding = "utf-8").split(':',1)
attribute_name = line_parts[0]
attribute_value = line_parts[1]
print(attribute_name,attribute_value)
#date_header = lines[6].split()
# Split the string by '.'
#filename_parts = attribute_value.split('.')
# Extract the datetime strings
start_str = attribute_value.strip()
#end_str = filename_parts[2]
# Parse into datetime objects
start_date = datetime.strptime(start_str, "%Y%m%d%H%M%S")
#end_date = datetime.strptime(end_str, "%Y%m%d%H%M%S")
#start_date_str = f"{date_header[0]}-{date_header[1]}-{date_header[2]}"
#start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
# Extract number of dependent variables from line 10 # Extract number of dependent variables from line 10
num_dep_vars = int(lines[9].split()[0]) #num_dep_vars = int(lines[9].split()[0])
# Get variable names: start_time + vars from lines 13 to 13+num_dep_vars-1 (zero-indexed: 12 to 12+num_dep_vars) # Get variable names: start_time + vars from lines 13 to 13+num_dep_vars-1 (zero-indexed: 12 to 12+num_dep_vars)
vars_list = ["start_time"] + [lines[i].strip() for i in range(12, 12 + num_dep_vars)] vars_list = table_header.decode(encoding="utf-8").strip().split() #["start_time"] + [lines[i].strip() for i in range(12, 12 + num_dep_vars)]
# Get the last line of the header (data column names) # Get the last line of the header (data column names)
dat_head_line = lines[header_length - 1] dat_head_line = lines[header_length - 1]
@ -79,10 +160,13 @@ def read_nasa_ames_as_dict(filename, instruments_dir: str = None, work_with_copy
# Create header metadata dictionary # Create header metadata dictionary
header_metadata_dict = { header_metadata_dict = {
'header_length': header_length, 'header_length': header_length,
'start_date': start_date_str, 'start_date': start_str,
'num_dep_vars': num_dep_vars, #'num_dep_vars': num_dep_vars,
'variable_names': vars_list, 'variable_names': vars_list,
'raw_header': file_header 'variable_descriptions' : var_descriptions,
'raw_header_part1': part1,
'raw_header_part2': part2,
'raw_header_part3': part3
} }