Split the header into three parts, detect the variables, and add the variable descriptions to the attribute dictionary
This commit is contained in:
@ -23,6 +23,62 @@ import argparse
|
||||
|
||||
import utils.g5505_utils as utils
|
||||
|
||||
|
||||
def split_header(header_lines):
    """Split a file header into preamble, variable block and key-value block.

    The header is cut at two positions:

    * the line that, once stripped, equals the variable description marker
      ``"Days from the file reference point (start_time)"``;
    * the first line at or after the marker that contains a ``':'``, which
      is assumed to be where the key-value metadata pairs begin.

    Args:
        header_lines: Sequence of header lines. Each line may be ``bytes``
            or ``str``; bytes are decoded as UTF-8 (undecodable bytes are
            ignored) only for the purpose of locating the cut positions.

    Returns:
        A tuple ``(part1, part2, part3)`` holding the original, unmodified
        lines: everything before the marker, the lines from the marker up to
        (not including) the first line containing ``':'``, and all remaining
        lines. ``part3`` is empty when no line contains ``':'``.

    Raises:
        ValueError: If the variable description marker is not found.
    """
    var_desc_marker = "Days from the file reference point (start_time)"

    # Normalise every line to stripped text once, and use this copy for all
    # content checks below. Decoding again per line (as was done before)
    # failed on str input and on bytes that are not valid UTF-8.
    text_lines = [
        line.decode('utf-8', errors='ignore').strip()
        if isinstance(line, bytes)
        else line.strip()
        for line in header_lines
    ]

    try:
        var_start_idx = text_lines.index(var_desc_marker)
    except ValueError as err:
        raise ValueError(
            "Expected variable description marker not found."
        ) from err

    # Part 1: everything before the variable description marker.
    part1 = header_lines[:var_start_idx]

    # Parts 2 and 3 are separated by the first line containing ':'; if there
    # is none, every remaining line belongs to part 2.
    kv_start_idx = next(
        (
            idx
            for idx in range(var_start_idx, len(text_lines))
            if ':' in text_lines[idx]
        ),
        len(text_lines),
    )

    part2 = list(header_lines[var_start_idx:kv_start_idx])
    part3 = list(header_lines[kv_start_idx:])

    return part1, part2, part3
|
||||
|
||||
def extract_var_descriptions(part2):
    """Return the variable description lines from the header's variable block.

    The block is read positionally:

    * ``part2[1]`` holds the number of variables;
    * ``part2[2]`` holds one whitespace-separated value per variable and is
      used as a consistency check against that number;
    * ``part2[4]`` onwards hold one description line per variable.

    Args:
        part2: Sequence of lines (``bytes`` or ``str``) starting at the
            variable description marker, as produced by ``split_header``.

    Returns:
        A list with the ``nvars`` description lines, unmodified (same type
        as the input lines).

    Raises:
        IndexError: If ``part2`` has too few lines for the declared number
            of variables.
        ValueError: If the variable count line is not an integer.
        RuntimeError: If the number of values on ``part2[2]`` does not match
            the declared number of variables.
    """
    def _as_text(line):
        # Accept both raw bytes and already-decoded text.
        if isinstance(line, bytes):
            return line.decode('utf-8', errors='ignore')
        return line

    if len(part2) < 3:
        raise IndexError(
            'Variable block is too short: expected at least 3 lines, '
            f'got {len(part2)}.'
        )

    count_line = _as_text(part2[1])
    scale_line = _as_text(part2[2])

    nvars = int(count_line.strip())

    # Compare the *number* of values with the variable count. Summing the
    # values (the previous check) only works when every value equals 1.
    if len(scale_line.split()) != nvars:
        raise RuntimeError(
            f'Inconsistent lines. Check lines {count_line} and {scale_line}'
        )

    first_desc_idx = 4
    if len(part2) < first_desc_idx + nvars:
        raise IndexError(
            f'Variable block is too short: expected {nvars} description '
            f'lines starting at index {first_desc_idx}, '
            f'got {max(len(part2) - first_desc_idx, 0)}.'
        )

    return list(part2[first_desc_idx:first_desc_idx + nvars])
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def read_nasa_ames_as_dict(filename, instruments_dir: str = None, work_with_copy: bool = True):
|
||||
|
||||
# If instruments_dir is not provided, use the default path relative to the module directory
|
||||
@ -43,23 +99,48 @@ def read_nasa_ames_as_dict(filename, instruments_dir: str = None, work_with_copy
|
||||
description_dict = config_dict.get('table_header',{})
|
||||
|
||||
# Read all lines once
|
||||
with open(filename, 'r') as file:
|
||||
with open(filename, 'rb') as file:
|
||||
lines = file.readlines()
|
||||
|
||||
# Extract header length from the first line
|
||||
header_length = int(lines[0].split()[0])
|
||||
file_header = lines[:header_length]
|
||||
|
||||
# Extract start date from line 7
|
||||
date_header = lines[6].split()
|
||||
start_date_str = f"{date_header[0]}-{date_header[1]}-{date_header[2]}"
|
||||
start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
|
||||
# Split header in three parts, header preamble, var descriptions, and metadata pairs
|
||||
part1, part2, part3 = split_header(file_header)
|
||||
|
||||
var_descriptions = extract_var_descriptions(part2)
|
||||
|
||||
table_header = part3[len(part3)-1]
|
||||
part3.remove(table_header)
|
||||
|
||||
|
||||
for line in part3:
|
||||
if 'Startdate:' in line.decode(encoding = "utf-8"):
|
||||
line_parts = line.decode(encoding = "utf-8").split(':',1)
|
||||
attribute_name = line_parts[0]
|
||||
attribute_value = line_parts[1]
|
||||
print(attribute_name,attribute_value)
|
||||
#date_header = lines[6].split()
|
||||
# Split the string by '.'
|
||||
#filename_parts = attribute_value.split('.')
|
||||
|
||||
# Extract the datetime strings
|
||||
start_str = attribute_value.strip()
|
||||
#end_str = filename_parts[2]
|
||||
|
||||
# Parse into datetime objects
|
||||
start_date = datetime.strptime(start_str, "%Y%m%d%H%M%S")
|
||||
#end_date = datetime.strptime(end_str, "%Y%m%d%H%M%S")
|
||||
|
||||
#start_date_str = f"{date_header[0]}-{date_header[1]}-{date_header[2]}"
|
||||
#start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
|
||||
|
||||
# Extract number of dependent variables from line 10
|
||||
num_dep_vars = int(lines[9].split()[0])
|
||||
#num_dep_vars = int(lines[9].split()[0])
|
||||
|
||||
# Get variable names: start_time + vars from lines 13 to 13+num_dep_vars-1 (zero-indexed: 12 to 12+num_dep_vars)
|
||||
vars_list = ["start_time"] + [lines[i].strip() for i in range(12, 12 + num_dep_vars)]
|
||||
vars_list = table_header.decode(encoding="utf-8").strip().split() #["start_time"] + [lines[i].strip() for i in range(12, 12 + num_dep_vars)]
|
||||
|
||||
# Get the last line of the header (data column names)
|
||||
dat_head_line = lines[header_length - 1]
|
||||
@ -79,10 +160,13 @@ def read_nasa_ames_as_dict(filename, instruments_dir: str = None, work_with_copy
|
||||
# Create header metadata dictionary
|
||||
header_metadata_dict = {
|
||||
'header_length': header_length,
|
||||
'start_date': start_date_str,
|
||||
'num_dep_vars': num_dep_vars,
|
||||
'start_date': start_str,
|
||||
#'num_dep_vars': num_dep_vars,
|
||||
'variable_names': vars_list,
|
||||
'raw_header': file_header
|
||||
'variable_descriptions' : var_descriptions,
|
||||
'raw_header_part1': part1,
|
||||
'raw_header_part2': part2,
|
||||
'raw_header_part3': part3
|
||||
}
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user