Split the header into three parts and add the detected variables and variable descriptions to the attribute dictionary

@@ -23,6 +23,62 @@ import argparse
 import utils.g5505_utils as utils
 
+def split_header(header_lines):
+
+    header_lines_copy = []
+
+    for line in header_lines:
+        if isinstance(line, bytes):
+            decoded_line = line.decode('utf-8', errors='ignore').strip()
+            header_lines_copy.append(decoded_line)
+        else:
+            header_lines_copy.append(line.strip())
+
+    # Find the index where the variable descriptions start
+    var_desc_marker = "Days from the file reference point (start_time)"
+
+    try:
+        var_start_idx = header_lines_copy.index(var_desc_marker)
+    except ValueError:
+        raise Exception("Expected variable description marker not found.")
+
+    # Part 1: everything before the variable descriptions
+    part1 = header_lines[:var_start_idx]
+
+    # Part 2: variable descriptions, until the first key-value line (contains ':')
+    part2 = []
+    part3 = []
+    in_part3 = False
+
+    for line in header_lines[var_start_idx:]:
+        if not in_part3 and ':' in line.decode(encoding="utf-8"):
+            in_part3 = True  # We assume this is where key-value pairs begin
+
+        if in_part3:
+            part3.append(line)
+        else:
+            part2.append(line)
+
+    return part1, part2, part3
+
+
+def extract_var_descriptions(part2):
+
+    nvars = int(part2[1].decode(encoding='utf-8').strip())
+
+    if not sum(float(i) for i in part2[2].decode(encoding='utf-8').strip().split()) == nvars:
+        line1 = part2[1].decode(encoding='utf-8')
+        line2 = part2[2].decode(encoding='utf-8')
+        raise RuntimeError(f'Inconsistent lines. Check lines {line1} and {line2}')
+
+    descriptions = []
+    for line_idx in range(4, 4 + nvars):
+        descriptions.append(part2[line_idx])
+
+    return descriptions
+
+
 def read_nasa_ames_as_dict(filename, instruments_dir: str = None, work_with_copy: bool = True):
 
     # If instruments_dir is not provided, use the default path relative to the module directory
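
The two helpers above assume the header was read in binary mode, so each line is a bytes object, and that it follows the NASA Ames-style layout this module targets: the independent-variable line ("Days from the file reference point (start_time)") marks the start of the variable block, followed by the variable count, the scale factors, the missing-value codes, one description line per variable, and finally ':'-separated metadata pairs ending with the table header. A minimal sketch of how they could be exercised on a synthetic header (the field values below are illustrative, not taken from a real file):

# Synthetic header for illustration only; split_header and
# extract_var_descriptions are the helpers defined in the hunk above.
synthetic_header = [
    b"54 1001\n",                                           # part 1: header preamble
    b"Jane Doe; Example Institute\n",
    b"Days from the file reference point (start_time)\n",   # marker line, part 2 starts here
    b"2\n",                                                  # number of dependent variables
    b"1 1\n",                                                # scale factors, one per variable
    b"9999.99 9999.99\n",                                    # missing-value codes
    b"end_time of measurement, days from the file reference point\n",
    b"particle_number_concentration, 1/cm3\n",
    b"Startdate: 20230101000000\n",                          # part 3: ':'-separated metadata
    b"start_time end_time conc\n",                           # table header (last header line)
]

part1, part2, part3 = split_header(synthetic_header)
print(len(part1), len(part2), len(part3))      # 2 6 2
for desc in extract_var_descriptions(part2):
    print(desc.decode("utf-8").strip())        # the two variable description lines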
@@ -43,23 +99,48 @@ def read_nasa_ames_as_dict(filename, instruments_dir: str = None, work_with_copy
     description_dict = config_dict.get('table_header',{})
 
     # Read all lines once
-    with open(filename, 'r') as file:
+    with open(filename, 'rb') as file:
         lines = file.readlines()
 
     # Extract header length from the first line
     header_length = int(lines[0].split()[0])
     file_header = lines[:header_length]
 
-    # Extract start date from line 7
-    date_header = lines[6].split()
-    start_date_str = f"{date_header[0]}-{date_header[1]}-{date_header[2]}"
-    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
+    # Split header in three parts: header preamble, var descriptions, and metadata pairs
+    part1, part2, part3 = split_header(file_header)
+
+    var_descriptions = extract_var_descriptions(part2)
+
+    table_header = part3[len(part3)-1]
+    part3.remove(table_header)
+
+    for line in part3:
+        if 'Startdate:' in line.decode(encoding="utf-8"):
+            line_parts = line.decode(encoding="utf-8").split(':', 1)
+            attribute_name = line_parts[0]
+            attribute_value = line_parts[1]
+            print(attribute_name, attribute_value)
+
+            #date_header = lines[6].split()
+            # Split the string by '.'
+            #filename_parts = attribute_value.split('.')
+
+            # Extract the datetime strings
+            start_str = attribute_value.strip()
+            #end_str = filename_parts[2]
+
+            # Parse into datetime objects
+            start_date = datetime.strptime(start_str, "%Y%m%d%H%M%S")
+            #end_date = datetime.strptime(end_str, "%Y%m%d%H%M%S")
+
+            #start_date_str = f"{date_header[0]}-{date_header[1]}-{date_header[2]}"
+            #start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
 
     # Extract number of dependent variables from line 10
-    num_dep_vars = int(lines[9].split()[0])
+    #num_dep_vars = int(lines[9].split()[0])
 
     # Get variable names: start_time + vars from lines 13 to 13+num_dep_vars-1 (zero-indexed: 12 to 12+num_dep_vars)
-    vars_list = ["start_time"] + [lines[i].strip() for i in range(12, 12 + num_dep_vars)]
+    vars_list = table_header.decode(encoding="utf-8").strip().split()  #["start_time"] + [lines[i].strip() for i in range(12, 12 + num_dep_vars)]
 
     # Get the last line of the header (data column names)
     dat_head_line = lines[header_length - 1]
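
For reference, the 'Startdate:' branch added above reduces to the following standalone steps. The sample byte string is hypothetical; the "%Y%m%d%H%M%S" format string is the one used in the patch:

from datetime import datetime

line = b"Startdate: 20230101000000\n"                      # hypothetical part-3 metadata line

attribute_name, attribute_value = line.decode(encoding="utf-8").split(':', 1)
start_str = attribute_value.strip()
start_date = datetime.strptime(start_str, "%Y%m%d%H%M%S")

print(attribute_name, start_date.isoformat())              # Startdate 2023-01-01T00:00:00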
@@ -79,10 +160,13 @@ def read_nasa_ames_as_dict(filename, instruments_dir: str = None, work_with_copy
     # Create header metadata dictionary
     header_metadata_dict = {
         'header_length': header_length,
-        'start_date': start_date_str,
-        'num_dep_vars': num_dep_vars,
+        'start_date': start_str,
+        #'num_dep_vars': num_dep_vars,
         'variable_names': vars_list,
-        'raw_header': file_header
+        'variable_descriptions': var_descriptions,
+        'raw_header_part1': part1,
+        'raw_header_part2': part2,
+        'raw_header_part3': part3
     }
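
The diff does not show how read_nasa_ames_as_dict exposes header_metadata_dict to callers, so the sketch below uses a hand-written stand-in with the keys added by this commit, only to illustrate how the new variable_names and variable_descriptions entries could be paired up downstream (values are illustrative):

# Stand-in for the attribute dictionary built in the hunk above; not the
# function's actual return value.
sample_metadata = {
    'header_length': 54,
    'start_date': '20230101000000',
    'variable_names': ['start_time', 'end_time', 'conc'],
    'variable_descriptions': [
        b"end_time of measurement, days from the file reference point\n",
        b"particle_number_concentration, 1/cm3\n",
    ],
}

# start_time is the independent variable and has no entry in the description
# block, so it is skipped when pairing names with descriptions.
descriptions = [d.decode("utf-8", errors="ignore").strip()
                for d in sample_metadata['variable_descriptions']]
for name, desc in zip(sample_metadata['variable_names'][1:], descriptions):
    print(f"{name}: {desc}")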