Mirror of https://gitea.psi.ch/APOG/acsm-fairifier.git (synced 2025-07-12 18:31:49 +02:00)
Extend pipelines/steps/adjust_uncertainty_column_in_nas_file.py to handle a list of variables.
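A minimal invocation sketch, assuming the pipelines package is importable and using hypothetical file and column names; the entry point and its argument order are as shown in the diff below:

from pipelines.steps.adjust_uncertainty_column_in_nas_file import main

# Hypothetical .nas file and base column names; for each name, the matching
# 'err_<name>' column in the data table is adjusted.
main('data/example_acsm_timeseries.nas', ['Org', 'SO4', 'NO3', 'NH4', 'Chl'])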
pipelines/steps/adjust_uncertainty_column_in_nas_file.py
@@ -21,7 +21,29 @@ import pandas as pd
 from dima.instruments.readers.nasa_ames_reader import read_nasa_ames_as_dict
 from pipelines.steps.utils import compute_uncertainty_estimate
 
-def main(path_to_data_file, base_column_name):
+def main(path_to_data_file, base_column_names : list):
 
+    """Adjust the error or uncertainty columns of data table, where data table is available
+    in input nas file and by specifying a list of columns to adjust.
+
+    Parameters:
+    ------------
+    path_to_data_file (str) : Path to nas file
+    base_column_names (list) : list of column names
+
+    Raises
+    ------
+    RuntimeError
+        _description_
+    ValueError
+        _description_
+    ValueError
+        _description_
+    ValueError
+        _description_
+    RuntimeError
+        _description_
+    """
+
 
     if not path_to_data_file.endswith('.nas'):
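For context, a sketch of the column pairing the step relies on; the err_ prefix convention comes from the code further down, while the concrete names and values here are hypothetical:

import pandas as pd

# Each base column <name> is expected to have a matching 'err_<name>' column
# in the .nas data table; the err_ columns are the ones rewritten by this step.
df = pd.DataFrame({
    'Org':     [2.3, 2.5],   # hypothetical base column
    'err_Org': [0.4, 0.5],   # its uncertainty column
})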
@@ -42,8 +64,25 @@ def main(path_to_data_file, base_column_name):
     data_table = dataset['data'] # structured numpy array
     df = pd.DataFrame(data_table)
 
-    if base_column_name not in df.columns:
-        raise ValueError(f"Base column '{base_column_name}' not found in dataset.")
+    if any(col not in df.columns for col in base_column_names):
+        raise ValueError(f"Base column '{col}' not found in dataset.")
+
+    # filter out columns with name starting in 'err_'
+    base_column_names_cleaned = [col for col in base_column_names if not col.startswith('err_')]
+
+
+    # Read original lines from file
+    with open(path_to_data_file, 'rb') as file:
+        raw_lines = file.readlines()
+
+    header_length = header_metadata_dict['header_length']
+
+
+
+    for col in base_column_names_cleaned:
+
+        data_table_lines = []
+        base_column_name = col
 
     err_column = f"err_{base_column_name}"
     if err_column not in df.columns:
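One caveat in the check above: `col` is only bound inside the any() generator expression, so the f-string in the raise will most likely fail with a NameError rather than report the missing column. A sketch of an equivalent check that keeps the offending names in scope (same df and base_column_names as in main()):

missing = [col for col in base_column_names if col not in df.columns]
if missing:
    raise ValueError(f"Base column(s) {missing} not found in dataset.")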
@@ -52,14 +91,6 @@ def main(path_to_data_file, base_column_name):
     # Apply callback to base column
-
     err_index = data_table.dtype.names.index(err_column)
-
-    # Read original lines from file
-    with open(path_to_data_file, 'rb') as file:
-        raw_lines = file.readlines()
-
-    header_length = header_metadata_dict['header_length']
-    data_table_lines = []
 
     # Iterate through data table lines
     cnt = 0
     for line_idx in range(len(raw_lines)):
@@ -103,6 +134,9 @@ def main(path_to_data_file, base_column_name):
         new_line = line[:start] + formatted_bytes + line[end:]
         data_table_lines.append(new_line)
         cnt += 1
+    # update raw lines
+    for line_idx in range(header_length - 1, len(raw_lines)):
+        raw_lines[line_idx] = data_table_lines[line_idx - header_length + 1]
 
     # Reconstruct the file
     processed_lines = (
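A minimal sketch of the index arithmetic in the '# update raw lines' loop, assuming header_length is defined so that the data section starts at raw_lines[header_length - 1]; all values below are hypothetical:

header_length = 3
raw_lines = [b'header line 1\n', b'header line 2\n', b'old data 0\n', b'old data 1\n']
data_table_lines = [b'new data 0\n', b'new data 1\n']

for line_idx in range(header_length - 1, len(raw_lines)):
    # line_idx = 2 -> data_table_lines[0], line_idx = 3 -> data_table_lines[1]
    raw_lines[line_idx] = data_table_lines[line_idx - header_length + 1]

print(raw_lines)  # the two data lines are replaced, the header lines are untouched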
pipelines/steps/utils.py
@@ -93,8 +93,28 @@ def generate_missing_value_code(max_val, num_decimals):
 
     return missing_code
 
+import math
+import numpy as np
+
 def compute_uncertainty_estimate(x, x_err):
-    return ((0.5*x_err)**2+(0.5*x)**2)**0.5
+    """
+    Computes uncertainty estimate: sqrt((0.5 * x_err)^2 + (0.5 * x)^2)
+    for scalar inputs. Prints errors if inputs are invalid.
+    """
+    try:
+        x = float(x)
+        x_err = float(x_err)
+
+        if math.isnan(x) or math.isnan(x_err):
+            print(f"Warning: One or both inputs are NaN -> x: {x}, x_err: {x_err}")
+            return np.nan
+
+        return math.sqrt((0.5 * x_err)**2 + (0.5 * x)**2)
+
+    except (ValueError, TypeError) as e:
+        print(f"Error computing uncertainty for x: {x}, x_err: {x_err} -> {e}")
+        return np.nan
+
 
 
 def generate_error_dataframe(df: pd.DataFrame, datetime_var):
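A few quick usage checks for the refactored helper, assuming pipelines.steps.utils exposes it exactly as in the hunk above; the input values are made up:

from pipelines.steps.utils import compute_uncertainty_estimate

print(compute_uncertainty_estimate(2.0, 0.4))           # sqrt(0.2**2 + 1.0**2) ~= 1.0198
print(compute_uncertainty_estimate(float('nan'), 0.4))  # warns about NaN input, returns nan
print(compute_uncertainty_estimate('bad', 0.4))         # float() raises, error is printed, nan returned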