Extend pipelines/steps/adjust_uncertainty_column_in_nas_file.py to handle a list of variables.

2025-06-24 21:21:08 +02:00 · 2025-05-27 09:53:12 +02:00
parent 38fe2b8774
commit f3f830487e
2 changed files with 103 additions and 49 deletions
--- a/pipelines/steps/adjust_uncertainty_column_in_nas_file.py
+++ b/pipelines/steps/adjust_uncertainty_column_in_nas_file.py
@ -21,7 +21,29 @@ import pandas as pd
 from dima.instruments.readers.nasa_ames_reader import read_nasa_ames_as_dict
 from pipelines.steps.utils import compute_uncertainty_estimate

-def main(path_to_data_file, base_column_name):
+def main(path_to_data_file, base_column_names : list):
+
+    """Adjust the error or uncertainty columns of data table, where data table is available
+     in the input nas file, for each base column given in the list of columns to adjust.
+
+    Parameters:
+    ------------
+    path_to_data_file (str) : Path to nas file
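+    base_column_names (list) : list of base column names to adjust; each is checked against the data table columns, and names starting with 'err_' are skipped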
+
+    Raises
+    ------
+    RuntimeError
+        _description_
+    ValueError
+        _description_
+    ValueError
+        _description_
+    ValueError
+        _description_
+    RuntimeError
+        _description_
+    """


    if not path_to_data_file.endswith('.nas'):
@ -42,8 +64,25 @@ def main(path_to_data_file, base_column_name):
    data_table = dataset['data']  # structured numpy array
    df = pd.DataFrame(data_table)

-    if base_column_name not in df.columns:
-        raise ValueError(f"Base column '{base_column_name}' not found in dataset.")
+    if any(col not in df.columns for col in base_column_names):
+        raise ValueError(f"Base column '{col}' not found in dataset.")
+    
+    # filter out columns whose name starts with 'err_'
+    base_column_names_cleaned = [col for col in base_column_names if not col.startswith('err_')]
+
+
+    # Read original lines from file
+    with open(path_to_data_file, 'rb') as file:
+        raw_lines = file.readlines()
+
+    header_length = header_metadata_dict['header_length']
+    
+    
+
+    for col in base_column_names_cleaned:
+        
+        data_table_lines = []
+        base_column_name = col

        err_column = f"err_{base_column_name}"
        if err_column not in df.columns:
@ -52,14 +91,6 @@ def main(path_to_data_file, base_column_name):
        # Apply callback to base column

        err_index = data_table.dtype.names.index(err_column)
-
-    # Read original lines from file
-    with open(path_to_data_file, 'rb') as file:
-        raw_lines = file.readlines()
-
-    header_length = header_metadata_dict['header_length']
-    data_table_lines = []
-
        # Iterate through data table lines
        cnt = 0
        for line_idx in range(len(raw_lines)):
@ -103,6 +134,9 @@ def main(path_to_data_file, base_column_name):
                    new_line = line[:start] + formatted_bytes + line[end:]
                    data_table_lines.append(new_line)
                    cnt += 1
+        # update raw lines
+        for line_idx in range(header_length - 1, len(raw_lines)):
+            raw_lines[line_idx] = data_table_lines[line_idx - header_length + 1]        

    # Reconstruct the file
    processed_lines = (
--- a/pipelines/steps/utils.py
+++ b/pipelines/steps/utils.py
@ -93,8 +93,28 @@ def generate_missing_value_code(max_val, num_decimals):

    return missing_code

+import math
+import numpy as np
+
 def compute_uncertainty_estimate(x, x_err):
-    return ((0.5*x_err)**2+(0.5*x)**2)**0.5
+    """
+    Computes the uncertainty estimate sqrt((0.5 * x_err)^2 + (0.5 * x)^2)
+    for scalar inputs. Prints a message and returns NaN if either input is NaN
+    or cannot be converted to float.
+    """
+    try:
+        x = float(x)
+        x_err = float(x_err)
+
+        if math.isnan(x) or math.isnan(x_err):
+            print(f"Warning: One or both inputs are NaN -> x: {x}, x_err: {x_err}")
+            return np.nan
+
+        return math.sqrt((0.5 * x_err)**2 + (0.5 * x)**2)
+
+    except (ValueError, TypeError) as e:
+        print(f"Error computing uncertainty for x: {x}, x_err: {x_err} -> {e}")
+        return np.nan
+


 def generate_error_dataframe(df: pd.DataFrame, datetime_var):