Implement new step pipelines/steps/adjust_uncertainty_column_in_nas_file.py.

2025-06-24 13:11:08 +02:00 · 2025-05-26 19:50:18 +02:00
parent b1b8b426fc
commit 08ba10dc48
2 changed files with 128 additions and 1 deletions
--- a/pipelines/steps/adjust_uncertainty_column_in_nas_file.py
+++ b/pipelines/steps/adjust_uncertainty_column_in_nas_file.py
@ -0,0 +1,126 @@
+import sys, os
+import re
+
+# Locate this script so the project root can be added to sys.path; the imports
+# that follow (the DIMA submodule and the local `pipelines` package) rely on it.
+try:
+    thisFilePath = os.path.abspath(__file__)
+    print(thisFilePath)
+except NameError:
+    print("[Notice] The __file__ attribute is unavailable in this environment (e.g., Jupyter or IDLE).")
+    print("When using a terminal, make sure the working directory is set to the script's location to prevent path issues (for the DIMA submodule)")
+    # The project root is derived below by stripping three trailing components
+    # (<root>/pipelines/steps/<script>). os.getcwd() is a directory, so append a
+    # placeholder file name; without it the result ends up one level above the root.
+    thisFilePath = os.path.join(os.getcwd(), 'adjust_uncertainty_column_in_nas_file.py')
+
+
+projectPath = os.path.normpath(os.path.join(thisFilePath, "..", "..",'..'))  # Move up to project root
+
+if projectPath not in sys.path:
+    sys.path.insert(0,projectPath)
+
+import numpy as np
+import pandas as pd
+from dima.instruments.readers.nasa_ames_reader import read_nasa_ames_as_dict
+from pipelines.steps.utils import compute_uncertainty_estimate
+
+def main(path_to_data_file, base_column_name):
+    """Rewrite the ``err_<base_column_name>`` column of a NASA Ames (.nas) file in place.
+
+    Each value of the error column is replaced by
+    ``compute_uncertainty_estimate(base_value, original_error)``, where ``base_value``
+    is the entry of ``base_column_name`` on the same data row. The new value is
+    formatted with the same number of decimals as the original text and right-justified
+    into the original field, so the other bytes of the line are left untouched. Error
+    values consisting solely of the digit 9 are treated as missing-value codes and kept.
+
+    Parameters
+    ----------
+    path_to_data_file : str
+        Path to the .nas file. The file is overwritten with the updated content.
+    base_column_name : str
+        Name of the data column whose ``err_``-prefixed companion column is updated.
+
+    Raises
+    ------
+    RuntimeError
+        If the path does not end in '.nas', or if an error value cannot be parsed
+        or updated (the underlying exception is chained as the cause).
+    ValueError
+        If the 'data_table' dataset, the base column or the error column is missing.
+    """
+
+    if not path_to_data_file.endswith('.nas'):
+        raise RuntimeError(f'Invalid file extension. The input file {path_to_data_file} must be a .nas file.')
+
+    # Read and extract data
+    idr_dict = read_nasa_ames_as_dict(path_to_data_file)
+    header_metadata_dict = idr_dict['attributes_dict']
+
+    # Locate data table
+    dataset = None
+    for d in idr_dict['datasets']:
+        if d['name'] == 'data_table':
+            dataset = d
+    if dataset is None:
+        raise ValueError("Dataset named 'data_table' not found.")
+
+    data_table = dataset['data']  # structured numpy array
+    df = pd.DataFrame(data_table)
+
+    if base_column_name not in df.columns:
+        raise ValueError(f"Base column '{base_column_name}' not found in dataset.")
+
+    err_column = f"err_{base_column_name}"
+    if err_column not in df.columns:
+        raise ValueError(f"Column '{err_column}' not found in dataset.")
+
+    # Position of the error column among the whitespace-separated fields of a line
+    err_index = data_table.dtype.names.index(err_column)
+
+    # Read original lines from file (as bytes, so untouched content is preserved exactly)
+    with open(path_to_data_file, 'rb') as file:
+        raw_lines = file.readlines()
+
+    header_length = header_metadata_dict['header_length']
+    field_pattern = re.compile(rb'\S+')
+    data_table_lines = []
+
+    # Start at the last header line (the column-name line) and walk to the end of the file
+    first_line_idx = max(header_length - 1, 0)
+    cnt = 0  # row of df that corresponds to the data line being processed
+    for line_idx, line in enumerate(raw_lines[first_line_idx:], start=first_line_idx):
+        fields = list(field_pattern.finditer(line))
+
+        if err_index >= len(fields):
+            # Blank or short line: nothing to update, but keep it so that no
+            # content of the original file is silently dropped.
+            data_table_lines.append(line)
+            continue
+
+        match = fields[err_index]
+        original_str = match.group().decode('utf-8')
+
+        # Column-name line: copy verbatim
+        if err_column in original_str:
+            data_table_lines.append(line)
+            continue
+
+        # Decimal precision of the original text
+        decimals = len(original_str.split('.')[1]) if '.' in original_str else 0
+
+        # A value made only of 9s (ignoring the decimal point) is a missing-value code
+        clean_original_str = original_str.strip().replace('.', '')
+        is_missing = bool(clean_original_str) and all(c == '9' for c in clean_original_str)
+
+        try:
+            original_err = float(original_str)
+            if is_missing:
+                # if original value is missing, then keep the same
+                updated_value = original_err
+            else:
+                additional_term = df.loc[cnt, base_column_name]
+                updated_value = compute_uncertainty_estimate(additional_term, original_err)
+        except Exception as e:
+            raise RuntimeError(f"Error calculating updated value on line {line_idx}: {e}") from e
+
+        # Preserve width and precision
+        start, end = match.span()
+        width = end - start
+        formatted_str = f"{updated_value:.{decimals}f}"
+
+        if len(formatted_str) > width:
+            # rjust never truncates: the value is written in full and the line grows
+            print(f"Warning: formatted value '{formatted_str}' too wide for field of width {width} at line {line_idx}. The line will be longer and following columns will shift.")
+
+        formatted_bytes = formatted_str.rjust(width).encode('utf-8')
+        data_table_lines.append(line[:start] + formatted_bytes + line[end:])
+        cnt += 1
+
+    # Reconstruct the file: raw header parts from the reader followed by the
+    # (column-name + data) lines collected above
+    processed_lines = (
+        header_metadata_dict['raw_header_part1'] +
+        header_metadata_dict['raw_header_part2'] +
+        header_metadata_dict['raw_header_part3'] +
+        data_table_lines
+    )
+
+    # Write updated content to file, ending every line with exactly one '\n'
+    with open(path_to_data_file, 'wb') as f:
+        for line in processed_lines:
+            decoded = line.decode('utf-8').rstrip('\n')
+            f.write((decoded + '\n').encode('utf-8'))
+
+
+if __name__ == '__main__':
+    # Script entry point: run the step on a fixed file under 'data/' (relative to
+    # the current working directory), updating the uncertainty of column 'Org'.
+    nas_file_name = 'CH0002G.20240201010000.20250521123253.aerosol_mass_spectrometer.chemistry_ACSM.pm1_non_refractory.7w.1h.CH02L_Aerodyne_ToF-ACSM_092.CH02L_Aerodyne_ToF-ACSM_PAY.lev2.nas'
+    path_to_data_file = os.path.normpath(os.path.join('data', nas_file_name))
+    main(path_to_data_file, base_column_name='Org')
--- a/pipelines/steps/utils.py
+++ b/pipelines/steps/utils.py
@ -93,7 +93,8 @@ def generate_missing_value_code(max_val, num_decimals):

    return missing_code

-
+def compute_uncertainty_estimate(x,x_err):
+    """Return sqrt((0.5*x_err)**2 + (0.5*x)**2).
+
+    Combines half of the value `x` and half of its error `x_err` in quadrature.
+    """
+    half_error = 0.5*x_err
+    half_value = 0.5*x
+    sum_of_squares = half_error**2 + half_value**2
+    return sum_of_squares**0.5


 def generate_error_dataframe(df: pd.DataFrame, datetime_var):