From 6eccbb50184521a5e15e7227bb37c659026eb84f Mon Sep 17 00:00:00 2001
From: Florez Ospina Juan Felipe <juan.florez-ospina@psi.ch>
Date: Mon, 3 Mar 2025 18:55:46 +0100
Subject: [PATCH] Moved get_metadata() from
 pipelines/steps/prepare_ebas_submission.py to utils.py

---
 pipelines/steps/prepare_ebas_submission.py | 88 +++++++++++++++++-----
 pipelines/steps/utils.py                   | 19 ++++-
 2 files changed, 88 insertions(+), 19 deletions(-)

diff --git a/pipelines/steps/prepare_ebas_submission.py b/pipelines/steps/prepare_ebas_submission.py
index b7448ec..41cc71d 100644
--- a/pipelines/steps/prepare_ebas_submission.py
+++ b/pipelines/steps/prepare_ebas_submission.py
@@ -1,9 +1,22 @@
+import sys, os
+
+try:
+    thisFilePath = os.path.abspath(__file__)
+    print(thisFilePath)
+except NameError:
+    print("[Notice] The __file__ attribute is unavailable in this environment (e.g., Jupyter or IDLE).")
+    print("When using a terminal, make sure the working directory is set to the script's location to prevent path issues (for the DIMA submodule)")
+    # os.getcwd() is a directory, not a file: append this script's name so the three-level ascent below still ends at the project root.
+    thisFilePath = os.path.join(os.getcwd(), "prepare_ebas_submission.py")
+
+# thisFilePath is <root>/pipelines/steps/<script>: the three '..' components strip the script name, steps/ and pipelines/.
+projectPath = os.path.normpath(os.path.join(thisFilePath, "..", "..",'..'))  # Move up to project root
+
 import argparse
-import os
 import pandas as pd
-import json
-import os
+import json, yaml
 import pandas as pd
+from utils import get_metadata
 
 def join_tables(csv_files: list):
     """
@@ -43,22 +56,22 @@ def join_tables(csv_files: list):
     return acum_df
 
     
-def get_metadata(path_to_file):
+def load_acsm_to_ebas_dict():
 
-    path, filename = os.path.split(path_to_file)
+    """Load pipelines/dictionaries/acsm_to_ebas.yaml (resolved against projectPath); return {} if it cannot be loaded or is empty."""
+    dict_file = os.path.normpath(os.path.join(projectPath,"pipelines/dictionaries/acsm_to_ebas.yaml"))
 
-    path_to_metadata = None
-    for item in os.listdir(path):
-        if 'metadata.json' in item:
-            path_to_metadata = os.path.normpath(os.path.join(path,item))
-    metadata = {}
-    if path_to_file:
-        with open(path_to_metadata,'r') as stream:
-            metadata = json.load(stream)
+    try:
+        with open(dict_file, 'r') as stream:
+            # An empty YAML document parses to None; fall back to {} so callers always receive a dict.
+            output_dict = yaml.load(stream, Loader=yaml.FullLoader) or {}
+    except Exception as e:
+        # Deliberately best-effort: report the problem and hand back an empty dictionary.
+        print(f'Error loading {dict_file}: {e}')
+        return {}
+    
+    return output_dict
     
-        metadata = metadata.get(filename,{})
-
-    return metadata
 
 if __name__ == "__main__":
     
@@ -68,8 +81,47 @@ if __name__ == "__main__":
 
     acum_df = join_tables([path1,path2])
 
-    acum_df.to_csv('data/all_table.txt',sep='\t',index=None)
+    acsm_to_ebas = load_acsm_to_ebas_dict()    
+
+    #print("Before renaming:", acum_df.columns)
+    #print("Renaming map keys:", acsm_to_ebas['renaming_map'].keys())
+
+    acum_df = acum_df.rename(columns=acsm_to_ebas['renaming_map'])
+    acum_df['ACSM_time'] = pd.to_datetime(acum_df['ACSM_time'])
+
+    reduced_set_of_vars = [key for key in acum_df.columns if 'factor' not in key]
+    print(reduced_set_of_vars)
+    acum_df.loc[:,reduced_set_of_vars].to_csv('data/JFJ_ACSM-017_2024.txt',sep='\t',index=None, date_format="%Y/%m/%d %H:%M:%S")
+
+    # Count the number of NaT (null) values
+    num_nats = acum_df['ACSM_time'].isna().sum()
+
+    # Get the total number of rows
+    total_rows = len(acum_df)
+
+    # Calculate the percentage of NaT values
+    percentage_nats = (num_nats / total_rows) * 100
+
+    print(f"Total rows: {total_rows}")
+    print(f"NaT (missing) values: {num_nats}")
+    print(f"Percentage of data loss: {percentage_nats:.2f}%")
 
     acum_df = join_tables([path3])
+    acum_df = acum_df.rename(columns=acsm_to_ebas['renaming_map'])
+    acum_df['ACSM_time'] = pd.to_datetime(acum_df['ACSM_time'])
 
-    acum_df.to_csv('data/all_table_flags.txt',sep='\t',index=None)
\ No newline at end of file
+
+    # Count the number of NaT (null) values
+    num_nats = acum_df['ACSM_time'].isna().sum()
+
+    # Get the total number of rows
+    total_rows = len(acum_df)
+
+    # Calculate the percentage of NaT values
+    percentage_nats = (num_nats / total_rows) * 100
+
+    print(f"Total rows: {total_rows}")
+    print(f"NaT (missing) values: {num_nats}")
+    print(f"Percentage of data loss: {percentage_nats:.2f}%")
+
+    acum_df.to_csv('data/JFJ_ACSM-017_FLAGS_2024.txt',sep='\t',index=None, date_format="%Y/%m/%d %H:%M:%S")
\ No newline at end of file
diff --git a/pipelines/steps/utils.py b/pipelines/steps/utils.py
index be7fd85..dec2085 100644
--- a/pipelines/steps/utils.py
+++ b/pipelines/steps/utils.py
@@ -30,4 +30,21 @@ def record_data_lineage(path_to_output_file, projectPath, metadata):
     
     print(f"Metadata for calibrated data saved to {path_to_metadata_file}")
     
-    return 0
\ No newline at end of file
+    return 0
+
+def get_metadata(path_to_file):
+    """Return the entry for `path_to_file`'s base name from a '*metadata.json' file in the same folder, or {}."""
+    path, filename = os.path.split(path_to_file)
+    path = path or os.curdir  # bare file name: os.listdir('') raises, so search the working directory
+    path_to_metadata = None
+    for item in os.listdir(path):
+        if 'metadata.json' in item:
+            # No break: when several names match, the last one listed wins (as before).
+            path_to_metadata = os.path.normpath(os.path.join(path,item))
+    metadata = {}
+    # Test the discovered metadata path, not the input path: open(None) raises TypeError when nothing matched.
+    if path_to_metadata:
+        with open(path_to_metadata,'r') as stream:
+            metadata = json.load(stream)
+        metadata = metadata.get(filename,{})
+    return metadata
\ No newline at end of file