Refactor steps to call each step's main function directly instead of running its command-line interface through subprocess

This commit is contained in:
2025-03-14 13:44:03 +01:00
parent b93cb7517b
commit 9504576fc0

View File

@ -2,28 +2,9 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"c:\\Users\\florez_j\\Documents\\GitLab\\acsmnode\\dima\n",
"c:\\ProgramData\\Anaconda3\\envs\\flaggingapp_env\\python310.zip\n",
"c:\\ProgramData\\Anaconda3\\envs\\flaggingapp_env\\DLLs\n",
"c:\\ProgramData\\Anaconda3\\envs\\flaggingapp_env\\lib\n",
"c:\\ProgramData\\Anaconda3\\envs\\flaggingapp_env\n",
"\n",
"C:\\Users\\florez_j\\AppData\\Roaming\\Python\\Python310\\site-packages\n",
"c:\\ProgramData\\Anaconda3\\envs\\flaggingapp_env\\lib\\site-packages\n",
"c:\\ProgramData\\Anaconda3\\envs\\flaggingapp_env\\lib\\site-packages\\win32\n",
"c:\\ProgramData\\Anaconda3\\envs\\flaggingapp_env\\lib\\site-packages\\win32\\lib\n",
"c:\\ProgramData\\Anaconda3\\envs\\flaggingapp_env\\lib\\site-packages\\Pythonwin\n",
"c:\\ProgramData\\Anaconda3\\envs\\flaggingapp_env\\lib\\site-packages\\setuptools\\_vendor\n"
]
}
],
"outputs": [],
"source": [
"import sys\n",
"import os\n",
@ -31,11 +12,11 @@
"\n",
"\n",
"notebook_dir = os.getcwd() # Current working directory (assumes running from notebooks/)\n",
"#project_path = os.path.normpath(os.path.join(notebook_dir, \"..\")) # Move up to project root\n",
"dima_path = os.path.normpath(os.path.join(notebook_dir, \"dima\")) # Move up to project root\n",
"project_path = os.path.normpath(os.path.join(notebook_dir, \"..\")) # Move up to project root\n",
"dima_path = os.path.normpath(os.path.join(project_path, \"dima\")) # Path to the dima subdirectory of the project root\n",
"\n",
"#if project_path not in sys.path: # Avoid duplicate entries\n",
"# sys.path.append(project_path)\n",
"if project_path not in sys.path: # Avoid duplicate entries\n",
" sys.path.append(project_path)\n",
"if dima_path not in sys.path:\n",
" sys.path.insert(0,dima_path)\n",
"#sys.path.append(os.path.join(root_dir,'dima','instruments'))\n",
@ -48,7 +29,9 @@
"\n",
"\n",
"for item in sys.path:\n",
" print(item)\n"
" print(item)\n",
"\n",
"CAMPAIGN_DATA_FILE = \"../data/collection_JFJ_2024_2025-03-17_2025-02-17.h5\""
]
},
{
@ -58,60 +41,20 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path: c:\\Users\\florez_j\\Documents\\GitLab\\acsmnode\\pipelines\\steps\\apply_calibration_factors.py\n",
"data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10.h5\n",
" dataset_name ... parent_file\n",
"0 ACSM_TOFWARE/2024/ACSM_JFJ_2024_meta.txt/data_... ... ACSM_JFJ_2024_meta.txt\n",
"1 ACSM_TOFWARE/2024/ACSM_JFJ_2024_timeseries.txt... ... ACSM_JFJ_2024_timeseries.txt\n",
"2 ACSM_TOFWARE/2024/Org_data_valid.csv/data_table ... Org_data_valid.csv\n",
"3 ACSM_TOFWARE/2024/Org_err_valid.csv/data_table ... Org_err_valid.csv\n",
"4 ACSM_TOFWARE/2024/Org_mz_valid.csv/data_table ... Org_mz_valid.csv\n",
"\n",
"[5 rows x 3 columns]\n",
"ACSM_JFJ_2024_timeseries.txt\n",
"pipelines/params/calibration_factors.yaml\n",
"Path to output directory : data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10\n",
"Processing script : pipelines\\steps\\apply_calibration_factors.py\n",
"Output directory : data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10/ACSM_TOFWARE_processed/2024\n",
"NO3_11000\n",
"SO4_11000\n",
"NH4_11000\n",
"Org_11000\n",
"Chl_11000\n",
"Org_44_11000\n",
"Org_43_11000\n",
"Org_60_11000\n",
"NO3_30_11000\n",
"SO4_98_11000\n",
"SO4_81_11000\n",
"SO4_82_11000\n",
"SO4_62_11000\n",
"SO4_48_11000\n",
"Metadata for calibrated data saved to data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10/ACSM_TOFWARE_processed/2024/data_lineage_metadata.json\n",
"Metadata for calibrated data saved to data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10/ACSM_TOFWARE_processed/2024/data_lineage_metadata.json\n",
"Calibration factors saved to data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10/ACSM_TOFWARE_processed/2024/ACSM_JFJ_2024_timeseries_calibration_factors.csv\n",
"Calibrated data saved to data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10/ACSM_TOFWARE_processed/2024/ACSM_JFJ_2024_timeseries_calibrated.csv\n",
"Data lineage saved to data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10\n",
"\n"
]
}
],
"outputs": [],
"source": [
"from pipelines.steps.apply_calibration_factors import main as run_apply_calibration_factors\n",
"\n",
"path_to_data_file = 'data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10.h5'\n",
"path_to_data_file = CAMPAIGN_DATA_FILE\n",
"path_to_calibration_file = '../pipelines/params/calibration_factors.yaml'\n",
"dataset_name = 'ACSM_TOFWARE/2024/ACSM_JFJ_2024_timeseries.txt/data_table'\n",
"path_to_calibration_file = 'pipelines/params/calibration_factors.yaml'\n",
"command = ['python', 'pipelines/steps/apply_calibration_factors.py', path_to_data_file, dataset_name, path_to_calibration_file]\n",
"status = subprocess.run(command, capture_output=True, check=True)\n",
"#command = ['python', 'pipelines/steps/apply_calibration_factors.py', path_to_data_file, dataset_name, path_to_calibration_file]\n",
"#status = subprocess.run(command, capture_output=True, check=True)\n",
"#print(status.stdout.decode())\n",
"\n",
"print(status.stdout.decode())"
"run_apply_calibration_factors(path_to_data_file,path_to_calibration_file)\n"
]
},
{
@ -121,113 +64,18 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"c:\\Users\\florez_j\\Documents\\GitLab\\acsmnode\\pipelines\\steps\\compute_automated_flags.py\n",
"data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10.h5\n",
" dataset_name ... parent_file\n",
"0 ACSM_TOFWARE/2024/ACSM_JFJ_2024_meta.txt/data_... ... ACSM_JFJ_2024_meta.txt\n",
"1 ACSM_TOFWARE/2024/ACSM_JFJ_2024_timeseries.txt... ... ACSM_JFJ_2024_timeseries.txt\n",
"2 ACSM_TOFWARE/2024/Org_data_valid.csv/data_table ... Org_data_valid.csv\n",
"3 ACSM_TOFWARE/2024/Org_err_valid.csv/data_table ... Org_err_valid.csv\n",
"4 ACSM_TOFWARE/2024/Org_mz_valid.csv/data_table ... Org_mz_valid.csv\n",
"\n",
"[5 rows x 3 columns]\n",
"pipelines/params/validity_thresholds.yaml\n",
"Path to output directory : data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10\n",
"Processing script %s: pipelines\\steps\\compute_automated_flags.py\n",
"Output directory: %s data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10/ACSM_TOFWARE_flags/2024\n",
"t_base\n",
"Diagnostic variable t_base has not defined limits in {'calibration_params': {'path_to_file': 'pipelines/params/calibration_params.yaml'}, 'validity_thresholds': {'description': 'Defines the value range of a particular variable', 'variables': {'VaporizerTemp_C': {'lower_lim': 538, 'upper_lim': 600, 'description': 'heater temperature'}, 'ABsamp': {'lower_lim': 20000, 'upper_lim': 500000}, 'FlowRate_ccs': {'lower_lim': 2.1, 'upper_lim': 2.3}, 'FilamentEmission_mA': {'lower_lim': 0.75, 'upper_lim': 1.5}}}}.\n",
"VaporizerTemp_C\n",
"HeaterBias_V\n",
"Diagnostic variable HeaterBias_V has not defined limits in {'calibration_params': {'path_to_file': 'pipelines/params/calibration_params.yaml'}, 'validity_thresholds': {'description': 'Defines the value range of a particular variable', 'variables': {'VaporizerTemp_C': {'lower_lim': 538, 'upper_lim': 600, 'description': 'heater temperature'}, 'ABsamp': {'lower_lim': 20000, 'upper_lim': 500000}, 'FlowRate_ccs': {'lower_lim': 2.1, 'upper_lim': 2.3}, 'FilamentEmission_mA': {'lower_lim': 0.75, 'upper_lim': 1.5}}}}.\n",
"FlowRefWave\n",
"Diagnostic variable FlowRefWave has not defined limits in {'calibration_params': {'path_to_file': 'pipelines/params/calibration_params.yaml'}, 'validity_thresholds': {'description': 'Defines the value range of a particular variable', 'variables': {'VaporizerTemp_C': {'lower_lim': 538, 'upper_lim': 600, 'description': 'heater temperature'}, 'ABsamp': {'lower_lim': 20000, 'upper_lim': 500000}, 'FlowRate_ccs': {'lower_lim': 2.1, 'upper_lim': 2.3}, 'FilamentEmission_mA': {'lower_lim': 0.75, 'upper_lim': 1.5}}}}.\n",
"FlowRate_mb\n",
"Diagnostic variable FlowRate_mb has not defined limits in {'calibration_params': {'path_to_file': 'pipelines/params/calibration_params.yaml'}, 'validity_thresholds': {'description': 'Defines the value range of a particular variable', 'variables': {'VaporizerTemp_C': {'lower_lim': 538, 'upper_lim': 600, 'description': 'heater temperature'}, 'ABsamp': {'lower_lim': 20000, 'upper_lim': 500000}, 'FlowRate_ccs': {'lower_lim': 2.1, 'upper_lim': 2.3}, 'FilamentEmission_mA': {'lower_lim': 0.75, 'upper_lim': 1.5}}}}.\n",
"FlowRate_ccs\n",
"FilamentEmission_mA\n",
"Detector_V\n",
"Diagnostic variable Detector_V has not defined limits in {'calibration_params': {'path_to_file': 'pipelines/params/calibration_params.yaml'}, 'validity_thresholds': {'description': 'Defines the value range of a particular variable', 'variables': {'VaporizerTemp_C': {'lower_lim': 538, 'upper_lim': 600, 'description': 'heater temperature'}, 'ABsamp': {'lower_lim': 20000, 'upper_lim': 500000}, 'FlowRate_ccs': {'lower_lim': 2.1, 'upper_lim': 2.3}, 'FilamentEmission_mA': {'lower_lim': 0.75, 'upper_lim': 1.5}}}}.\n",
"AnalogInput06_V\n",
"Diagnostic variable AnalogInput06_V has not defined limits in {'calibration_params': {'path_to_file': 'pipelines/params/calibration_params.yaml'}, 'validity_thresholds': {'description': 'Defines the value range of a particular variable', 'variables': {'VaporizerTemp_C': {'lower_lim': 538, 'upper_lim': 600, 'description': 'heater temperature'}, 'ABsamp': {'lower_lim': 20000, 'upper_lim': 500000}, 'FlowRate_ccs': {'lower_lim': 2.1, 'upper_lim': 2.3}, 'FilamentEmission_mA': {'lower_lim': 0.75, 'upper_lim': 1.5}}}}.\n",
"ABRefWave\n",
"Diagnostic variable ABRefWave has not defined limits in {'calibration_params': {'path_to_file': 'pipelines/params/calibration_params.yaml'}, 'validity_thresholds': {'description': 'Defines the value range of a particular variable', 'variables': {'VaporizerTemp_C': {'lower_lim': 538, 'upper_lim': 600, 'description': 'heater temperature'}, 'ABsamp': {'lower_lim': 20000, 'upper_lim': 500000}, 'FlowRate_ccs': {'lower_lim': 2.1, 'upper_lim': 2.3}, 'FilamentEmission_mA': {'lower_lim': 0.75, 'upper_lim': 1.5}}}}.\n",
"ABsamp\n",
"ABCorrFact\n",
"Diagnostic variable ABCorrFact has not defined limits in {'calibration_params': {'path_to_file': 'pipelines/params/calibration_params.yaml'}, 'validity_thresholds': {'description': 'Defines the value range of a particular variable', 'variables': {'VaporizerTemp_C': {'lower_lim': 538, 'upper_lim': 600, 'description': 'heater temperature'}, 'ABsamp': {'lower_lim': 20000, 'upper_lim': 500000}, 'FlowRate_ccs': {'lower_lim': 2.1, 'upper_lim': 2.3}, 'FilamentEmission_mA': {'lower_lim': 0.75, 'upper_lim': 1.5}}}}.\n",
"Metadata for calibrated data saved to data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10/ACSM_TOFWARE_flags/2024/data_lineage_metadata.json\n",
"Flags saved to data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10/ACSM_TOFWARE_flags/2024/ACSM_JFJ_2024_meta_flags.csv\n",
"Data lineage saved to data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10\n",
"\n"
]
}
],
"outputs": [],
"source": [
"path_to_data_file = 'data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10.h5'\n",
"from pipelines.steps.generate_flags import main as run_generate_flags\n",
"path_to_data_file = CAMPAIGN_DATA_FILE\n",
"dataset_name = 'ACSM_TOFWARE/2024/ACSM_JFJ_2024_meta.txt/data_table'\n",
"path_to_config_file = 'pipelines/params/validity_thresholds.yaml'\n",
"command = ['python', 'pipelines/steps/compute_automated_flags.py', path_to_data_file, dataset_name, path_to_config_file]\n",
"status = subprocess.run(command, capture_output=True, check=True)\n",
"\n",
"print(status.stdout.decode())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"[Start] Data integration :\n",
"Source: data\\collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10\n",
"Destination: data\\collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10.h5\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\florez_j\\Documents\\GitLab\\acsmnode\\dima\\instruments\\readers\\acsm_tofware_reader.py:98: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
" df = pd.read_csv(tmp_filename,\n",
"c:\\Users\\florez_j\\Documents\\GitLab\\acsmnode\\dima\\instruments\\readers\\acsm_tofware_reader.py:98: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
" df = pd.read_csv(tmp_filename,\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[=================================-------------------------------------------------------------------] 33.3% ...\n",
"Completed data transfer for instFolder: /ACSM_TOFWARE/2024\n",
"Starting data transfer from instFolder: /ACSM_TOFWARE_flags/2024\n",
"[===================================================================---------------------------------] 66.7% ...\n",
"Completed data transfer for instFolder: /ACSM_TOFWARE_flags/2024\n",
"Starting data transfer from instFolder: /ACSM_TOFWARE_processed/2024\n",
"[====================================================================================================] 100.0% ...\n",
"Completed data transfer for instFolder: /ACSM_TOFWARE_processed/2024\n",
"[End] Data integration\n"
]
}
],
"source": [
"import dima.src.hdf5_ops as dataOps \n",
"\n",
"dataManager = dataOps.HDF5DataOpsManager(path_to_data_file)\n",
"dataManager.load_file_obj()\n",
"dataManager.update_file('data/collection_JFJ_2024_LeilaS_2025-02-10_2025-02-10')\n",
"dataManager.unload_file_obj()"
"#command = ['python', 'pipelines/steps/compute_automated_flags.py', path_to_data_file, dataset_name, path_to_config_file]\n",
"#status = subprocess.run(command, capture_output=True, check=True)\n",
"#print(status.stdout.decode())\n",
"run_generate_flags(path_to_data_file, 'diagnostics')\n"
]
},
{
@ -236,43 +84,46 @@
"metadata": {},
"outputs": [],
"source": [
"path_to_data_file = 'data/collection_JFJ_2024_LeilaS_2025-02-07_2025-02-07.h5'\n",
"from pipelines.steps.generate_flags import main as run_generate_flags\n",
"path_to_data_file = CAMPAIGN_DATA_FILE\n",
"dataset_name = 'ACSM_TOFWARE/2024/ACSM_JFJ_2024_meta.txt/data_table'\n",
"path_to_calibration_file = 'pipelines/params/validity_thresholds.yaml'\n",
"command = ['python', 'pipelines/steps/apply_diagnostic_flags.py', path_to_data_file, dataset_name, path_to_calibration_file]\n",
"status = subprocess.run(command, capture_output=True, check=True)\n",
"\n",
"print(status.stdout.decode()) "
"path_to_config_file = 'pipelines/params/validity_thresholds.yaml'\n",
"#command = ['python', 'pipelines/steps/compute_automated_flags.py', path_to_data_file, dataset_name, path_to_config_file]\n",
"#status = subprocess.run(command, capture_output=True, check=True)\n",
"#print(status.stdout.decode())\n",
"run_generate_flags(path_to_data_file, 'species')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'dataManager' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[3], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mdataManager\u001b[49m\u001b[38;5;241m.\u001b[39mload_file_obj()\n\u001b[0;32m 2\u001b[0m dataManager\u001b[38;5;241m.\u001b[39mextract_and_load_dataset_metadata()\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(dataManager\u001b[38;5;241m.\u001b[39mdataset_metadata_df\u001b[38;5;241m.\u001b[39mhead())\n",
"\u001b[1;31mNameError\u001b[0m: name 'dataManager' is not defined"
]
}
],
"outputs": [],
"source": [
"import dima.src.hdf5_ops as dataOps \n",
"\n",
"dataManager = dataOps.HDF5DataOpsManager(CAMPAIGN_DATA_FILE)\n",
"dataManager.update_file('../data/collection_JFJ_2024_LeilaS_2025-02-17_2025-02-17')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataManager = dataOps.HDF5DataOpsManager(path_to_data_file)\n",
"dataManager.load_file_obj()\n",
"dataManager.extract_and_load_dataset_metadata()\n",
"print(dataManager.dataset_metadata_df.head())\n",
"dataManager.unload_file_obj()\n"
"df = dataManager.dataset_metadata_df\n",
"print(df.head(10))\n",
"dataManager.unload_file_obj()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "flaggingapp_env",
"display_name": "dash_multi_chem_env",
"language": "python",
"name": "python3"
},
@ -286,7 +137,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
"version": "3.11.9"
}
},
"nbformat": 4,