From 0c1c0d04dac5dae451245a5a7ab04a1e2c8a1162 Mon Sep 17 00:00:00 2001
From: Florez Ospina Juan Felipe
Date: Fri, 16 Aug 2024 10:07:28 +0200
Subject: [PATCH] Added data integration workflow and related config file for
 acsm data integration.

---
 dima_config.yaml                |  20 ++++
 workflow_data_integration.ipynb | 163 ++++++++++++++++++++++++++++++++
 2 files changed, 183 insertions(+)
 create mode 100644 dima_config.yaml
 create mode 100644 workflow_data_integration.ipynb

diff --git a/dima_config.yaml b/dima_config.yaml
new file mode 100644
index 0000000..b652cbc
--- /dev/null
+++ b/dima_config.yaml
@@ -0,0 +1,20 @@
+input_file_directory: 'raw_datafolder/'
+output_file_directory: 'data_products/'
+
+project: 'Building FAIR data chains for atmospheric observations in the ACTRIS Switzerland Network'
+contact: 'NoraN'
+group_id: 'APOG'
+
+experiment: 'acsm_campaign' # beamtime, smog_chamber, lab_experiment
+dataset_startdate: '2024-01-01'
+dataset_enddate: '2024-02-29'
+actris_level: '1'
+
+
+
+instrument_datafolder:
+  - 'ACSM_TOFWARE'
+
+integration_mode: 'collection'
+# Specify datetimes (YYYY-MM-DD HH-MM-SS) at which experimental steps were created.
+datetime_steps: []
\ No newline at end of file
diff --git a/workflow_data_integration.ipynb b/workflow_data_integration.ipynb
new file mode 100644
index 0000000..9c4f775
--- /dev/null
+++ b/workflow_data_integration.ipynb
@@ -0,0 +1,163 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data integration workflow of an experimental campaign\n",
+    "\n",
+    "In this notebook, we will go through our data integration workflow. This involves the following steps:\n",
+    "\n",
+    "1. Specify the data integration task through a YAML configuration file.\n",
+    "2. Create an integrated HDF5 file of the experimental campaign from the configuration file.\n",
+    "3. Display the created HDF5 file using a treemap.\n",
+    "\n",
+    "## Import libraries and modules\n",
+    "\n",
+    "* Execute (or Run) the cell below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "# Set up project root directory\n",
+    "root_dir = os.path.abspath(os.curdir)\n",
+    "sys.path.append(root_dir)\n",
+    "sys.path.append(os.path.join(root_dir, 'dima'))\n",
+    "\n",
+    "import dima.src.hdf5_vis as hdf5_vis\n",
+    "import dima.src.data_integration_lib as dilib\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 1: Specify the data integration task through a YAML configuration file\n",
+    "\n",
+    "* Create your configuration file (i.e., a *.yaml file) adhering to the example YAML file in the input folder.\n",
+    "* Set up the input and output directory paths and execute the cell.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "yaml_config_file_path = 'dima_config.yaml'\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 2: Create an integrated HDF5 file of the experimental campaign\n",
+    "\n",
+    "* Execute the cell. Here we run the function `integrate_data_sources` with the previously specified YAML config file as its input argument."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "hdf5_file_path = dilib.integrate_data_sources(yaml_config_file_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 3: Display the integrated HDF5 file using a treemap\n",
+    "\n",
+    "* Execute the cell. A visual representation of the integrated file in HTML format should be displayed and stored in the output directory."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if isinstance(hdf5_file_path, list):\n",
+    "    for path_item in hdf5_file_path:\n",
+    "        hdf5_vis.display_group_hierarchy_on_a_treemap(path_item)\n",
+    "else:\n",
+    "    hdf5_vis.display_group_hierarchy_on_a_treemap(hdf5_file_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Append annotations and reformat datetime columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import dima.src.metadata_review_lib as metadata\n",
+    "\n",
+    "import dima.src.hdf5_data_extraction as h5de\n",
+    "\n",
+    "channels1 = ['Chl_11000','NH4_11000','SO4_11000','NO3_11000','Org_11000']\n",
+    "channels2 = ['FilamentEmission_mA','VaporizerTemp_C','FlowRate_mb','ABsamp']\n",
+    "\n",
+    "target_channels = {'location': 'ACSM_TOFWARE/ACSM_JFJ_2024_JantoFeb_timeseries.txt/data_table',\n",
+    "                   'names': ','.join(['t_start_Buf','Chl_11000','NH4_11000','SO4_11000','NO3_11000','Org_11000'])\n",
+    "                   }\n",
+    "diagnostic_channels = {'location': 'ACSM_TOFWARE/ACSM_JFJ_2024_JantoFeb_meta.txt/data_table',\n",
+    "                       'names': ','.join(['t_base','FilamentEmission_mA','VaporizerTemp_C','FlowRate_mb','ABsamp'])}\n",
+    "\n",
+    "DataOpsAPI = h5de.HDF5DataOpsManager(hdf5_file_path)\n",
+    "\n",
+    "DataOpsAPI.append_annotations('/', {'target_channels': target_channels, 'diagnostic_channels': diagnostic_channels})\n",
+    "\n",
+    "DataOpsAPI.reformat_datetime_column('ACSM_TOFWARE/ACSM_JFJ_2024_JantoFeb_timeseries.txt/data_table', 't_start_Buf', src_format='%d.%m.%Y %H:%M:%S.%f')\n",
+    "DataOpsAPI.reformat_datetime_column('ACSM_TOFWARE/ACSM_JFJ_2024_JantoFeb_meta.txt/data_table', 't_base', src_format='%d.%m.%Y %H:%M:%S')\n",
+    "\n",
+    "DataOpsAPI.close_file()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "multiphase_chemistry_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
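
Note for reviewers: the sketch below is a minimal, hypothetical way to spot-check the integrated HDF5 file produced by `integrate_data_sources`. It is not part of the patch; it assumes `h5py` is available, that the output file name shown is illustrative only, and that `append_annotations('/', ...)` stores its entries as attributes of the HDF5 root group.

import h5py

# Hypothetical output path; the actual file name is generated by integrate_data_sources.
hdf5_file_path = 'data_products/collection_acsm_campaign.h5'

with h5py.File(hdf5_file_path, 'r') as f:
    # List the instrument folders integrated at the root level (e.g., ACSM_TOFWARE).
    print('Top-level groups:', list(f.keys()))
    # Assumption: annotations appended at '/' end up as root-group attributes.
    for key, value in f.attrs.items():
        print(f'{key}: {value}')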