commit 618ac2b917ba4c8f168b6b1fd646e555fe2aae53 Author: stafie_c Date: Tue Oct 7 22:57:11 2025 +0200 Initial commit diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..2f67e11 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,16 @@ +data/ +figures/ +notebooks/ +scripts/ +envs/ +logs/ +*.pyc +__pycache__/ +*.h5 +.Trash-0/ +.ipynb_checkpoints/ +env_setup.sh +docker-compose.yaml +run_container.sh +TODO.md +.env diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..90d5b36 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +envs/ +logs/ +*.pyc +__pycache__/ +*.h5 +.env +.ipynb_checkpoints +.Trash-0 diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..367c882 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "dima"] + path = dima + url = https://gitea.psi.ch/5505-public/dima.git diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..e4cf003 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,47 @@ +# Get additional info about the Dockerfile at https://docs.docker.com/reference/dockerfile/ + +FROM condaforge/miniforge3:latest + +# Define the name of the environment +ARG ENV_NAME=idear_env +ENV ENV_NAME=idear_env + +# Set the working directory +WORKDIR /idear + +#RUN apt-get update && apt-get install -y cifs-utils + +# Use mamba if available for faster installation +RUN conda install -y -n base -c conda-forge mamba && \ + mamba create -y -n $ENV_NAME -c conda-forge python=3.11 \ + jupyter numpy h5py pandas matplotlib plotly=5.24 scipy pip && \ + conda clean --all -y && rm -rf /root/.cache/pip + +# Activate the environment and install additional pip packages +RUN conda run -n $ENV_NAME pip install pybis==1.35 igor2 ipykernel sphinx dash dash-bootstrap-components + +# Set the default environment when the container starts +ENV CONDA_DEFAULT_ENV=$ENV_NAME +ENV PATH=/opt/conda/envs/$ENV_NAME/bin:$PATH + +# Create necessary directories for VOLUME +RUN mkdir -p /idear/data /idear/figures 
/idear/notebooks /idear/scripts +#RUN mkdir -p /mnt/lac_ord + +# Copy project files, excluding certain directories (handled via .dockerignore) +COPY . /idear + +# Copy and install dependencies from requirements.txt +COPY requirements.txt /idear/requirements.txt +RUN conda run -n $ENV_NAME pip install -r /idear/requirements.txt + +# Define volumes for excluded directories +# VOLUME ["/idear/data", "/idear/figures", "/idear/notebooks", "/idear/scripts"] + +# Add JupyterLab +RUN pip install graphviz +RUN pip install --no-cache-dir jupyterlab + +# If you want to set JupyterLab as the default command +#CMD ["jupyter", "lab", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root", "--NotebookApp.token='my-token'"] +CMD ["/bin/bash"] \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..6d1a589 --- /dev/null +++ b/README.md @@ -0,0 +1,132 @@ +# IDEAR Project Name + +This is a **containerized, JupyterLab-based data toolkit** developed as part of the IDEAR project. It supports efficient, reproducible, and metadata-enriched data processing workflows for instrument-generated datasets. + +--- + +### Key Features + +- Modular pipeline with reusable notebook workflows +- Metadata-driven HDF5 outputs for long-term data reuse +- Optional network-mounted input for seamless integration with shared drives + +--- + +### Output Format + +- **Self-describing HDF5 files**, including: + - Project-level, contextual, and data lineage metadata + +--- + +### Extensibility + +New instruments can be supported by extending the file parsing capabilities in the `dima/` module. + + + +## Repository Structure + +
+Click to expand + +- `data/` — Input and output datasets (mounted volume) +- `figures/` — Output visualizations (mounted volume) +- `notebooks/` — Jupyter notebooks for processing and metadata integration +- `scripts/` — Supplementary processing logic +- `dima/` — Metadata and HDF5 schema utilities (persisted module) +- `Dockerfile` — Container image definition +- `docker-compose.yaml` — Local and networked deployment options +- `env_setup.sh` — Optional local environment bootstrap +- `CITATION.cff`, `LICENCE`, `README.md`, `.gitignore`, `.dockerignore` — Project metadata and config +- `campaignDescriptor.yaml` — Campaign-specific metadata + +
+ +--- + +## Getting Started + +### Requirements + +#### For Docker-based usage: + +- **Docker Desktop** +- **Git Bash** (for running shell scripts on Windows) + +#### Optional for local (non-Docker) usage: + +- **Conda** (`miniconda` or `anaconda`) + +#### If accessing network drives (e.g., PSI): + +- PSI credentials with access to mounted network shares + +--- + +## Clone the Repository + +```bash +git clone --recurse-submodules +cd +``` + +## Run with Docker + +This toolkit includes a containerized JupyterLab environment for executing the data processing pipeline, plus an optional dashboard for manual flagging. + +1. Open **PowerShell as Administrator** and navigate to the `your-repo-name` repository. +2. Create a `.env` file in the root of `your-repo-name/`. +3. **Securely store your network drive access credentials** in the `.env` file by adding the following lines: + ```plaintext + CIFS_USER= + CIFS_PASS= + JUPYTER_TOKEN=my-token + NETWORK_MOUNT=//your-server/your-share + ``` + **To protect your credentials:** + - Do not share the .env file with others. + - Ensure the file is excluded from version control by adding .env to your .gitignore and .dockerignore files. +4. Open **Docker Desktop**, then build the container image: + ```bash + docker build -f Dockerfile -t idear_processor . + ``` +5. Start the environment: + +- **Locally without network drive mount:** + Regardless of value in .env, `NETWORK_MOUNT` defaults to `/data/`. + ```bash + docker compose up idear_processor + +- **With network drive mount:** + + ```bash + docker compose up idear_processor_networked + +6. Access: + - **Jupyter Lab**: [http://localhost:8889/lab/](http://localhost:8889/lab/) + +7. 
Stop the app: In the previously opened PowerShell terminal, enter: ```bash Ctrl + C ``` After the container has properly stopped, remove the container with: ```bash docker rm $(docker ps -aq --filter ancestor=idear_processor) ``` ## (Optional) Set Up the Python Environment > Required only if you plan to run the toolkit outside of Docker If **Git Bash** lacks a suitable Python interpreter, run: ```bash bash env_setup.sh ``` ## Citation ## License diff --git a/campaignDescriptor.yaml b/campaignDescriptor.yaml new file mode 100644 index 0000000..a0ce06d --- /dev/null +++ b/campaignDescriptor.yaml @@ -0,0 +1,66 @@ +# ------------------------------------------------------------------------------ +# Configuration for FAIR Data Integration Pipeline +# ------------------------------------------------------------------------------ + +# Can be a local or network path. Examples: +# - Local: '../data/data_folder/' # All paths must be relative to folder ../data/ +# - Network: /mnt/network_drive/data_folder or equivalently ${NETWORK_MOUNT}/data_folder +input_file_directory: '/mnt/network_mount/Data' + +# OUTPUT DATA DIRECTORY +# Always relative to notebook location. If run from `notebooks/`, +# output will be saved in `../data/`. +# Do not modify unless you're running from the project root. 
+output_file_directory: '../data/' + +# ------------------------------------------------------------------------------ +# Project Metadata +# ------------------------------------------------------------------------------ + +project: 'Insert project title here' +contact: 'Your Name or Team' +group_id: '0000' # Optional internal group or project ID + +# Type of experiment (e.g., campaign, flow_tube_study, smog_chamber, lab_study) +experiment: 'experiment_type' + +# Dataset coverage range (optional but recommended) +dataset_startdate: 'YYYY-MM-DD' +dataset_enddate: 'YYYY-MM-DD' + +# Data processing level (typically '0', '1', or '2'; follows ACTRIS or custom standards) +actris_level: '0' + +# ------------------------------------------------------------------------------ +# Output File Naming Convention (Optional) +# ------------------------------------------------------------------------------ + +# Year of observation +year: 'YYYY' + +# Format string used to define output filenames. +# You may use any field from this config as a part, comma-separated. +# Example: 'experiment, year' → experiment_year.h5 +filename_format: 'experiment, year' + +# ------------------------------------------------------------------------------ +# Instrument Data Source +# ------------------------------------------------------------------------------ + +# Relative subdirectories inside the input directory that contain instrument data. +# Use one or more folder paths as needed. +instrument_datafolder: + - 'instFolder1/subfolder/' + - 'instFolder2' + +# ------------------------------------------------------------------------------ +# Data Integration Options +# ------------------------------------------------------------------------------ + +# Integration mode: 'collection' or 'single_experiment'. +integration_mode: 'collection' + +# Optional: list of timestamps marking experimental phases or steps. 
+# Format each entry as: 'YYYY-MM-DD HH:MM:SS' +datetime_steps: [] + diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/dima b/dima new file mode 160000 index 0000000..2a9e39b --- /dev/null +++ b/dima @@ -0,0 +1 @@ +Subproject commit 2a9e39b9ca16b372bf30bb99a93039626876f104 diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..9c7a022 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,47 @@ +version: '3.9' + +services: + idear_processor_networked: + image: idear_processor + restart: unless-stopped + environment: + - DOCKER_CONTAINER=1 + - NETWORK_MOUNT=/mnt/network_mount + ports: + - "${JUPYTER_PORT:-8889}:8888" + volumes: + - ./data:/idear/data + - ./figures:/idear/figures + - ./notebooks:/idear/notebooks + - ./scripts:/idear/scripts + - network_mount:/mnt/network_mount:rw + command: > + bash -c " + jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token='${JUPYTER_TOKEN:-my-token}' + " + profiles: + - networked + + idear_processor: + image: idear_processor + restart: unless-stopped + environment: + - DOCKER_CONTAINER=1 + - NETWORK_MOUNT=/idear/data + ports: + - "${JUPYTER_PORT:-8889}:8888" + volumes: + - ./:/idear + command: > + bash -c " + jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token='${JUPYTER_TOKEN:-my-token}' + " + profiles: + - local-only + +volumes: + network_mount: + driver_opts: + type: cifs + o: "username=${CIFS_USER},password=${CIFS_PASS},vers=3.0" + device: "${NETWORK_MOUNT}" diff --git a/figures/.gitkeep b/figures/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/demo_data_integration.ipynb b/notebooks/demo_data_integration.ipynb new file mode 100644 index 0000000..64b8d91 --- /dev/null +++ b/notebooks/demo_data_integration.ipynb @@ -0,0 +1,168 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data integration workflow of experimental 
campaign\n", "\n", "In this notebook, we will go through our data integration workflow. This involves the following steps:\n", "\n", "1. Specify data integration file through YAML configuration file.\n", "2. Create an integrated HDF5 file of experimental campaign from configuration file.\n", "3. Display the created HDF5 file using a treemap\n", "\n", "## Import libraries and modules\n", "\n", "* Execute (or Run) the Cell below" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sys\n", "import os\n", "# Set up project root directory\n", "\n", "notebook_dir = os.getcwd() # Current working directory (assumes running from notebooks/)\n", "project_path = os.path.normpath(os.path.join(notebook_dir, \"..\")) # Move up to project root\n", "dima_path = os.path.normpath(os.path.join(project_path, \"dima\")) # Move up to project root\n", "\n", "for item in sys.path:\n", " print(item)\n", "\n", "\n", "if project_path not in sys.path: # Avoid duplicate entries\n", " sys.path.append(project_path)\n", " print(project_path)\n", "if dima_path not in sys.path:\n", " sys.path.insert(0,dima_path)\n", " print(dima_path)\n", "\n", "import dima.visualization.hdf5_vis as hdf5_vis\n", "import dima.pipelines.data_integration as data_integration\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 1: Specify Data Integration Task via YAML Configuration\n", "\n", "* Open the `campaignDescriptor.yaml` file located in the project root, and fill it out to describe your dataset.\n", "\n", "* Refer to example descriptors in `/dima/input_files/` for guidance.\n", "\n", "* Run the cell below to load your configuration — or skip it and go to the next cell to test the pipeline using one of the predefined campaign descriptors.\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {},
"outputs": [], "source": [ "descriptor_path ='../campaignDescriptor.yaml'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Uncomment the lines below to test the data integration pipeline\n", "# using predefined campaign descriptors for existing datasets in 5505.\n", "\n", "# Choose a predefined descriptor:\n", "# Options: (1, 'LI'), (2, 'TBR'), (3, 'NG')\n", "# num, initials = 1, 'LI'\n", "\n", "# Construct the path to the YAML descriptor\n", "# descriptor_path = f'../dima/input_files/campaignDescriptor{num}_{initials}.yaml'\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 2: Create an integrated HDF5 file of experimental campaign.\n", "\n", "* Execute Cell. Here we run the function `run_pipeline` with the previously specified YAML config file as input argument." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "hdf5_file_path = data_integration.run_pipeline(descriptor_path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hdf5_file_path = ['../data/collection_experiment_type_YYYY_YYYY-MM-DD_YYYY-MM-DD.h5']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Display integrated HDF5 file using a treemap\n", "\n", "* Execute Cell. 
A visual representation in html format of the integrated file should be displayed and stored in the output directory folder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if isinstance(hdf5_file_path ,list):\n", + " for path_item in hdf5_file_path :\n", + " hdf5_vis.display_group_hierarchy_on_a_treemap(path_item)\n", + "else:\n", + " hdf5_vis.display_group_hierarchy_on_a_treemap(hdf5_file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/scripts/.gitkeep b/scripts/.gitkeep new file mode 100644 index 0000000..e69de29