commit 618ac2b917ba4c8f168b6b1fd646e555fe2aae53 Author: stafie_c Date: Tue Oct 7 22:57:11 2025 +0200 Initial commit diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..2f67e11 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,16 @@ +data/ +figures/ +notebooks/ +scripts/ +envs/ +logs/ +*.pyc +__pycache__/ +*.h5 +.Trash-0/ +.ipynb_checkpoints/ +env_setup.sh +docker-compose.yaml +run_container.sh +TODO.md +.env diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..90d5b36 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +envs/ +logs/ +*.pyc +__pycache__/ +*.h5 +.env +.ipynb_checkpoints +.Trash-0 diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..367c882 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "dima"] + path = dima + url = https://gitea.psi.ch/5505-public/dima.git diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..e4cf003 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,47 @@ +# Get additional info about the Dockerfile at https://docs.docker.com/reference/dockerfile/ + +FROM condaforge/miniforge3:latest + +# Define the name of the environment +ARG ENV_NAME=idear_env +ENV ENV_NAME=idear_env + +# Set the working directory +WORKDIR /idear + +#RUN apt-get update && apt-get install -y cifs-utils + +# Use mamba if available for faster installation +RUN conda install -y -n base -c conda-forge mamba && \ + mamba create -y -n $ENV_NAME -c conda-forge python=3.11 \ + jupyter numpy h5py pandas matplotlib plotly=5.24 scipy pip && \ + conda clean --all -y && rm -rf /root/.cache/pip + +# Activate the environment and install additional pip packages +RUN conda run -n $ENV_NAME pip install pybis==1.35 igor2 ipykernel sphinx dash dash-bootstrap-components + +# Set the default environment when the container starts +ENV CONDA_DEFAULT_ENV=$ENV_NAME +ENV PATH=/opt/conda/envs/$ENV_NAME/bin:$PATH + +# Create necessary directories for VOLUME +RUN mkdir -p /idear/data /idear/figures 
/idear/notebooks /idear/scripts +#RUN mkdir -p /mnt/lac_ord + +# Copy project files, excluding certain directories (handled via .dockerignore) +COPY . /idear + +# Copy and install dependencies from requirements.txt +COPY requirements.txt /idear/requirements.txt +RUN conda run -n $ENV_NAME pip install -r /idear/requirements.txt + +# Define volumes for excluded directories +# VOLUME ["/idear/data", "/idear/figures", "/idear/notebooks", "/idear/scripts"] + +# Add JupyterLab +RUN pip install graphviz +RUN pip install --no-cache-dir jupyterlab + +# If you want to set JupyterLab as the default command +#CMD ["jupyter", "lab", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root", "--NotebookApp.token='my-token'"] +CMD ["/bin/bash"] \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..6d1a589 --- /dev/null +++ b/README.md @@ -0,0 +1,132 @@ +# IDEAR Project Name + +This is a **containerized, JupyterLab-based data toolkit** developed as part of the IDEAR project. It supports efficient, reproducible, and metadata-enriched data processing workflows for instrument-generated datasets. + +--- + +### Key Features + +- Modular pipeline with reusable notebook workflows +- Metadata-driven HDF5 outputs for long-term data reuse +- Optional network-mounted input for seamless integration with shared drives + +--- + +### Output Format + +- **Self-describing HDF5 files**, including: + - Project-level, contextual, and data lineage metadata + +--- + +### Extensibility + +New instruments can be supported by extending the file parsing capabilities in the `dima/` module. + + + +## Repository Structure + +
+Click to expand + +- `data/` — Input and output datasets (mounted volume) +- `figures/` — Output visualizations (mounted volume) +- `notebooks/` — Jupyter notebooks for processing and metadata integration +- `scripts/` — Supplementary processing logic +- `dima/` — Metadata and HDF5 schema utilities (persisted module) +- `Dockerfile` — Container image definition +- `docker-compose.yaml` — Local and networked deployment options +- `env_setup.sh` — Optional local environment bootstrap +- `CITATION.cff`, `LICENCE`, `README.md`, `.gitignore`, `.dockerignore` — Project metadata and config +- `campaignDescriptor.yaml` — Campaign-specific metadata + +
+ +--- + +## Getting Started + +### Requirements + +#### For Docker-based usage: + +- **Docker Desktop** +- **Git Bash** (for running shell scripts on Windows) + +#### Optional for local (non-Docker) usage: + +- **Conda** (`miniconda` or `anaconda`) + +#### If accessing network drives (e.g., PSI): + +- PSI credentials with access to mounted network shares + +--- + +## Clone the Repository + +```bash +git clone --recurse-submodules +cd +``` + +## Run with Docker + +This toolkit includes a containerized JupyterLab environment for executing the data processing pipeline, plus an optional dashboard for manual flagging. + +1. Open **PowerShell as Administrator** and navigate to the `your-repo-name` repository. +2. Create a `.env` file in the root of `your-repo-name/`. +3. **Securely store your network drive access credentials** in the `.env` file by adding the following lines: + ```plaintext + CIFS_USER= + CIFS_PASS= + JUPYTER_TOKEN=my-token + NETWORK_MOUNT=//your-server/your-share + ``` + **To protect your credentials:** + - Do not share the .env file with others. + - Ensure the file is excluded from version control by adding .env to your .gitignore and .dockerignore files. +4. Open **Docker Desktop**, then build the container image: + ```bash + docker build -f Dockerfile -t idear_processor . + ``` +5. Start the environment: + +- **Locally without network drive mount:** + Regardless of value in .env, `NETWORK_MOUNT` defaults to `/data/`. + ```bash + docker compose up idear_processor + +- **With network drive mount:** + + ```bash + docker compose up idear_processor_networked + +6. Access: + - **Jupyter Lab**: [http://localhost:8889/lab/](http://localhost:8889/lab/) + +7. 
Stop the app: In the previously opened PowerShell terminal, enter: ```bash Ctrl + C ``` After the container has properly stopped, remove the container with: ```bash docker rm $(docker ps -aq --filter ancestor=idear_processor) ``` ## (Optional) Set Up the Python Environment > Required only if you plan to run the toolkit outside of Docker If **Git Bash** lacks a suitable Python interpreter, run: ```bash bash env_setup.sh ``` ## Citation ## License diff --git a/campaignDescriptor.yaml b/campaignDescriptor.yaml new file mode 100644 index 0000000..a0ce06d --- /dev/null +++ b/campaignDescriptor.yaml @@ -0,0 +1,66 @@ +# ------------------------------------------------------------------------------ +# Configuration for FAIR Data Integration Pipeline +# ------------------------------------------------------------------------------ + +# Can be a local or network path. Examples: +# - Local: '../data/data_folder/' # All paths must be relative to folder ../data/ +# - Network: /mnt/network_drive/data_folder or equivalently ${NETWORK_MOUNT}/data_folder +input_file_directory: '/mnt/network_mount/Data' + +# OUTPUT DATA DIRECTORY +# Always relative to notebook location. If run from `notebooks/`, +# output will be saved in `../data/`. +# Do not modify unless you're running from the project root. 
+output_file_directory: '../data/' + +# ------------------------------------------------------------------------------ +# Project Metadata +# ------------------------------------------------------------------------------ + +project: 'Insert project title here' +contact: 'Your Name or Team' +group_id: '0000' # Optional internal group or project ID + +# Type of experiment (e.g., campaign, flow_tube_study, smog_chamber, lab_study) +experiment: 'experiment_type' + +# Dataset coverage range (optional but recommended) +dataset_startdate: 'YYYY-MM-DD' +dataset_enddate: 'YYYY-MM-DD' + +# Data processing level (typically '0', '1', or '2'; follows ACTRIS or custom standards) +actris_level: '0' + +# ------------------------------------------------------------------------------ +# Output File Naming Convention (Optional) +# ------------------------------------------------------------------------------ + +# Year of observation +year: 'YYYY' + +# Format string used to define output filenames. +# You may use any field from this config as a part, comma-separated. +# Example: 'experiment, year' → experiment_year.h5 +filename_format: 'experiment, year' + +# ------------------------------------------------------------------------------ +# Instrument Data Source +# ------------------------------------------------------------------------------ + +# Relative subdirectories inside the input directory that contain instrument data. +# Use one or more folder paths as needed. +instrument_datafolder: + - 'instFolder1/subfolder/' + - 'instFolder2' + +# ------------------------------------------------------------------------------ +# Data Integration Options +# ------------------------------------------------------------------------------ + +# Integration mode: 'collection' or 'single_experiment'. +integration_mode: 'collection' + +# Optional: list of timestamps marking experimental phases or steps. 
+# Format each entry as: 'YYYY-MM-DD HH:MM:SS' +datetime_steps: [] + diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/dima b/dima new file mode 160000 index 0000000..2a9e39b --- /dev/null +++ b/dima @@ -0,0 +1 @@ +Subproject commit 2a9e39b9ca16b372bf30bb99a93039626876f104 diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..9c7a022 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,47 @@ +version: '3.9' + +services: + idear_processor_networked: + image: idear_processor + restart: unless-stopped + environment: + - DOCKER_CONTAINER=1 + - NETWORK_MOUNT=/mnt/network_mount + ports: + - "${JUPYTER_PORT:-8889}:8888" + volumes: + - ./data:/idear/data + - ./figures:/idear/figures + - ./notebooks:/idear/notebooks + - ./scripts:/idear/scripts + - network_mount:/mnt/network_mount:rw + command: > + bash -c " + jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token='${JUPYTER_TOKEN:-my-token}' + " + profiles: + - networked + + idear_processor: + image: idear_processor + restart: unless-stopped + environment: + - DOCKER_CONTAINER=1 + - NETWORK_MOUNT=/idear/data + ports: + - "${JUPYTER_PORT:-8889}:8888" + volumes: + - ./:/idear + command: > + bash -c " + jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token='${JUPYTER_TOKEN:-my-token}' + " + profiles: + - local-only + +volumes: + network_mount: + driver_opts: + type: cifs + o: "username=${CIFS_USER},password=${CIFS_PASS},vers=3.0" + device: "${NETWORK_MOUNT}" diff --git a/figures/.gitkeep b/figures/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/demo_data_integration.ipynb b/notebooks/demo_data_integration.ipynb new file mode 100644 index 0000000..64b8d91 --- /dev/null +++ b/notebooks/demo_data_integration.ipynb @@ -0,0 +1,168 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data integration workflow of experimental 
campaign\n", "\n", "In this notebook, we will go through our data integration workflow. This involves the following steps:\n", "\n", "1. Specify data integration file through YAML configuration file.\n", "2. Create an integrated HDF5 file of experimental campaign from configuration file.\n", "3. Display the created HDF5 file using a treemap\n", "\n", "## Import libraries and modules\n", "\n", "* Execute (or Run) the Cell below" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sys\n", "import os\n", "# Set up project root directory\n", "\n", "notebook_dir = os.getcwd() # Current working directory (assumes running from notebooks/)\n", "project_path = os.path.normpath(os.path.join(notebook_dir, \"..\")) # Move up to project root\n", "dima_path = os.path.normpath(os.path.join(project_path, \"dima\")) # Move up to project root\n", "\n", "for item in sys.path:\n", " print(item)\n", "\n", "\n", "if project_path not in sys.path: # Avoid duplicate entries\n", " sys.path.append(project_path)\n", " print(project_path)\n", "if dima_path not in sys.path:\n", " sys.path.insert(0,dima_path)\n", " print(dima_path)\n", "\n", "import dima.visualization.hdf5_vis as hdf5_vis\n", "import dima.pipelines.data_integration as data_integration\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 1: Specify Data Integration Task via YAML Configuration\n", "\n", "* Open the `campaignDescriptor.yaml` file located in the project root, and fill it out to describe your dataset.\n", "\n", "* Refer to example descriptors in `/dima/input_files/` for guidance.\n", "\n", "* Run the cell below to load your configuration — or skip it and go to the next cell to test the pipeline using one of the predefined campaign descriptors.\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {},
"outputs": [], "source": [ "descriptor_path ='../campaignDescriptor.yaml'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Uncomment the lines below to test the data integration pipeline\n", "# using predefined campaign descriptors for existing datasets in 5505.\n", "\n", "# Choose a predefined descriptor:\n", "# Options: (1, 'LI'), (2, 'TBR'), (3, 'NG')\n", "# num, initials = 1, 'LI'\n", "\n", "# Construct the path to the YAML descriptor\n", "# descriptor_path = f'../dima/input_files/campaignDescriptor{num}_{initials}.yaml'\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 2: Create an integrated HDF5 file of experimental campaign.\n", "\n", "* Execute Cell. Here we run the function `run_pipeline` with the previously specified YAML config file as input argument." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "hdf5_file_path = data_integration.run_pipeline(descriptor_path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hdf5_file_path = ['../data/collection_experiment_type_YYYY_YYYY-MM-DD_YYYY-MM-DD.h5']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Display integrated HDF5 file using a treemap\n", "\n", "* Execute Cell. 
A visual representation in html format of the integrated file should be displayed and stored in the output directory folder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if isinstance(hdf5_file_path ,list):\n", + " for path_item in hdf5_file_path :\n", + " hdf5_vis.display_group_hierarchy_on_a_treemap(path_item)\n", + "else:\n", + " hdf5_vis.display_group_hierarchy_on_a_treemap(hdf5_file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/scripts/.gitkeep b/scripts/.gitkeep new file mode 100644 index 0000000..e69de29