ci: add benchmark workflow

2026-04-18 06:15:37 +02:00 · 2026-04-17 16:06:02 +02:00
13 changed files with 1093 additions and 32 deletions
--- a/.github/scripts/aggregate_benchmarks.py
+++ b/.github/scripts/aggregate_benchmarks.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""Aggregate and merge benchmark JSON files.
+
+The workflow runs the same benchmark suite on multiple independent runners.
+This script reads every JSON file produced by those attempts, normalizes the
+contained benchmark values, and writes a compact mapping JSON where each value is
+the median across attempts. It can also merge independent hyperfine JSON files
+from one runner into a single hyperfine-style JSON file.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+from pathlib import Path
+from typing import Any
+
+from compare_benchmarks import Benchmark, extract_benchmarks
+
+
+def collect_benchmarks(paths: list[Path]) -> dict[str, list[Benchmark]]:
+    """Collect benchmarks from multiple JSON files.
+
+    Args:
+        paths (list[Path]): Paths to hyperfine, pytest-benchmark, or compact
+            mapping JSON files.
+
+    Returns:
+        dict[str, list[Benchmark]]: Benchmarks grouped by benchmark name.
+    """
+
+    collected: dict[str, list[Benchmark]] = {}
+    for path in paths:
+        for name, benchmark in extract_benchmarks(path).items():
+            collected.setdefault(name, []).append(benchmark)
+    return collected
+
+
+def aggregate(collected: dict[str, list[Benchmark]]) -> dict[str, dict[str, object]]:
+    """Aggregate grouped benchmarks using the median value.
+
+    Args:
+        collected (dict[str, list[Benchmark]]): Benchmarks grouped by benchmark
+            name.
+
+    Returns:
+        dict[str, dict[str, object]]: Compact mapping JSON data. Each benchmark
+        contains ``value``, ``unit``, ``metric``, ``attempts``, and
+        ``attempt_values``.
+    """
+
+    aggregated: dict[str, dict[str, object]] = {}
+    for name, benchmarks in sorted(collected.items()):
+        values = [benchmark.value for benchmark in benchmarks]
+        unit = next((benchmark.unit for benchmark in benchmarks if benchmark.unit), "")
+        metric = next((benchmark.metric for benchmark in benchmarks if benchmark.metric), "value")
+        aggregated[name] = {
+            "value": statistics.median(values),
+            "unit": unit,
+            "metric": f"median-of-attempt-{metric}",
+            "attempts": len(values),
+            "attempt_values": values,
+        }
+    return aggregated
+
+
+def merge_hyperfine_results(paths: list[Path]) -> dict[str, Any]:
+    """Merge hyperfine result files.
+
+    Args:
+        paths (list[Path]): Hyperfine JSON files to merge.
+
+    Returns:
+        dict[str, Any]: Hyperfine-style JSON object containing all result rows.
+
+    Raises:
+        ValueError: If any file has no hyperfine ``results`` list.
+    """
+
+    merged: dict[str, Any] = {"results": []}
+    for path in paths:
+        data = json.loads(path.read_text(encoding="utf-8"))
+        results = data.get("results", []) if isinstance(data, dict) else None
+        if not isinstance(results, list):
+            raise ValueError(f"{path} has no hyperfine results list")
+        merged["results"].extend(results)
+    return merged
+
+
+def main_from_paths(input_dir: Path, output: Path) -> int:
+    """Aggregate all JSON files in a directory and write the result.
+
+    Args:
+        input_dir (Path): Directory containing benchmark JSON files.
+        output (Path): Path where the aggregate JSON should be written.
+
+    Returns:
+        int: Always ``0`` on success.
+
+    Raises:
+        ValueError: If no JSON files are found in ``input_dir``.
+    """
+
+    paths = sorted(input_dir.rglob("*.json"))
+    if not paths:
+        raise ValueError(f"No benchmark JSON files found in {input_dir}")
+
+    output.parent.mkdir(parents=True, exist_ok=True)
+    output.write_text(
+        json.dumps(aggregate(collect_benchmarks(paths)), indent=2, sort_keys=True) + "\n",
+        encoding="utf-8",
+    )
+    return 0
+
+
+def merge_from_paths(input_dir: Path, output: Path) -> int:
+    """Merge all hyperfine JSON files in a directory and write the result.
+
+    Args:
+        input_dir (Path): Directory containing hyperfine JSON files.
+        output (Path): Path where the merged JSON should be written.
+
+    Returns:
+        int: Always ``0`` on success.
+
+    Raises:
+        ValueError: If no JSON files are found in ``input_dir``.
+    """
+
+    paths = sorted(input_dir.glob("*.json"))
+    if not paths:
+        raise ValueError(f"No hyperfine JSON files found in {input_dir}")
+
+    output.parent.mkdir(parents=True, exist_ok=True)
+    output.write_text(
+        json.dumps(merge_hyperfine_results(paths), indent=2, sort_keys=True) + "\n",
+        encoding="utf-8",
+    )
+    return 0
+
+
+def main() -> int:
+    """Run the benchmark aggregation command line interface.
+
+    Returns:
+        int: Always ``0`` on success.
+    """
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--mode",
+        choices=("aggregate", "merge-hyperfine"),
+        default="aggregate",
+        help="Operation to perform.",
+    )
+    parser.add_argument("--input-dir", required=True, type=Path)
+    parser.add_argument("--output", required=True, type=Path)
+    args = parser.parse_args()
+    if args.mode == "merge-hyperfine":
+        return merge_from_paths(input_dir=args.input_dir, output=args.output)
+    return main_from_paths(input_dir=args.input_dir, output=args.output)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/.github/scripts/compare_benchmarks.py
+++ b/.github/scripts/compare_benchmarks.py
@@ -0,0 +1,411 @@
+#!/usr/bin/env python3
+"""Compare benchmark JSON files and write a GitHub Actions summary.
+
+The script supports JSON emitted by hyperfine, JSON emitted by pytest-benchmark,
+and a compact mapping format generated by ``aggregate_benchmarks.py``. Timing
+formats prefer median values and fall back to mean values when median values are
+not present.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+
+@dataclass(frozen=True)
+class Benchmark:
+    """Normalized benchmark result shared by every supported input format.
+
+    Attributes:
+        name (str): Stable benchmark name used to match baseline and current results.
+        value (float): Numeric benchmark value used for comparison.
+        unit (str): Display unit for the value, for example ``"s"``; may be empty.
+        metric (str): Source metric name, for example ``"median"`` or ``"mean"``.
+    """
+
+    name: str
+    value: float
+    unit: str  # "" when the source carries no unit (compact mapping input)
+    metric: str = "value"  # default used when the source names no metric
+
+
+@dataclass(frozen=True)
+class Comparison:
+    """Comparison between one baseline benchmark and one current benchmark.
+
+    Attributes:
+        name (str): Benchmark name.
+        baseline (float): Baseline benchmark value.
+        current (float): Current benchmark value.
+        delta_percent (float): Percent change from baseline to current.
+        unit (str): Display unit for both values.
+        metric (str): Current result metric used for comparison.
+        regressed (bool): Whether the change exceeds the configured threshold.
+    """
+
+    name: str
+    baseline: float
+    current: float
+    delta_percent: float  # (current - baseline) / |baseline| * 100; 0.0 for a zero baseline
+    unit: str  # unit of the current result, falling back to the baseline unit
+    metric: str
+    regressed: bool  # a change exactly equal to the threshold already counts
+
+
+def _read_json(path: Path) -> Any:
+    """Read JSON data from a file.
+
+    Args:
+        path (Path): Path to the JSON file.
+
+    Returns:
+        Any: Parsed JSON value.
+    """
+
+    with path.open("r", encoding="utf-8") as stream:
+        return json.load(stream)
+
+
+def _as_float(value: Any) -> float | None:
+    """Convert a value to a finite float.
+
+    Args:
+        value (Any): Value to convert.
+
+    Returns:
+        float | None: Converted finite float, or ``None`` if conversion fails.
+    """
+
+    try:
+        result = float(value)
+    except (TypeError, ValueError):
+        return None
+    if math.isfinite(result):
+        return result
+    return None
+
+
+def _extract_hyperfine(data: dict[str, Any]) -> dict[str, Benchmark]:
+    """Extract normalized benchmarks from hyperfine JSON.
+
+    Args:
+        data (dict[str, Any]): Parsed hyperfine JSON object.
+
+    Returns:
+        dict[str, Benchmark]: Benchmarks keyed by command name.
+    """
+
+    benchmarks: dict[str, Benchmark] = {}
+    for result in data.get("results", []):
+        if not isinstance(result, dict):
+            continue
+        name = str(result.get("command") or result.get("name") or "").strip()
+        metric = "median"
+        value = _as_float(result.get(metric))
+        if value is None:
+            metric = "mean"
+            value = _as_float(result.get(metric))
+        if name and value is not None:
+            benchmarks[name] = Benchmark(name=name, value=value, unit="s", metric=metric)
+    return benchmarks
+
+
+def _extract_pytest_benchmark(data: dict[str, Any]) -> dict[str, Benchmark]:
+    """Extract normalized benchmarks from pytest-benchmark JSON.
+
+    Args:
+        data (dict[str, Any]): Parsed pytest-benchmark JSON object.
+
+    Returns:
+        dict[str, Benchmark]: Benchmarks keyed by full benchmark name.
+    """
+
+    benchmarks: dict[str, Benchmark] = {}
+    for benchmark in data.get("benchmarks", []):
+        if not isinstance(benchmark, dict):
+            continue
+
+        name = str(benchmark.get("fullname") or benchmark.get("name") or "").strip()
+        stats = benchmark.get("stats", {})
+        value = None
+        metric = "median"
+        if isinstance(stats, dict):
+            value = _as_float(stats.get(metric))
+            if value is None:
+                metric = "mean"
+                value = _as_float(stats.get(metric))
+        if name and value is not None:
+            benchmarks[name] = Benchmark(name=name, value=value, unit="s", metric=metric)
+    return benchmarks
+
+
+def _extract_simple_mapping(data: dict[str, Any]) -> dict[str, Benchmark]:
+    """Extract normalized benchmarks from a compact mapping JSON object.
+
+    Args:
+        data (dict[str, Any]): Parsed mapping where each benchmark is either a
+            raw number or an object containing ``value``, ``unit``, and ``metric``.
+
+    Returns:
+        dict[str, Benchmark]: Benchmarks keyed by mapping key.
+    """
+
+    benchmarks: dict[str, Benchmark] = {}
+
+    for name, raw_value in data.items():
+        if name in {"version", "context", "commit", "timestamp"}:
+            continue
+
+        value = _as_float(raw_value)
+        unit = ""
+        metric = "value"
+        if value is None and isinstance(raw_value, dict):
+            value = _as_float(raw_value.get("value"))
+            unit = str(raw_value.get("unit") or "")
+            metric = str(raw_value.get("metric") or "value")
+
+        if value is not None:
+            benchmarks[str(name)] = Benchmark(name=str(name), value=value, unit=unit, metric=metric)
+
+    return benchmarks
+
+
+def extract_benchmarks(path: Path) -> dict[str, Benchmark]:
+    """Extract normalized benchmarks from a supported JSON file.
+
+    Args:
+        path (Path): Path to a hyperfine, pytest-benchmark, or compact mapping
+            JSON file.
+
+    Returns:
+        dict[str, Benchmark]: Normalized benchmarks keyed by name.
+
+    Raises:
+        ValueError: If the JSON root is not an object or no supported benchmark
+            entries can be extracted.
+    """
+
+    data = _read_json(path)
+    if not isinstance(data, dict):
+        raise ValueError(f"{path} must contain a JSON object")
+
+    extractors = (_extract_hyperfine, _extract_pytest_benchmark, _extract_simple_mapping)
+    for extractor in extractors:
+        benchmarks = extractor(data)
+        if benchmarks:
+            return benchmarks
+
+    raise ValueError(f"No supported benchmark entries found in {path}")
+
+
+def compare_benchmarks(
+    baseline: dict[str, Benchmark],
+    current: dict[str, Benchmark],
+    threshold_percent: float,
+    higher_is_better: bool,
+) -> tuple[list[Comparison], list[str], list[str]]:
+    """Compare baseline benchmarks with current benchmarks.
+
+    Args:
+        baseline (dict[str, Benchmark]): Baseline benchmarks keyed by name.
+        current (dict[str, Benchmark]): Current benchmarks keyed by name.
+        threshold_percent (float): Regression threshold in percent.
+        higher_is_better (bool): If ``True``, lower current values are treated as
+            regressions. If ``False``, higher current values are treated as
+            regressions.
+
+    Returns:
+        tuple[list[Comparison], list[str], list[str]]: Comparisons for common
+        benchmark names, names missing from current results, and names newly
+        present in current results.
+    """
+
+    comparisons: list[Comparison] = []
+    missing_in_current: list[str] = []
+    new_in_current: list[str] = []
+
+    for name, baseline_benchmark in sorted(baseline.items()):
+        current_benchmark = current.get(name)
+        if current_benchmark is None:
+            missing_in_current.append(name)
+            continue
+
+        if baseline_benchmark.value == 0:
+            delta_percent = 0.0
+        else:
+            delta_percent = (
+                (current_benchmark.value - baseline_benchmark.value)
+                / abs(baseline_benchmark.value)
+                * 100
+            )
+
+        if higher_is_better:
+            regressed = delta_percent <= -threshold_percent
+        else:
+            regressed = delta_percent >= threshold_percent
+
+        comparisons.append(
+            Comparison(
+                name=name,
+                baseline=baseline_benchmark.value,
+                current=current_benchmark.value,
+                delta_percent=delta_percent,
+                unit=current_benchmark.unit or baseline_benchmark.unit,
+                metric=current_benchmark.metric,
+                regressed=regressed,
+            )
+        )
+
+    for name in sorted(set(current) - set(baseline)):
+        new_in_current.append(name)
+
+    return comparisons, missing_in_current, new_in_current
+
+
+def _format_value(value: float, unit: str) -> str:
+    """Format a benchmark value for Markdown output.
+
+    Args:
+        value (float): Numeric benchmark value.
+        unit (str): Display unit.
+
+    Returns:
+        str: Formatted value with optional unit suffix.
+    """
+
+    suffix = f" {unit}" if unit else ""
+    return f"{value:.6g}{suffix}"
+
+
+def write_summary(
+    path: Path,
+    comparisons: list[Comparison],
+    missing_in_current: list[str],
+    new_in_current: list[str],
+    threshold_percent: float,
+    higher_is_better: bool,
+) -> None:
+    """Write a Markdown benchmark comparison summary.
+
+    Args:
+        path (Path): Path where the summary should be written.
+        comparisons (list[Comparison]): Comparison rows for matching benchmarks.
+        missing_in_current (list[str]): Baseline benchmark names missing from the
+            current result.
+        new_in_current (list[str]): Current benchmark names not present in the
+            baseline result.
+        threshold_percent (float): Regression threshold in percent.
+        higher_is_better (bool): Whether higher benchmark values are considered
+            better.
+    """
+
+    regressions = [comparison for comparison in comparisons if comparison.regressed]
+    direction = "higher is better" if higher_is_better else "lower is better"
+    sorted_comparisons = sorted(comparisons, key=lambda comparison: comparison.name)
+
+    lines = [
+        "<!-- bw-benchmark-comment -->",
+        "## Benchmark comparison",
+        "",
+        f"Threshold: {threshold_percent:g}% ({direction}).",
+    ]
+    lines.append("")
+
+    if regressions:
+        lines.extend(
+            [
+                f"{len(regressions)} benchmark(s) regressed beyond the configured threshold.",
+                "",
+                "| Benchmark | Baseline | Current | Change |",
+                "| --- | ---: | ---: | ---: |",
+            ]
+        )
+        for comparison in regressions:
+            lines.append(
+                "| "
+                f"{comparison.name} | "
+                f"{_format_value(comparison.baseline, comparison.unit)} | "
+                f"{_format_value(comparison.current, comparison.unit)} | "
+                f"{comparison.delta_percent:+.2f}% |"
+            )
+    else:
+        lines.append("No benchmark regression exceeded the configured threshold.")
+
+    if sorted_comparisons:
+        lines.extend(
+            [
+                "",
+                "<details>",
+                "<summary>All benchmark results</summary>",
+                "",
+                "| Benchmark | Baseline | Current | Change | Status |",
+                "| --- | ---: | ---: | ---: | --- |",
+            ]
+        )
+        for comparison in sorted_comparisons:
+            status = "regressed" if comparison.regressed else "ok"
+            lines.append(
+                "| "
+                f"{comparison.name} | "
+                f"{_format_value(comparison.baseline, comparison.unit)} | "
+                f"{_format_value(comparison.current, comparison.unit)} | "
+                f"{comparison.delta_percent:+.2f}% | "
+                f"{status} |"
+            )
+        lines.extend(["", "</details>"])
+
+    if missing_in_current:
+        lines.extend(["", "Missing benchmarks in the current run:"])
+        lines.extend(f"- `{name}`" for name in missing_in_current)
+
+    if new_in_current:
+        lines.extend(["", "New benchmarks in the current run:"])
+        lines.extend(f"- `{name}`" for name in new_in_current)
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+
+def main() -> int:
+    """Run the benchmark comparison command line interface.
+
+    Returns:
+        int: ``1`` when a regression exceeds the threshold, otherwise ``0``.
+    """
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--baseline", required=True, type=Path)
+    parser.add_argument("--current", required=True, type=Path)
+    parser.add_argument("--summary", required=True, type=Path)
+    parser.add_argument("--threshold-percent", required=True, type=float)
+    parser.add_argument("--higher-is-better", action="store_true")
+    args = parser.parse_args()
+
+    baseline = extract_benchmarks(args.baseline)
+    current = extract_benchmarks(args.current)
+    comparisons, missing_in_current, new_in_current = compare_benchmarks(
+        baseline=baseline,
+        current=current,
+        threshold_percent=args.threshold_percent,
+        higher_is_better=args.higher_is_better,
+    )
+
+    write_summary(
+        path=args.summary,
+        comparisons=comparisons,
+        missing_in_current=missing_in_current,
+        new_in_current=new_in_current,
+        threshold_percent=args.threshold_percent,
+        higher_is_better=args.higher_is_better,
+    )
+
+    return 1 if any(comparison.regressed for comparison in comparisons) else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/.github/scripts/run_benchmarks.sh
+++ b/.github/scripts/run_benchmarks.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+mkdir -p benchmark-results
+benchmark_json="${BENCHMARK_JSON:-benchmark-results/current.json}"
+benchmark_root="$(dirname "$benchmark_json")"
+hyperfine_benchmark_dir="${BENCHMARK_HYPERFINE_DIR:-tests/benchmarks/hyperfine}"
+pytest_benchmark_dirs="${BENCHMARK_PYTEST_DIRS:-${BENCHMARK_PYTEST_DIR:-}}"
+benchmark_work_dir="$benchmark_root/raw-results"
+hyperfine_json_dir="$benchmark_work_dir/hyperfine"
+pytest_json="$benchmark_work_dir/pytest.json"
+
+shopt -s nullglob
+benchmark_scripts=()
+benchmark_scripts=("$hyperfine_benchmark_dir"/benchmark_*.sh)
+shopt -u nullglob
+
+pytest_dirs=()
+for pytest_benchmark_dir in $pytest_benchmark_dirs; do
+  if [ -d "$pytest_benchmark_dir" ]; then
+    pytest_dirs+=("$pytest_benchmark_dir")
+  else
+    echo "Pytest benchmark directory not found: $pytest_benchmark_dir" >&2
+    exit 1
+  fi
+done
+
+if [ "${#benchmark_scripts[@]}" -eq 0 ] && [ "${#pytest_dirs[@]}" -eq 0 ]; then
+  echo "No benchmark scripts or pytest benchmarks found" >&2
+  exit 1
+fi
+
+echo "Benchmark Python: $(command -v python)"
+python -c 'import sys; print(sys.version)'
+
+rm -rf "$benchmark_work_dir"
+mkdir -p "$hyperfine_json_dir"
+
+if [ "${#benchmark_scripts[@]}" -gt 0 ]; then
+  for benchmark_script in "${benchmark_scripts[@]}"; do
+    title="$(sed -n 's/^# BENCHMARK_TITLE:[[:space:]]*//p' "$benchmark_script" | head -n 1)"
+    if [ -z "$title" ]; then
+      title="$(basename "$benchmark_script" .sh)"
+    fi
+    benchmark_name="$(basename "$benchmark_script" .sh)"
+    benchmark_result_json="$hyperfine_json_dir/$benchmark_name.json"
+    echo "Preflight benchmark script: $benchmark_script"
+    bash "$benchmark_script"
+
+    hyperfine \
+      --show-output \
+      --warmup 1 \
+      --runs 5 \
+      --command-name "$title" \
+      --export-json "$benchmark_result_json" \
+      "bash $(printf "%q" "$benchmark_script")"
+  done
+fi
+
+if [ "${#pytest_dirs[@]}" -gt 0 ]; then
+  pytest \
+    -q "${pytest_dirs[@]}" \
+    --benchmark-only \
+    --benchmark-json "$pytest_json"
+fi
+
+python .github/scripts/aggregate_benchmarks.py \
+  --input-dir "$benchmark_work_dir" \
+  --output "$benchmark_json"
--- a/.github/scripts/run_with_bec_servers.py
+++ b/.github/scripts/run_with_bec_servers.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+"""Run a command with BEC e2e services available."""
+
+from __future__ import annotations
+
+import argparse
+import os
+import shutil
+import subprocess
+import tempfile
+import time
+from pathlib import Path
+
+import bec_lib
+from bec_ipython_client import BECIPythonClient
+from bec_lib.redis_connector import RedisConnector
+from bec_lib.service_config import ServiceConfig, ServiceConfigModel
+from redis import Redis
+
+
+def _wait_for_redis(host: str, port: int) -> None:
+    client = Redis(host=host, port=port)
+    deadline = time.monotonic() + 10
+    while time.monotonic() < deadline:
+        try:
+            if client.ping():
+                return
+        except Exception:
+            time.sleep(0.1)
+    raise RuntimeError(f"Redis did not start on {host}:{port}")
+
+
+def _start_redis(files_path: Path, host: str, port: int) -> subprocess.Popen:
+    redis_server = shutil.which("redis-server")
+    if redis_server is None:
+        raise RuntimeError("redis-server executable not found")
+
+    return subprocess.Popen(
+        [
+            redis_server,
+            "--bind",
+            host,
+            "--port",
+            str(port),
+            "--save",
+            "",
+            "--appendonly",
+            "no",
+            "--dir",
+            str(files_path),
+        ]
+    )
+
+
+def _write_configs(files_path: Path, host: str, port: int) -> Path:
+    test_config = files_path / "test_config.yaml"
+    services_config = files_path / "services_config.yaml"
+
+    bec_lib_path = Path(bec_lib.__file__).resolve().parent
+    shutil.copyfile(bec_lib_path / "tests" / "test_config.yaml", test_config)
+
+    service_config = ServiceConfigModel(
+        redis={"host": host, "port": port}, file_writer={"base_path": str(files_path)}
+    )
+    services_config.write_text(service_config.model_dump_json(indent=4), encoding="utf-8")
+    return services_config
+
+
+def _load_demo_config(services_config: Path) -> None:  # loads the demo config via a short-lived client
+    bec = BECIPythonClient(ServiceConfig(services_config), RedisConnector, forced=True)
+    bec.start()
+    try:
+        bec.config.load_demo_config()
+    finally:  # always release the client, even when loading fails
+        bec.shutdown()
+        bec._client._reset_singleton()  # NOTE(review): private API; presumably clears cached client state - confirm
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("command", nargs=argparse.REMAINDER)
+    args = parser.parse_args()
+
+    if args.command[:1] == ["--"]:
+        args.command = args.command[1:]
+    if not args.command:
+        raise ValueError("No command provided")
+
+    host = "127.0.0.1"
+    port = 6379
+
+    with tempfile.TemporaryDirectory(prefix="bec-benchmark-") as tmp:
+        files_path = Path(tmp)
+        services_config = _write_configs(files_path, host, port)
+        redis_process = _start_redis(files_path, host, port)
+        processes = None
+        service_handler = None
+        try:
+            _wait_for_redis(host, port)
+
+            from bec_server.bec_server_utils.service_handler import ServiceHandler
+
+            service_handler = ServiceHandler(
+                bec_path=files_path, config_path=services_config, interface="subprocess"
+            )
+            processes = service_handler.start()
+            _load_demo_config(services_config)
+
+            env = os.environ.copy()
+            return subprocess.run(args.command, env=env, check=False).returncode
+        finally:
+            if service_handler is not None and processes is not None:
+                service_handler.stop(processes)
+            redis_process.terminate()
+            try:
+                redis_process.wait(timeout=10)
+            except subprocess.TimeoutExpired:
+                redis_process.kill()
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,239 @@
+name: BW Benchmarks
+
+on: [workflow_call]  # reusable workflow: it only runs when another workflow calls it
+
+permissions:
+  contents: read
+
+env:
+  BENCHMARK_JSON: benchmark-results/current.json  # aggregate result; the attempt job overrides this
+  BENCHMARK_BASELINE_JSON: gh-pages-benchmark-data/benchmarks/latest.json  # inside the gh-pages checkout
+  BENCHMARK_SUMMARY: benchmark-results/summary.md  # Markdown posted as the PR comment
+  BENCHMARK_COMMAND: "bash .github/scripts/run_benchmarks.sh"
+  BENCHMARK_THRESHOLD_PERCENT: 10  # percent change that counts as a regression
+  BENCHMARK_HIGHER_IS_BETTER: false  # "true" adds --higher-is-better to the comparison
+
+jobs:
+  benchmark_attempt:
+    runs-on: ubuntu-latest
+    continue-on-error: true  # a failing attempt does not fail the workflow run
+    permissions:
+      contents: read
+    defaults:
+      run:
+        shell: bash -el {0}  # login shell; NOTE(review): presumably required for `conda activate` - confirm
+    strategy:
+      fail-fast: false
+      matrix:
+        attempt: [1, 2, 3]  # independent runs of the same suite; the `benchmark` job takes the median
+
+    env:
+      BENCHMARK_JSON: benchmark-results/current-${{ matrix.attempt }}.json  # per-attempt output file
+      BEC_CORE_BRANCH: main
+      OPHYD_DEVICES_BRANCH: main
+      PLUGIN_REPO_BRANCH: main
+      BENCHMARK_PYTEST_DIRS: tests/unit_tests/benchmarks
+      QTWEBENGINE_DISABLE_SANDBOX: 1
+      QT_QPA_PLATFORM: "offscreen"
+
+    steps:
+      - name: Checkout BEC Widgets
+        uses: actions/checkout@v4
+        with:
+          repository: bec-project/bec_widgets
+          ref: ${{ github.event.pull_request.head.sha || github.sha }}  # PR head commit when available
+
+      - name: Set up Conda
+        uses: conda-incubator/setup-miniconda@v3
+        with:
+          auto-update-conda: true
+          auto-activate-base: true
+          python-version: "3.11"
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libgl1 libegl1 x11-utils libxkbcommon-x11-0 libdbus-1-3 xvfb
+          sudo apt-get -y install libnss3 libxdamage1 libasound2t64 libatomic1 libxcursor1
+          sudo apt-get -y install ttyd hyperfine redis-server
+
+      # This step also runs the benchmarks (its last three commands).
+      # NOTE(review): OHPYD_DEVICES_PATH below looks like a misspelling of OPHYD - confirm what the install script reads.
+      - name: Install full e2e environment
+        run: |
+          echo -e "\033[35;1m Using branch $BEC_CORE_BRANCH of BEC CORE \033[0;m";
+          git clone --branch "$BEC_CORE_BRANCH" https://github.com/bec-project/bec.git
+          echo -e "\033[35;1m Using branch $OPHYD_DEVICES_BRANCH of OPHYD_DEVICES \033[0;m";
+          git clone --branch "$OPHYD_DEVICES_BRANCH" https://github.com/bec-project/ophyd_devices.git
+          export OHPYD_DEVICES_PATH=$PWD/ophyd_devices
+          echo -e "\033[35;1m Using branch $PLUGIN_REPO_BRANCH of bec_testing_plugin \033[0;m";
+          git clone --branch "$PLUGIN_REPO_BRANCH" https://github.com/bec-project/bec_testing_plugin.git
+          cd ./bec
+          conda create -q -n test-environment python=3.11
+          conda activate test-environment
+          source ./bin/install_bec_dev.sh -t
+          cd ../
+          python -m pip install -e ./ophyd_devices -e .[dev,pyside6] -e ./bec_testing_plugin pytest-benchmark
+          mkdir -p "$(dirname "$BENCHMARK_JSON")"
+          python .github/scripts/run_with_bec_servers.py -- bash -lc "$BENCHMARK_COMMAND"
+          test -s "$BENCHMARK_JSON"
+      - name: Upload benchmark artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: bw-benchmark-json-${{ matrix.attempt }}
+          path: ${{ env.BENCHMARK_JSON }}
+
+  benchmark:
+    needs: [benchmark_attempt]
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      issues: write
+      pull-requests: write  # the summary is posted as a PR comment
+
+    steps:
+      - name: Checkout BEC Widgets
+        uses: actions/checkout@v4
+        with:
+          repository: bec-project/bec_widgets
+          ref: ${{ github.event.pull_request.head.sha || github.sha }}
+
+      - name: Download benchmark attempts
+        uses: actions/download-artifact@v4
+        with:
+          pattern: bw-benchmark-json-*
+          path: benchmark-results/attempts
+          merge-multiple: true  # all attempt files end up in this one directory
+
+      - name: Aggregate benchmark attempts
+        run: |
+          python .github/scripts/aggregate_benchmarks.py \
+            --input-dir benchmark-results/attempts \
+            --output "$BENCHMARK_JSON"
+
+      - name: Upload aggregate benchmark artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: bw-benchmark-json  # consumed by the publish job
+          path: ${{ env.BENCHMARK_JSON }}
+
+      - name: Fetch gh-pages benchmark data
+        run: |
+          if git ls-remote --exit-code --heads origin gh-pages; then
+            git clone --depth=1 --branch gh-pages "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY.git" gh-pages-benchmark-data
+          else
+            mkdir -p gh-pages-benchmark-data
+          fi
+
+      - name: Compare with latest gh-pages benchmark
+        id: compare
+        continue-on-error: true  # later steps still run; the last step fails the job on pull requests
+        run: |
+          if [ ! -s "$BENCHMARK_BASELINE_JSON" ]; then
+            mkdir -p "$(dirname "$BENCHMARK_SUMMARY")"
+            {
+              echo "<!-- bw-benchmark-comment -->"
+              echo "## Benchmark comparison"
+              echo
+              echo "No benchmark baseline was found on gh-pages."
+            } > "$BENCHMARK_SUMMARY"
+            exit 0
+          fi
+
+          args=(
+            --baseline "$BENCHMARK_BASELINE_JSON"
+            --current "$BENCHMARK_JSON"
+            --summary "$BENCHMARK_SUMMARY"
+            --threshold-percent "$BENCHMARK_THRESHOLD_PERCENT"
+          )
+
+          if [ "$BENCHMARK_HIGHER_IS_BETTER" = "true" ]; then
+            args+=(--higher-is-better)
+          fi
+
+          set +e
+          python .github/scripts/compare_benchmarks.py "${args[@]}"
+          status=$?
+          set -e
+
+          if [ ! -s "$BENCHMARK_SUMMARY" ]; then
+            mkdir -p "$(dirname "$BENCHMARK_SUMMARY")"
+            {
+              echo "<!-- bw-benchmark-comment -->"
+              echo "## Benchmark comparison"
+              echo
+              echo "Benchmark comparison failed before writing a summary."
+            } > "$BENCHMARK_SUMMARY"
+          fi
+
+          exit "$status"
+
+      - name: Find existing benchmark PR comment
+        if: github.event_name == 'pull_request'
+        id: fc
+        uses: peter-evans/find-comment@v3
+        with:
+          issue-number: ${{ github.event.pull_request.number }}
+          comment-author: github-actions[bot]
+          body-includes: "<!-- bw-benchmark-comment -->"  # marker written at the top of every summary
+
+      - name: Create or update benchmark PR comment
+        if: github.event_name == 'pull_request'
+        uses: peter-evans/create-or-update-comment@v5
+        with:
+          issue-number: ${{ github.event.pull_request.number }}
+          comment-id: ${{ steps.fc.outputs.comment-id }}
+          body-path: ${{ env.BENCHMARK_SUMMARY }}
+          edit-mode: replace
+
+      - name: Fail on benchmark regression
+        if: github.event_name == 'pull_request' && steps.compare.outcome == 'failure'  # outcome = result before continue-on-error
+        run: exit 1
+
+  publish:
+    needs: [benchmark]
+    if: github.event_name == 'push' && github.ref == 'refs/heads/main'  # publish only for pushes to main
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write  # pushes to the gh-pages branch
+
+    steps:
+      - name: Checkout BEC Widgets
+        uses: actions/checkout@v4
+        with:
+          repository: bec-project/bec_widgets
+          ref: ${{ github.sha }}
+
+      - name: Download aggregate benchmark artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: bw-benchmark-json
+          path: benchmark-results  # artifact holds only current.json; must equal dirname of BENCHMARK_JSON
+
+      - name: Prepare gh-pages for publishing
+        run: |
+          # Clean up any existing worktree/directory
+          if [ -d gh-pages-benchmark-data ]; then
+            git worktree remove gh-pages-benchmark-data --force || rm -rf gh-pages-benchmark-data
+          fi
+
+          if git ls-remote --exit-code --heads origin gh-pages; then
+            git fetch --depth=1 origin gh-pages
+            git worktree add gh-pages-benchmark-data FETCH_HEAD
+          else
+            git worktree add --detach gh-pages-benchmark-data
+            git -C gh-pages-benchmark-data checkout --orphan gh-pages
+            git -C gh-pages-benchmark-data rm -rf .
+          fi
+
+      - name: Publish benchmark data to gh-pages
+        working-directory: gh-pages-benchmark-data
+        run: |
+          mkdir -p benchmarks/history
+          cp "../$BENCHMARK_JSON" benchmarks/latest.json
+          cp "../$BENCHMARK_JSON" "benchmarks/history/${GITHUB_SHA}.json"
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git add benchmarks/latest.json "benchmarks/history/${GITHUB_SHA}.json"
+          git commit -m "Update BW benchmark data for ${GITHUB_SHA}" || exit 0
+          git push origin HEAD:gh-pages
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,19 +1,19 @@
 name: Full CI
-on: 
+on:
  push:
  pull_request:
  workflow_dispatch:
    inputs:
      BEC_WIDGETS_BRANCH:
-        description: 'Branch of BEC Widgets to install'
+        description: "Branch of BEC Widgets to install"
        required: false
        type: string
      BEC_CORE_BRANCH:
-        description: 'Branch of BEC Core to install'
+        description: "Branch of BEC Core to install"
        required: false
        type: string
      OPHYD_DEVICES_BRANCH:
-        description: 'Branch of Ophyd Devices to install'
+        description: "Branch of Ophyd Devices to install"
        required: false
        type: string

@@ -23,6 +23,7 @@ concurrency:

 permissions:
  pull-requests: write
+  contents: read

 jobs:
  check_pr_status:
@@ -33,6 +34,15 @@ jobs:
    if: needs.check_pr_status.outputs.branch-pr == ''
    uses: ./.github/workflows/formatter.yml

+  benchmark: # runs the reusable benchmark workflow once check_pr_status has finished
+    needs: [check_pr_status]
+    if: needs.check_pr_status.outputs.branch-pr == ''
+    permissions: # job-level grant handed to the called workflow
+      contents: write # NOTE(review): presumably for the called workflow's gh-pages publish job — confirm
+      issues: write # NOTE(review): purpose not visible here — confirm it is needed
+      pull-requests: write
+    uses: ./.github/workflows/benchmark.yml
+
  unit-test:
    needs: [check_pr_status, formatter]
    if: needs.check_pr_status.outputs.branch-pr == ''
@@ -69,9 +79,9 @@ jobs:
    uses: ./.github/workflows/child_repos.yml
    with:
      BEC_CORE_BRANCH: ${{ inputs.BEC_CORE_BRANCH || 'main' }}
-      OPHYD_DEVICES_BRANCH: ${{ inputs.OPHYD_DEVICES_BRANCH || 'main'}} 
+      OPHYD_DEVICES_BRANCH: ${{ inputs.OPHYD_DEVICES_BRANCH || 'main'}}
      BEC_WIDGETS_BRANCH: ${{ inputs.BEC_WIDGETS_BRANCH || github.head_ref || github.sha }}
-  
+
  plugin_repos:
    needs: [check_pr_status, formatter]
    if: needs.check_pr_status.outputs.branch-pr == ''
@@ -81,4 +91,4 @@ jobs:
      BEC_WIDGETS_BRANCH: ${{ inputs.BEC_WIDGETS_BRANCH || github.head_ref || github.sha }}

    secrets:
-      GH_READ_TOKEN: ${{ secrets.GH_READ_TOKEN }}
+      GH_READ_TOKEN: ${{ secrets.GH_READ_TOKEN }}
--- a/.github/workflows/pytest-matrix.yml
+++ b/.github/workflows/pytest-matrix.yml
@@ -1,25 +1,25 @@
 name: Run Pytest with different Python versions
-on: 
+on:
  workflow_call:
    inputs:
      pr_number:
-        description: 'Pull request number'
+        description: "Pull request number"
        required: false
        type: number
      BEC_CORE_BRANCH:
-        description: 'Branch of BEC Core to install'
+        description: "Branch of BEC Core to install"
        required: false
-        default: 'main'
+        default: "main"
        type: string
      OPHYD_DEVICES_BRANCH:
-        description: 'Branch of Ophyd Devices to install'
+        description: "Branch of Ophyd Devices to install"
        required: false
-        default: 'main'
+        default: "main"
        type: string
      BEC_WIDGETS_BRANCH:
-        description: 'Branch of BEC Widgets to install'
+        description: "Branch of BEC Widgets to install"
        required: false
-        default: 'main'
+        default: "main"
        type: string

 jobs:
@@ -30,15 +30,14 @@ jobs:
        python-version: ["3.11", "3.12", "3.13"]

    env:
-      BEC_WIDGETS_BRANCH: main  # Set the branch you want for bec_widgets
-      BEC_CORE_BRANCH: main        # Set the branch you want for bec
-      OPHYD_DEVICES_BRANCH: main   # Set the branch you want for ophyd_devices
+      BEC_WIDGETS_BRANCH: main # Set the branch you want for bec_widgets
+      BEC_CORE_BRANCH: main # Set the branch you want for bec
+      OPHYD_DEVICES_BRANCH: main # Set the branch you want for ophyd_devices
      PROJECT_PATH: ${{ github.repository }}
      QTWEBENGINE_DISABLE_SANDBOX: 1
      QT_QPA_PLATFORM: "offscreen"

    steps:
-
      - name: Checkout BEC Widgets
        uses: actions/checkout@v4
        with:
@@ -56,4 +55,4 @@ jobs:
      - name: Run Pytest
        run: |
          pip install pytest pytest-random-order
-          pytest -v --junitxml=report.xml --random-order ./tests/unit_tests
+          pytest -v --junitxml=report.xml --random-order --ignore=tests/unit_tests/benchmarks ./tests/unit_tests
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -1,32 +1,30 @@
 name: Run Pytest with Coverage
-on: 
+on:
  workflow_call:
    inputs:
      pr_number:
-        description: 'Pull request number'
+        description: "Pull request number"
        required: false
        type: number
      BEC_CORE_BRANCH:
-        description: 'Branch of BEC Core to install'
+        description: "Branch of BEC Core to install"
        required: false
-        default: 'main'
+        default: "main"
        type: string
      OPHYD_DEVICES_BRANCH:
-        description: 'Branch of Ophyd Devices to install'
+        description: "Branch of Ophyd Devices to install"
        required: false
-        default: 'main'
+        default: "main"
        type: string
      BEC_WIDGETS_BRANCH:
-        description: 'Branch of BEC Widgets to install'
+        description: "Branch of BEC Widgets to install"
        required: false
-        default: 'main'
+        default: "main"
        type: string
    secrets:
      CODECOV_TOKEN:
        required: true

-
-
 permissions:
  pull-requests: write

@@ -55,7 +53,7 @@ jobs:

      - name: Run Pytest with Coverage
        id: coverage
-        run: pytest --random-order --cov=bec_widgets --cov-config=pyproject.toml --cov-branch --cov-report=xml --no-cov-on-fail tests/unit_tests/
+        run: pytest --random-order --cov=bec_widgets --cov-config=pyproject.toml --cov-branch --cov-report=xml --no-cov-on-fail --ignore=tests/unit_tests/benchmarks tests/unit_tests/

      - name: Upload test artifacts
        uses: actions/upload-artifact@v4
@@ -69,4 +67,4 @@ jobs:
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
-          slug: bec-project/bec_widgets
+          slug: bec-project/bec_widgets
--- a/tests/benchmarks/hyperfine/benchmark_import_bec_widgets.sh
+++ b/tests/benchmarks/hyperfine/benchmark_import_bec_widgets.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+# BENCHMARK_TITLE: Import bec_widgets
+set -euo pipefail
+
+python -c 'import bec_widgets; print(bec_widgets.__file__)' # new interpreter per run, so Python startup is part of the timing
--- a/tests/benchmarks/hyperfine/benchmark_launch_bec_with_companion.sh
+++ b/tests/benchmarks/hyperfine/benchmark_launch_bec_with_companion.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+# BENCHMARK_TITLE: BEC IPython client with companion app
+set -euo pipefail
+
+bec --post-startup-file tests/benchmarks/hyperfine/utils/exit_bec_startup.py # relative path: run from the repository root
--- a/tests/benchmarks/hyperfine/benchmark_launch_bec_without_companion.sh
+++ b/tests/benchmarks/hyperfine/benchmark_launch_bec_without_companion.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+# BENCHMARK_TITLE: BEC IPython client without companion app
+set -euo pipefail
+
+bec --nogui --post-startup-file tests/benchmarks/hyperfine/utils/exit_bec_startup.py # relative path: run from the repository root
--- a/tests/benchmarks/hyperfine/utils/exit_bec_startup.py
+++ b/tests/benchmarks/hyperfine/utils/exit_bec_startup.py
@@ -0,0 +1,5 @@
+# Post-startup file used by the hyperfine launch benchmarks: ask the IPython shell to exit right away.
+
+_ip = get_ipython()  # noqa: F821 -- presumably injected by the IPython session that executes this file
+_ip.confirm_exit = False  # turn off IPython's exit confirmation
+_ip.ask_exit()
--- a/tests/unit_tests/benchmarks/test_dock_area_benchmark.py
+++ b/tests/unit_tests/benchmarks/test_dock_area_benchmark.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+import pytest
+
+from bec_widgets.widgets.containers.dock_area.dock_area import BECDockArea
+from bec_widgets.widgets.plots.waveform.waveform import Waveform
+from tests.unit_tests.client_mocks import mocked_client
+
+
+@pytest.fixture
+def dock_area(qtbot, mocked_client):  # yields a BECDockArea built on the mocked client
+    widget = BECDockArea(client=mocked_client)
+    qtbot.addWidget(widget)  # register the widget with pytest-qt
+    qtbot.waitExposed(widget)  # NOTE(review): pytest-qt documents waitExposed as a context manager; a bare call presumably does not wait — confirm
+    yield widget
+
+
+def test_add_waveform_to_dock_area(benchmark, dock_area, qtbot, mocked_client):
+    """Time ``dock_area.new("Waveform")`` on an already constructed dock area."""
+
+    def add_waveform():
+        # Invoked by the ``benchmark`` fixture; always targets the same ``dock_area`` instance.
+        dock_area.new("Waveform")
+        return dock_area
+
+    result = benchmark(add_waveform)
+    assert result is not None