diff --git a/.github/scripts/aggregate_benchmarks.py b/.github/scripts/aggregate_benchmarks.py
deleted file mode 100644
index 22818257..00000000
--- a/.github/scripts/aggregate_benchmarks.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#!/usr/bin/env python3
-"""Aggregate and merge benchmark JSON files.
-
-The workflow runs the same benchmark suite on multiple independent runners.
-This script reads every JSON file produced by those attempts, normalizes the
-contained benchmark values, and writes a compact mapping JSON where each value is
-the median across attempts. It can also merge independent hyperfine JSON files
-from one runner into a single hyperfine-style JSON file.
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import statistics
-from pathlib import Path
-from typing import Any
-
-from compare_benchmarks import Benchmark, extract_benchmarks
-
-
-def collect_benchmarks(paths: list[Path]) -> dict[str, list[Benchmark]]:
-    """Collect benchmarks from multiple JSON files.
-
-    Args:
-        paths (list[Path]): Paths to hyperfine, pytest-benchmark, or compact
-            mapping JSON files.
-
-    Returns:
-        dict[str, list[Benchmark]]: Benchmarks grouped by benchmark name.
-    """
-
-    collected: dict[str, list[Benchmark]] = {}
-    for path in paths:
-        for name, benchmark in extract_benchmarks(path).items():
-            collected.setdefault(name, []).append(benchmark)
-    return collected
-
-
-def aggregate(collected: dict[str, list[Benchmark]]) -> dict[str, dict[str, object]]:
-    """Aggregate grouped benchmarks using the median value.
-
-    Args:
-        collected (dict[str, list[Benchmark]]): Benchmarks grouped by benchmark
-            name.
-
-    Returns:
-        dict[str, dict[str, object]]: Compact mapping JSON data. Each benchmark
-        contains ``value``, ``unit``, ``metric``, ``attempts``, and
-        ``attempt_values``.
-    """
-
-    aggregated: dict[str, dict[str, object]] = {}
-    for name, benchmarks in sorted(collected.items()):
-        values = [benchmark.value for benchmark in benchmarks]
-        unit = next((benchmark.unit for benchmark in benchmarks if benchmark.unit), "")
-        metric = next((benchmark.metric for benchmark in benchmarks if benchmark.metric), "value")
-        aggregated[name] = {
-            "value": statistics.median(values),
-            "unit": unit,
-            "metric": f"median-of-attempt-{metric}",
-            "attempts": len(values),
-            "attempt_values": values,
-        }
-    return aggregated
-
-
-def merge_hyperfine_results(paths: list[Path]) -> dict[str, Any]:
-    """Merge hyperfine result files.
-
-    Args:
-        paths (list[Path]): Hyperfine JSON files to merge.
-
-    Returns:
-        dict[str, Any]: Hyperfine-style JSON object containing all result rows.
-
-    Raises:
-        ValueError: If any file has no hyperfine ``results`` list.
-    """
-
-    merged: dict[str, Any] = {"results": []}
-    for path in paths:
-        data = json.loads(path.read_text(encoding="utf-8"))
-        results = data.get("results", []) if isinstance(data, dict) else None
-        if not isinstance(results, list):
-            raise ValueError(f"{path} has no hyperfine results list")
-        merged["results"].extend(results)
-    return merged
-
-
-def main_from_paths(input_dir: Path, output: Path) -> int:
-    """Aggregate all JSON files in a directory and write the result.
-
-    Args:
-        input_dir (Path): Directory containing benchmark JSON files.
-        output (Path): Path where the aggregate JSON should be written.
-
-    Returns:
-        int: Always ``0`` on success.
-
-    Raises:
-        ValueError: If no JSON files are found in ``input_dir``.
-    """
-
-    paths = sorted(input_dir.rglob("*.json"))
-    if not paths:
-        raise ValueError(f"No benchmark JSON files found in {input_dir}")
-
-    output.parent.mkdir(parents=True, exist_ok=True)
-    output.write_text(
-        json.dumps(aggregate(collect_benchmarks(paths)), indent=2, sort_keys=True) + "\n",
-        encoding="utf-8",
-    )
-    return 0
-
-
-def merge_from_paths(input_dir: Path, output: Path) -> int:
-    """Merge all hyperfine JSON files in a directory and write the result.
-
-    Args:
-        input_dir (Path): Directory containing hyperfine JSON files.
-        output (Path): Path where the merged JSON should be written.
-
-    Returns:
-        int: Always ``0`` on success.
-
-    Raises:
-        ValueError: If no JSON files are found in ``input_dir``.
-    """
-
-    paths = sorted(input_dir.glob("*.json"))
-    if not paths:
-        raise ValueError(f"No hyperfine JSON files found in {input_dir}")
-
-    output.parent.mkdir(parents=True, exist_ok=True)
-    output.write_text(
-        json.dumps(merge_hyperfine_results(paths), indent=2, sort_keys=True) + "\n",
-        encoding="utf-8",
-    )
-    return 0
-
-
-def main() -> int:
-    """Run the benchmark aggregation command line interface.
-
-    Returns:
-        int: Always ``0`` on success.
-    """
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--mode",
-        choices=("aggregate", "merge-hyperfine"),
-        default="aggregate",
-        help="Operation to perform.",
-    )
-    parser.add_argument("--input-dir", required=True, type=Path)
-    parser.add_argument("--output", required=True, type=Path)
-    args = parser.parse_args()
-    if args.mode == "merge-hyperfine":
-        return merge_from_paths(input_dir=args.input_dir, output=args.output)
-    return main_from_paths(input_dir=args.input_dir, output=args.output)
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
diff --git a/.github/scripts/compare_benchmarks.py b/.github/scripts/compare_benchmarks.py
deleted file mode 100644
index 2bb2f9bd..00000000
--- a/.github/scripts/compare_benchmarks.py
+++ /dev/null
@@ -1,411 +0,0 @@
-#!/usr/bin/env python3
-"""Compare benchmark JSON files and write a GitHub Actions summary.
-
-The script supports JSON emitted by hyperfine, JSON emitted by pytest-benchmark,
-and a compact mapping format generated by ``aggregate_benchmarks.py``. Timing
-formats prefer median values and fall back to mean values when median values are
-not present.
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import math
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any
-
-
-@dataclass(frozen=True)
-class Benchmark:
-    """Normalized benchmark result.
-
-    Attributes:
-        name (str): Stable benchmark name used to match baseline and current results.
-        value (float): Numeric benchmark value used for comparison.
-        unit (str): Display unit for the value, for example ``"s"``.
-        metric (str): Source metric name, for example ``"median"`` or ``"mean"``.
-    """
-
-    name: str
-    value: float
-    unit: str
-    metric: str = "value"
-
-
-@dataclass(frozen=True)
-class Comparison:
-    """Comparison between one baseline benchmark and one current benchmark.
-
-    Attributes:
-        name (str): Benchmark name.
-        baseline (float): Baseline benchmark value.
-        current (float): Current benchmark value.
-        delta_percent (float): Percent change from baseline to current.
-        unit (str): Display unit for both values.
-        metric (str): Current result metric used for comparison.
-        regressed (bool): Whether the change exceeds the configured threshold.
-    """
-
-    name: str
-    baseline: float
-    current: float
-    delta_percent: float
-    unit: str
-    metric: str
-    regressed: bool
-
-
-def _read_json(path: Path) -> Any:
-    """Read JSON data from a file.
-
-    Args:
-        path (Path): Path to the JSON file.
-
-    Returns:
-        Any: Parsed JSON value.
-    """
-
-    with path.open("r", encoding="utf-8") as stream:
-        return json.load(stream)
-
-
-def _as_float(value: Any) -> float | None:
-    """Convert a value to a finite float.
-
-    Args:
-        value (Any): Value to convert.
-
-    Returns:
-        float | None: Converted finite float, or ``None`` if conversion fails.
-    """
-
-    try:
-        result = float(value)
-    except (TypeError, ValueError):
-        return None
-    if math.isfinite(result):
-        return result
-    return None
-
-
-def _extract_hyperfine(data: dict[str, Any]) -> dict[str, Benchmark]:
-    """Extract normalized benchmarks from hyperfine JSON.
-
-    Args:
-        data (dict[str, Any]): Parsed hyperfine JSON object.
-
-    Returns:
-        dict[str, Benchmark]: Benchmarks keyed by command name.
-    """
-
-    benchmarks: dict[str, Benchmark] = {}
-    for result in data.get("results", []):
-        if not isinstance(result, dict):
-            continue
-        name = str(result.get("command") or result.get("name") or "").strip()
-        metric = "median"
-        value = _as_float(result.get(metric))
-        if value is None:
-            metric = "mean"
-            value = _as_float(result.get(metric))
-        if name and value is not None:
-            benchmarks[name] = Benchmark(name=name, value=value, unit="s", metric=metric)
-    return benchmarks
-
-
-def _extract_pytest_benchmark(data: dict[str, Any]) -> dict[str, Benchmark]:
-    """Extract normalized benchmarks from pytest-benchmark JSON.
-
-    Args:
-        data (dict[str, Any]): Parsed pytest-benchmark JSON object.
-
-    Returns:
-        dict[str, Benchmark]: Benchmarks keyed by full benchmark name.
-    """
-
-    benchmarks: dict[str, Benchmark] = {}
-    for benchmark in data.get("benchmarks", []):
-        if not isinstance(benchmark, dict):
-            continue
-
-        name = str(benchmark.get("fullname") or benchmark.get("name") or "").strip()
-        stats = benchmark.get("stats", {})
-        value = None
-        metric = "median"
-        if isinstance(stats, dict):
-            value = _as_float(stats.get(metric))
-            if value is None:
-                metric = "mean"
-                value = _as_float(stats.get(metric))
-        if name and value is not None:
-            benchmarks[name] = Benchmark(name=name, value=value, unit="s", metric=metric)
-    return benchmarks
-
-
-def _extract_simple_mapping(data: dict[str, Any]) -> dict[str, Benchmark]:
-    """Extract normalized benchmarks from a compact mapping JSON object.
-
-    Args:
-        data (dict[str, Any]): Parsed mapping where each benchmark is either a
-            raw number or an object containing ``value``, ``unit``, and ``metric``.
-
-    Returns:
-        dict[str, Benchmark]: Benchmarks keyed by mapping key.
-    """
-
-    benchmarks: dict[str, Benchmark] = {}
-
-    for name, raw_value in data.items():
-        if name in {"version", "context", "commit", "timestamp"}:
-            continue
-
-        value = _as_float(raw_value)
-        unit = ""
-        metric = "value"
-        if value is None and isinstance(raw_value, dict):
-            value = _as_float(raw_value.get("value"))
-            unit = str(raw_value.get("unit") or "")
-            metric = str(raw_value.get("metric") or "value")
-
-        if value is not None:
-            benchmarks[str(name)] = Benchmark(name=str(name), value=value, unit=unit, metric=metric)
-
-    return benchmarks
-
-
-def extract_benchmarks(path: Path) -> dict[str, Benchmark]:
-    """Extract normalized benchmarks from a supported JSON file.
-
-    Args:
-        path (Path): Path to a hyperfine, pytest-benchmark, or compact mapping
-            JSON file.
-
-    Returns:
-        dict[str, Benchmark]: Normalized benchmarks keyed by name.
-
-    Raises:
-        ValueError: If the JSON root is not an object or no supported benchmark
-            entries can be extracted.
-    """
-
-    data = _read_json(path)
-    if not isinstance(data, dict):
-        raise ValueError(f"{path} must contain a JSON object")
-
-    extractors = (_extract_hyperfine, _extract_pytest_benchmark, _extract_simple_mapping)
-    for extractor in extractors:
-        benchmarks = extractor(data)
-        if benchmarks:
-            return benchmarks
-
-    raise ValueError(f"No supported benchmark entries found in {path}")
-
-
-def compare_benchmarks(
-    baseline: dict[str, Benchmark],
-    current: dict[str, Benchmark],
-    threshold_percent: float,
-    higher_is_better: bool,
-) -> tuple[list[Comparison], list[str], list[str]]:
-    """Compare baseline benchmarks with current benchmarks.
-
-    Args:
-        baseline (dict[str, Benchmark]): Baseline benchmarks keyed by name.
-        current (dict[str, Benchmark]): Current benchmarks keyed by name.
-        threshold_percent (float): Regression threshold in percent.
-        higher_is_better (bool): If ``True``, lower current values are treated as
-            regressions. If ``False``, higher current values are treated as
-            regressions.
-
-    Returns:
-        tuple[list[Comparison], list[str], list[str]]: Comparisons for common
-        benchmark names, names missing from current results, and names newly
-        present in current results.
-    """
-
-    comparisons: list[Comparison] = []
-    missing_in_current: list[str] = []
-    new_in_current: list[str] = []
-
-    for name, baseline_benchmark in sorted(baseline.items()):
-        current_benchmark = current.get(name)
-        if current_benchmark is None:
-            missing_in_current.append(name)
-            continue
-
-        if baseline_benchmark.value == 0:
-            delta_percent = 0.0
-        else:
-            delta_percent = (
-                (current_benchmark.value - baseline_benchmark.value)
-                / abs(baseline_benchmark.value)
-                * 100
-            )
-
-        if higher_is_better:
-            regressed = delta_percent <= -threshold_percent
-        else:
-            regressed = delta_percent >= threshold_percent
-
-        comparisons.append(
-            Comparison(
-                name=name,
-                baseline=baseline_benchmark.value,
-                current=current_benchmark.value,
-                delta_percent=delta_percent,
-                unit=current_benchmark.unit or baseline_benchmark.unit,
-                metric=current_benchmark.metric,
-                regressed=regressed,
-            )
-        )
-
-    for name in sorted(set(current) - set(baseline)):
-        new_in_current.append(name)
-
-    return comparisons, missing_in_current, new_in_current
-
-
-def _format_value(value: float, unit: str) -> str:
-    """Format a benchmark value for Markdown output.
-
-    Args:
-        value (float): Numeric benchmark value.
-        unit (str): Display unit.
-
-    Returns:
-        str: Formatted value with optional unit suffix.
-    """
-
-    suffix = f" {unit}" if unit else ""
-    return f"{value:.6g}{suffix}"
-
-
-def write_summary(
-    path: Path,
-    comparisons: list[Comparison],
-    missing_in_current: list[str],
-    new_in_current: list[str],
-    threshold_percent: float,
-    higher_is_better: bool,
-) -> None:
-    """Write a Markdown benchmark comparison summary.
-
-    Args:
-        path (Path): Path where the summary should be written.
-        comparisons (list[Comparison]): Comparison rows for matching benchmarks.
-        missing_in_current (list[str]): Baseline benchmark names missing from the
-            current result.
-        new_in_current (list[str]): Current benchmark names not present in the
-            baseline result.
-        threshold_percent (float): Regression threshold in percent.
-        higher_is_better (bool): Whether higher benchmark values are considered
-            better.
-    """
-
-    regressions = [comparison for comparison in comparisons if comparison.regressed]
-    direction = "higher is better" if higher_is_better else "lower is better"
-    sorted_comparisons = sorted(comparisons, key=lambda comparison: comparison.name)
-
-    lines = [
-        "<!-- bw-benchmark-comment -->",
-        "## Benchmark comparison",
-        "",
-        f"Threshold: {threshold_percent:g}% ({direction}).",
-    ]
-    lines.append("")
-
-    if regressions:
-        lines.extend(
-            [
-                f"{len(regressions)} benchmark(s) regressed beyond the configured threshold.",
-                "",
-                "| Benchmark | Baseline | Current | Change |",
-                "| --- | ---: | ---: | ---: |",
-            ]
-        )
-        for comparison in regressions:
-            lines.append(
-                "| "
-                f"{comparison.name} | "
-                f"{_format_value(comparison.baseline, comparison.unit)} | "
-                f"{_format_value(comparison.current, comparison.unit)} | "
-                f"{comparison.delta_percent:+.2f}% |"
-            )
-    else:
-        lines.append("No benchmark regression exceeded the configured threshold.")
-
-    if sorted_comparisons:
-        lines.extend(
-            [
-                "",
-                "<details>",
-                "<summary>All benchmark results</summary>",
-                "",
-                "| Benchmark | Baseline | Current | Change | Status |",
-                "| --- | ---: | ---: | ---: | --- |",
-            ]
-        )
-        for comparison in sorted_comparisons:
-            status = "regressed" if comparison.regressed else "ok"
-            lines.append(
-                "| "
-                f"{comparison.name} | "
-                f"{_format_value(comparison.baseline, comparison.unit)} | "
-                f"{_format_value(comparison.current, comparison.unit)} | "
-                f"{comparison.delta_percent:+.2f}% | "
-                f"{status} |"
-            )
-        lines.extend(["", "</details>"])
-
-    if missing_in_current:
-        lines.extend(["", "Missing benchmarks in the current run:"])
-        lines.extend(f"- `{name}`" for name in missing_in_current)
-
-    if new_in_current:
-        lines.extend(["", "New benchmarks in the current run:"])
-        lines.extend(f"- `{name}`" for name in new_in_current)
-
-    path.parent.mkdir(parents=True, exist_ok=True)
-    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
-
-
-def main() -> int:
-    """Run the benchmark comparison command line interface.
-
-    Returns:
-        int: ``1`` when a regression exceeds the threshold, otherwise ``0``.
-    """
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--baseline", required=True, type=Path)
-    parser.add_argument("--current", required=True, type=Path)
-    parser.add_argument("--summary", required=True, type=Path)
-    parser.add_argument("--threshold-percent", required=True, type=float)
-    parser.add_argument("--higher-is-better", action="store_true")
-    args = parser.parse_args()
-
-    baseline = extract_benchmarks(args.baseline)
-    current = extract_benchmarks(args.current)
-    comparisons, missing_in_current, new_in_current = compare_benchmarks(
-        baseline=baseline,
-        current=current,
-        threshold_percent=args.threshold_percent,
-        higher_is_better=args.higher_is_better,
-    )
-
-    write_summary(
-        path=args.summary,
-        comparisons=comparisons,
-        missing_in_current=missing_in_current,
-        new_in_current=new_in_current,
-        threshold_percent=args.threshold_percent,
-        higher_is_better=args.higher_is_better,
-    )
-
-    return 1 if any(comparison.regressed for comparison in comparisons) else 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
diff --git a/.github/scripts/run_benchmarks.sh b/.github/scripts/run_benchmarks.sh
deleted file mode 100644
index 64c23790..00000000
--- a/.github/scripts/run_benchmarks.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-mkdir -p benchmark-results
-benchmark_json="${BENCHMARK_JSON:-benchmark-results/current.json}"
-benchmark_root="$(dirname "$benchmark_json")"
-hyperfine_benchmark_dir="${BENCHMARK_HYPERFINE_DIR:-tests/benchmarks/hyperfine}"
-pytest_benchmark_dirs="${BENCHMARK_PYTEST_DIRS:-${BENCHMARK_PYTEST_DIR:-}}"
-benchmark_work_dir="$benchmark_root/raw-results"
-hyperfine_json_dir="$benchmark_work_dir/hyperfine"
-pytest_json="$benchmark_work_dir/pytest.json"
-
-shopt -s nullglob
-benchmark_scripts=()
-benchmark_scripts=("$hyperfine_benchmark_dir"/benchmark_*.sh)
-shopt -u nullglob
-
-pytest_dirs=()
-for pytest_benchmark_dir in $pytest_benchmark_dirs; do
-  if [ -d "$pytest_benchmark_dir" ]; then
-    pytest_dirs+=("$pytest_benchmark_dir")
-  else
-    echo "Pytest benchmark directory not found: $pytest_benchmark_dir" >&2
-    exit 1
-  fi
-done
-
-if [ "${#benchmark_scripts[@]}" -eq 0 ] && [ "${#pytest_dirs[@]}" -eq 0 ]; then
-  echo "No benchmark scripts or pytest benchmarks found" >&2
-  exit 1
-fi
-
-echo "Benchmark Python: $(command -v python)"
-python -c 'import sys; print(sys.version)'
-
-rm -rf "$benchmark_work_dir"
-mkdir -p "$hyperfine_json_dir"
-
-if [ "${#benchmark_scripts[@]}" -gt 0 ]; then
-  for benchmark_script in "${benchmark_scripts[@]}"; do
-    title="$(sed -n 's/^# BENCHMARK_TITLE:[[:space:]]*//p' "$benchmark_script" | head -n 1)"
-    if [ -z "$title" ]; then
-      title="$(basename "$benchmark_script" .sh)"
-    fi
-    benchmark_name="$(basename "$benchmark_script" .sh)"
-    benchmark_result_json="$hyperfine_json_dir/$benchmark_name.json"
-    echo "Preflight benchmark script: $benchmark_script"
-    bash "$benchmark_script"
-
-    hyperfine \
-      --show-output \
-      --warmup 1 \
-      --runs 5 \
-      --command-name "$title" \
-      --export-json "$benchmark_result_json" \
-      "bash $(printf "%q" "$benchmark_script")"
-  done
-fi
-
-if [ "${#pytest_dirs[@]}" -gt 0 ]; then
-  pytest \
-    -q "${pytest_dirs[@]}" \
-    --benchmark-only \
-    --benchmark-json "$pytest_json"
-fi
-
-python .github/scripts/aggregate_benchmarks.py \
-  --input-dir "$benchmark_work_dir" \
-  --output "$benchmark_json"
diff --git a/.github/scripts/setup_benchmark_env.sh b/.github/scripts/setup_benchmark_env.sh
new file mode 100755
index 00000000..b001ae1c
--- /dev/null
+++ b/.github/scripts/setup_benchmark_env.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+# Prepare the BEC benchmark environment: clone the dependencies, install them into
+# a conda environment, and start the BEC services in the background.
+set -euo pipefail
+
+# Branches, Python version, and the service startup timeout (in seconds) can be
+# overridden through the environment.
+bec_core_branch="${BEC_CORE_BRANCH:-main}"
+ophyd_devices_branch="${OPHYD_DEVICES_BRANCH:-main}"
+plugin_repo_branch="${PLUGIN_REPO_BRANCH:-main}"
+python_version="${PYTHON_VERSION:-3.11}"
+service_startup_timeout="${BEC_SERVICE_STARTUP_TIMEOUT:-30}"
+
+# Load conda's shell integration so "conda activate" is available below.
+if command -v conda >/dev/null 2>&1; then
+  conda_base="$(conda info --base)"
+  source "$conda_base/etc/profile.d/conda.sh"
+fi
+
+echo "Using branch ${bec_core_branch} of BEC CORE"
+git clone --branch "$bec_core_branch" https://github.com/bec-project/bec.git
+
+echo "Using branch ${ophyd_devices_branch} of OPHYD_DEVICES"
+git clone --branch "$ophyd_devices_branch" https://github.com/bec-project/ophyd_devices.git
+
+echo "Using branch ${plugin_repo_branch} of bec_testing_plugin"
+git clone --branch "$plugin_repo_branch" https://github.com/bec-project/bec_testing_plugin.git
+
+conda create -q -n test-environment "python=${python_version}"
+conda activate test-environment
+
+cd bec
+source ./bin/install_bec_dev.sh -t
+cd ..
+
+# Quote the extras spec so the shell cannot expand "[...]" as a glob pattern.
+python -m pip install -e ./ophyd_devices -e ".[dev,pyside6]" -e ./bec_testing_plugin
+
+benchmark_tmp_dir="$(mktemp -d)"
+export BEC_SERVICE_CONFIG="$benchmark_tmp_dir/services_config.yaml"
+ready_file="$benchmark_tmp_dir/ready"
+supervisor_log="$benchmark_tmp_dir/bec-benchmark-services.log"
+
+# Launch the service supervisor in the background; it writes $ready_file once the
+# services have been started.
+python .github/scripts/start_bec_benchmark_services.py \
+  --files-path "$benchmark_tmp_dir" \
+  --services-config "$BEC_SERVICE_CONFIG" \
+  --ready-file "$ready_file" \
+  > "$supervisor_log" 2>&1 &
+supervisor_pid=$!
+
+# Stop the supervisor and delete the temporary directory when this shell exits.
+cleanup_benchmark_services() {
+  if kill -0 "$supervisor_pid" >/dev/null 2>&1; then
+    # The supervisor can exit between the check above and this signal; tolerate
+    # that instead of aborting the cleanup under "set -e".
+    kill "$supervisor_pid" >/dev/null 2>&1 || true
+    wait "$supervisor_pid" || true
+  fi
+  rm -rf "$benchmark_tmp_dir"
+}
+trap cleanup_benchmark_services EXIT
+
+# Wait for the ready file, failing early if the supervisor dies or the timeout elapses.
+deadline=$((SECONDS + service_startup_timeout))
+while [ ! -f "$ready_file" ]; do
+  if ! kill -0 "$supervisor_pid" >/dev/null 2>&1; then
+    cat "$supervisor_log" >&2 || true
+    echo "BEC benchmark service supervisor exited before becoming ready" >&2
+    exit 1
+  fi
+  if [ "$SECONDS" -ge "$deadline" ]; then
+    cat "$supervisor_log" >&2 || true
+    echo "Timed out waiting for BEC benchmark services" >&2
+    exit 1
+  fi
+  sleep 0.2
+done
+
+cat "$supervisor_log"
+
+# NOTE(review): the EXIT trap above stops the services and removes $benchmark_tmp_dir
+# when this shell exits, so the path exported below is only valid while this shell
+# is alive -- confirm that later workflow steps do not depend on it.
+# GITHUB_ENV is provided by GitHub Actions; skip the export when it is unset so
+# "set -u" does not abort the script.
+if [ -n "${GITHUB_ENV:-}" ]; then
+  echo "BEC_SERVICE_CONFIG=$BEC_SERVICE_CONFIG" >> "$GITHUB_ENV"
+fi
diff --git a/.github/scripts/run_with_bec_servers.py b/.github/scripts/start_bec_benchmark_services.py
old mode 100644
new mode 100755
similarity index 56%
rename from .github/scripts/run_with_bec_servers.py
rename to .github/scripts/start_bec_benchmark_services.py
index 719e25f0..1666c7a6
--- a/.github/scripts/run_with_bec_servers.py
+++ b/.github/scripts/start_bec_benchmark_services.py
@@ -1,13 +1,11 @@
 #!/usr/bin/env python3
-"""Run a command with BEC e2e services available."""
+"""Start BEC services for benchmark workflows and keep them alive."""
 
 from __future__ import annotations
 
 import argparse
-import os
 import shutil
 import subprocess
-import tempfile
 import time
 from pathlib import Path
 
@@ -52,18 +50,14 @@ def _start_redis(files_path: Path, host: str, port: int) -> subprocess.Popen:
     )
 
 
-def _write_configs(files_path: Path, host: str, port: int) -> Path:
-    test_config = files_path / "test_config.yaml"
-    services_config = files_path / "services_config.yaml"
-
+def _write_configs(files_path: Path, services_config: Path, host: str, port: int) -> None:
     bec_lib_path = Path(bec_lib.__file__).resolve().parent
-    shutil.copyfile(bec_lib_path / "tests" / "test_config.yaml", test_config)
+    shutil.copyfile(bec_lib_path / "tests" / "test_config.yaml", files_path / "test_config.yaml")
 
     service_config = ServiceConfigModel(
         redis={"host": host, "port": port}, file_writer={"base_path": str(files_path)}
     )
     services_config.write_text(service_config.model_dump_json(indent=4), encoding="utf-8")
-    return services_config
 
 
 def _load_demo_config(services_config: Path) -> None:
@@ -78,44 +72,70 @@ def _load_demo_config(services_config: Path) -> None:
 
 def main() -> int:
+    """Start Redis and the BEC services, then block until asked to stop.
+
+    The ready file is written after the services have been started and
+    ``_load_demo_config`` has returned. The process then stays alive until it
+    receives SIGTERM or Redis exits; the ``finally`` block tears everything down.
+
+    Returns:
+        int: ``1`` if Redis exited on its own. A SIGTERM shutdown leaves through
+        ``SystemExit(0)`` instead of returning.
+    """
+
     parser = argparse.ArgumentParser()
-    parser.add_argument("command", nargs=argparse.REMAINDER)
+    parser.add_argument("--files-path", required=True, type=Path)
+    parser.add_argument("--services-config", required=True, type=Path)
+    parser.add_argument("--ready-file", required=True, type=Path)
     args = parser.parse_args()
 
-    if args.command[:1] == ["--"]:
-        args.command = args.command[1:]
-    if not args.command:
-        raise ValueError("No command provided")
-
     host = "127.0.0.1"
     port = 6379
 
-    with tempfile.TemporaryDirectory(prefix="bec-benchmark-") as tmp:
-        files_path = Path(tmp)
-        services_config = _write_configs(files_path, host, port)
-        redis_process = _start_redis(files_path, host, port)
-        processes = None
-        service_handler = None
+    # Function-scope import: only this entry point needs it.
+    import signal
+
+    def _exit_on_sigterm(signum, frame) -> None:
+        # The default SIGTERM action ends the interpreter without running ``finally``
+        # blocks, so raise SystemExit to let the teardown below execute. Restoring
+        # the default first lets a repeated SIGTERM end the process immediately.
+        signal.signal(signal.SIGTERM, signal.SIG_DFL)
+        raise SystemExit(0)
+
+    signal.signal(signal.SIGTERM, _exit_on_sigterm)
+
+    args.files_path.mkdir(parents=True, exist_ok=True)
+    _write_configs(args.files_path, args.services_config, host, port)
+    redis_process = _start_redis(args.files_path, host, port)
+    processes = None
+    service_handler = None
+    try:
+        _wait_for_redis(host, port)
+
+        from bec_server.bec_server_utils.service_handler import ServiceHandler
+
+        service_handler = ServiceHandler(
+            bec_path=args.files_path, config_path=args.services_config, interface="subprocess"
+        )
+        processes = service_handler.start()
+        _load_demo_config(args.services_config)
+        args.ready_file.write_text("ready\n", encoding="utf-8")
+        print("BEC benchmark services are ready", flush=True)
+
+        # Block here so the services outlive the readiness signal; leaving the
+        # ``try`` block would stop them immediately.
+        while redis_process.poll() is None:
+            time.sleep(0.2)
+        print("Redis exited unexpectedly; stopping BEC benchmark services", flush=True)
+    finally:
+        if service_handler is not None and processes is not None:
+            service_handler.stop(processes)
+        redis_process.terminate()
         try:
-            _wait_for_redis(host, port)
+            redis_process.wait(timeout=10)
+        except subprocess.TimeoutExpired:
+            redis_process.kill()
 
-            from bec_server.bec_server_utils.service_handler import ServiceHandler
-
-            service_handler = ServiceHandler(
-                bec_path=files_path, config_path=services_config, interface="subprocess"
-            )
-            processes = service_handler.start()
-            _load_demo_config(services_config)
-
-            env = os.environ.copy()
-            return subprocess.run(args.command, env=env, check=False).returncode
-        finally:
-            if service_handler is not None and processes is not None:
-                service_handler.stop(processes)
-            redis_process.terminate()
-            try:
-                redis_process.wait(timeout=10)
-            except subprocess.TimeoutExpired:
-                redis_process.kill()
+    return 1
 
 
 if __name__ == "__main__":
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index f8724689..c1f5805d 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -1,40 +1,24 @@
 name: BW Benchmarks
 
-on: [workflow_call]
+on: [ workflow_call ]
 
 permissions:
   contents: read
 
 env:
-  BENCHMARK_JSON: benchmark-results/current.json
-  BENCHMARK_BASELINE_JSON: gh-pages-benchmark-data/benchmarks/latest.json
-  BENCHMARK_SUMMARY: benchmark-results/summary.md
-  BENCHMARK_COMMAND: "bash .github/scripts/run_benchmarks.sh"
-  BENCHMARK_THRESHOLD_PERCENT: 10
-  BENCHMARK_HIGHER_IS_BETTER: false
+  QTWEBENGINE_DISABLE_SANDBOX: 1
+  QT_QPA_PLATFORM: "offscreen"
 
 jobs:
-  benchmark_attempt:
+  benchmark:
     runs-on: ubuntu-latest
-    continue-on-error: true
     permissions:
-      contents: read
+      contents: write
+      issues: write
+      pull-requests: write
     defaults:
       run:
         shell: bash -el {0}
-    strategy:
-      fail-fast: false
-      matrix:
-        attempt: [1, 2, 3]
-
-    env:
-      BENCHMARK_JSON: benchmark-results/current-${{ matrix.attempt }}.json
-      BEC_CORE_BRANCH: main
-      OPHYD_DEVICES_BRANCH: main
-      PLUGIN_REPO_BRANCH: main
-      BENCHMARK_PYTEST_DIRS: tests/unit_tests/benchmarks
-      QTWEBENGINE_DISABLE_SANDBOX: 1
-      QT_QPA_PLATFORM: "offscreen"
 
     steps:
       - name: Checkout BEC Widgets
@@ -50,190 +34,14 @@ jobs:
           auto-activate-base: true
           python-version: "3.11"
 
-      - name: Install system dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y libgl1 libegl1 x11-utils libxkbcommon-x11-0 libdbus-1-3 xvfb
-          sudo apt-get -y install libnss3 libxdamage1 libasound2t64 libatomic1 libxcursor1
-          sudo apt-get -y install ttyd hyperfine redis-server
-
-      - name: Install full e2e environment
-        run: |
-          echo -e "\033[35;1m Using branch $BEC_CORE_BRANCH of BEC CORE \033[0;m";
-          git clone --branch "$BEC_CORE_BRANCH" https://github.com/bec-project/bec.git
-          echo -e "\033[35;1m Using branch $OPHYD_DEVICES_BRANCH of OPHYD_DEVICES \033[0;m";
-          git clone --branch "$OPHYD_DEVICES_BRANCH" https://github.com/bec-project/ophyd_devices.git
-          export OHPYD_DEVICES_PATH=$PWD/ophyd_devices
-          echo -e "\033[35;1m Using branch $PLUGIN_REPO_BRANCH of bec_testing_plugin \033[0;m";
-          git clone --branch "$PLUGIN_REPO_BRANCH" https://github.com/bec-project/bec_testing_plugin.git
-          cd ./bec
-          conda create -q -n test-environment python=3.11
-          conda activate test-environment
-          source ./bin/install_bec_dev.sh -t
-          cd ../
-          python -m pip install -e ./ophyd_devices -e .[dev,pyside6] -e ./bec_testing_plugin pytest-benchmark
-
-          mkdir -p "$(dirname "$BENCHMARK_JSON")"
-          python .github/scripts/run_with_bec_servers.py -- bash -lc "$BENCHMARK_COMMAND"
-          test -s "$BENCHMARK_JSON"
-
-      - name: Upload benchmark artifact
-        uses: actions/upload-artifact@v4
+      - name: Run, compare, and publish benchmarks
+        uses: bec-project/benchmark_action@main
         with:
-          name: bw-benchmark-json-${{ matrix.attempt }}
-          path: ${{ env.BENCHMARK_JSON }}
-
-  benchmark:
-    needs: [benchmark_attempt]
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      issues: write
-      pull-requests: write
-
-    steps:
-      - name: Checkout BEC Widgets
-        uses: actions/checkout@v4
-        with:
-          repository: bec-project/bec_widgets
-          ref: ${{ github.event.pull_request.head.sha || github.sha }}
-
-      - name: Download benchmark attempts
-        uses: actions/download-artifact@v4
-        with:
-          pattern: bw-benchmark-json-*
-          path: benchmark-results/attempts
-          merge-multiple: true
-
-      - name: Aggregate benchmark attempts
-        run: |
-          python .github/scripts/aggregate_benchmarks.py \
-            --input-dir benchmark-results/attempts \
-            --output "$BENCHMARK_JSON"
-
-      - name: Upload aggregate benchmark artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: bw-benchmark-json
-          path: ${{ env.BENCHMARK_JSON }}
-
-      - name: Fetch gh-pages benchmark data
-        run: |
-          if git ls-remote --exit-code --heads origin gh-pages; then
-            git clone --depth=1 --branch gh-pages "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY.git" gh-pages-benchmark-data
-          else
-            mkdir -p gh-pages-benchmark-data
-          fi
-
-      - name: Compare with latest gh-pages benchmark
-        id: compare
-        continue-on-error: true
-        run: |
-          if [ ! -s "$BENCHMARK_BASELINE_JSON" ]; then
-            mkdir -p "$(dirname "$BENCHMARK_SUMMARY")"
-            {
-              echo "<!-- bw-benchmark-comment -->"
-              echo "## Benchmark comparison"
-              echo
-              echo "No benchmark baseline was found on gh-pages."
-            } > "$BENCHMARK_SUMMARY"
-            exit 0
-          fi
-
-          args=(
-            --baseline "$BENCHMARK_BASELINE_JSON"
-            --current "$BENCHMARK_JSON"
-            --summary "$BENCHMARK_SUMMARY"
-            --threshold-percent "$BENCHMARK_THRESHOLD_PERCENT"
-          )
-
-          if [ "$BENCHMARK_HIGHER_IS_BETTER" = "true" ]; then
-            args+=(--higher-is-better)
-          fi
-
-          set +e
-          python .github/scripts/compare_benchmarks.py "${args[@]}"
-          status=$?
-          set -e
-
-          if [ ! -s "$BENCHMARK_SUMMARY" ]; then
-            mkdir -p "$(dirname "$BENCHMARK_SUMMARY")"
-            {
-              echo "<!-- bw-benchmark-comment -->"
-              echo "## Benchmark comparison"
-              echo
-              echo "Benchmark comparison failed before writing a summary."
-            } > "$BENCHMARK_SUMMARY"
-          fi
-
-          exit "$status"
-
-      - name: Find existing benchmark PR comment
-        if: github.event_name == 'pull_request'
-        id: fc
-        uses: peter-evans/find-comment@v3
-        with:
-          issue-number: ${{ github.event.pull_request.number }}
-          comment-author: github-actions[bot]
-          body-includes: "<!-- bw-benchmark-comment -->"
-
-      - name: Create or update benchmark PR comment
-        if: github.event_name == 'pull_request'
-        uses: peter-evans/create-or-update-comment@v5
-        with:
-          issue-number: ${{ github.event.pull_request.number }}
-          comment-id: ${{ steps.fc.outputs.comment-id }}
-          body-path: ${{ env.BENCHMARK_SUMMARY }}
-          edit-mode: replace
-
-      - name: Fail on benchmark regression
-        if: github.event_name == 'pull_request' && steps.compare.outcome == 'failure'
-        run: exit 1
-
-  publish:
-    needs: [benchmark]
-    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-
-    steps:
-      - name: Checkout BEC Widgets
-        uses: actions/checkout@v4
-        with:
-          repository: bec-project/bec_widgets
-          ref: ${{ github.sha }}
-
-      - name: Download aggregate benchmark artifact
-        uses: actions/download-artifact@v4
-        with:
-          name: bw-benchmark-json
-          path: .
-
-      - name: Prepare gh-pages for publishing
-        run: |
-          # Clean up any existing worktree/directory
-          if [ -d gh-pages-benchmark-data ]; then
-            git worktree remove gh-pages-benchmark-data --force || rm -rf gh-pages-benchmark-data
-          fi
-
-          if git ls-remote --exit-code --heads origin gh-pages; then
-            git fetch --depth=1 origin gh-pages
-            git worktree add gh-pages-benchmark-data FETCH_HEAD
-          else
-            git worktree add --detach gh-pages-benchmark-data
-            git -C gh-pages-benchmark-data checkout --orphan gh-pages
-            git -C gh-pages-benchmark-data rm -rf .
-          fi
-
-      - name: Publish benchmark data to gh-pages
-        working-directory: gh-pages-benchmark-data
-        run: |
-          mkdir -p benchmarks/history
-          cp "../$BENCHMARK_JSON" benchmarks/latest.json
-          cp "../$BENCHMARK_JSON" "benchmarks/history/${GITHUB_SHA}.json"
-          git config user.name "github-actions[bot]"
-          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
-          git add benchmarks/latest.json "benchmarks/history/${GITHUB_SHA}.json"
-          git commit -m "Update BW benchmark data for ${GITHUB_SHA}" || exit 0
-          git push origin HEAD:gh-pages
+          mode: all
+          attempts: "3"
+          system-packages: libgl1 libegl1 x11-utils libxkbcommon-x11-0 libdbus-1-3 xvfb
+            libnss3 libxdamage1 libasound2t64 libatomic1 libxcursor1 hyperfine
+            redis-server
+          setup-scripts: .github/scripts/setup_benchmark_env.sh
+          benchmark-pytest-dirs: tests/unit_tests/benchmarks
+          threshold-percent: "10"