bec_widgets/.github/scripts/compare_benchmarks.py

#!/usr/bin/env python3
"""Compare benchmark JSON files and write a GitHub Actions summary.

The script supports JSON emitted by hyperfine, JSON emitted by pytest-benchmark,
and a compact mapping format generated by ``aggregate_benchmarks.py``. Timing
formats prefer median values and fall back to mean values when median values are
not present.
"""

from __future__ import annotations

import argparse
import json
import math
from dataclasses import dataclass
from pathlib import Path
from typing import Any


@dataclass(frozen=True)
class Benchmark:
    """Single benchmark measurement in the script's common shape.

    Instances are immutable because the dataclass is frozen.

    Attributes:
        name (str): Identifier that pairs a baseline entry with the matching
            current entry.
        value (float): Number that is compared between the two runs.
        unit (str): Unit shown next to the value, such as ``"s"``; may be an
            empty string.
        metric (str): Statistic the value was taken from, such as
            ``"median"`` or ``"mean"``. Defaults to ``"value"``.
    """

    name: str
    value: float
    unit: str
    metric: str = "value"


@dataclass(frozen=True)
class Comparison:
    """Outcome of matching one baseline benchmark against its current value.

    Instances are immutable because the dataclass is frozen.

    Attributes:
        name (str): Name shared by the baseline and current benchmark.
        baseline (float): Value measured in the baseline run.
        current (float): Value measured in the current run.
        delta_percent (float): Relative change from baseline to current,
            expressed in percent.
        unit (str): Unit displayed with both values.
        metric (str): Metric of the current result that was compared.
        regressed (bool): ``True`` when the change reaches the configured
            threshold in the worse direction.
        improved (bool): ``True`` when the change reaches the configured
            threshold in the better direction.
    """

    name: str
    baseline: float
    current: float
    delta_percent: float
    unit: str
    metric: str
    regressed: bool
    improved: bool


def _read_json(path: Path) -> Any:
    """Load and parse the JSON document stored at ``path``.

    The file is decoded as UTF-8 and read in full before parsing.

    Args:
        path (Path): Location of the JSON file.

    Returns:
        Any: The decoded JSON value (object, array, or scalar).
    """

    return json.loads(path.read_text(encoding="utf-8"))


def _as_float(value: Any) -> float | None:
    """Convert a value to a finite float.

    Args:
        value (Any): Value to convert.

    Returns:
        float | None: Converted finite float, or ``None`` if the value cannot
        be converted, is too large to be represented as a float, or is
        infinite or NaN.
    """

    try:
        result = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: JSON integers are parsed as arbitrary-precision ints,
        # and float() raises instead of returning inf when one is out of range.
        return None
    return result if math.isfinite(result) else None


def _extract_hyperfine(data: dict[str, Any]) -> dict[str, Benchmark]:
    """Extract normalized benchmarks from hyperfine JSON.

    Each entry of the top-level ``"results"`` list becomes one benchmark. The
    ``"median"`` value is preferred; ``"mean"`` is used when the median is
    missing or not a finite number. Entries without a name or without a usable
    value are skipped.

    Args:
        data (dict[str, Any]): Parsed JSON object.

    Returns:
        dict[str, Benchmark]: Benchmarks keyed by command name. Empty when
        ``data`` has no ``"results"`` list.
    """

    results = data.get("results")
    # Only a list can be hyperfine output. Another format may use "results" as
    # an ordinary key with a scalar or null value, which must not be iterated.
    if not isinstance(results, list):
        return {}

    benchmarks: dict[str, Benchmark] = {}
    for result in results:
        if not isinstance(result, dict):
            continue
        name = str(result.get("command") or result.get("name") or "").strip()
        if not name:
            continue
        for metric in ("median", "mean"):
            value = _as_float(result.get(metric))
            if value is not None:
                benchmarks[name] = Benchmark(name=name, value=value, unit="s", metric=metric)
                break
    return benchmarks


def _extract_pytest_benchmark(data: dict[str, Any]) -> dict[str, Benchmark]:
    """Extract normalized benchmarks from pytest-benchmark JSON.

    Each entry of the top-level ``"benchmarks"`` list becomes one benchmark,
    named by ``"fullname"`` (falling back to ``"name"``). The value is read
    from the entry's ``"stats"`` object, preferring ``"median"`` over
    ``"mean"``. Entries without a name or without a usable value are skipped.

    Args:
        data (dict[str, Any]): Parsed JSON object.

    Returns:
        dict[str, Benchmark]: Benchmarks keyed by full benchmark name. Empty
        when ``data`` has no ``"benchmarks"`` list.
    """

    entries = data.get("benchmarks")
    # Only a list can be pytest-benchmark output. Another format may use
    # "benchmarks" as an ordinary key with a scalar or null value, which must
    # not be iterated.
    if not isinstance(entries, list):
        return {}

    benchmarks: dict[str, Benchmark] = {}
    for entry in entries:
        if not isinstance(entry, dict):
            continue

        name = str(entry.get("fullname") or entry.get("name") or "").strip()
        stats = entry.get("stats")
        if not name or not isinstance(stats, dict):
            continue

        for metric in ("median", "mean"):
            value = _as_float(stats.get(metric))
            if value is not None:
                benchmarks[name] = Benchmark(name=name, value=value, unit="s", metric=metric)
                break
    return benchmarks


def _extract_simple_mapping(data: dict[str, Any]) -> dict[str, Benchmark]:
    """Build normalized benchmarks from a compact ``name -> entry`` mapping.

    The keys ``"version"``, ``"context"``, ``"commit"``, and ``"timestamp"``
    are never treated as benchmarks. Every other entry is accepted either as a
    bare number or as an object with a ``value`` and optional ``unit`` and
    ``metric``; entries that yield no finite number are ignored.

    Args:
        data (dict[str, Any]): Parsed mapping JSON object.

    Returns:
        dict[str, Benchmark]: Benchmarks keyed by the mapping key.
    """

    reserved_keys = {"version", "context", "commit", "timestamp"}
    found: dict[str, Benchmark] = {}

    for key, entry in data.items():
        if key in reserved_keys:
            continue

        # Try the bare-number form first; fall back to the object form.
        number = _as_float(entry)
        unit, metric = "", "value"
        if number is None and isinstance(entry, dict):
            number = _as_float(entry.get("value"))
            unit = str(entry.get("unit") or "")
            metric = str(entry.get("metric") or "value")

        if number is None:
            continue

        label = str(key)
        found[label] = Benchmark(name=label, value=number, unit=unit, metric=metric)

    return found


def extract_benchmarks(path: Path) -> dict[str, Benchmark]:
    """Load a benchmark JSON file and normalize its entries.

    The hyperfine, pytest-benchmark, and compact mapping extractors are tried
    in that order; the first one that yields at least one benchmark wins.

    Args:
        path (Path): JSON file in one of the supported formats.

    Returns:
        dict[str, Benchmark]: Normalized benchmarks keyed by name.

    Raises:
        ValueError: If the JSON root is not an object, or if none of the
            extractors finds a benchmark entry.
    """

    payload = _read_json(path)
    if isinstance(payload, dict):
        # The generator is lazy, so later extractors only run if needed.
        attempts = (
            parse(payload)
            for parse in (_extract_hyperfine, _extract_pytest_benchmark, _extract_simple_mapping)
        )
        extracted = next((found for found in attempts if found), None)
        if extracted is None:
            raise ValueError(f"No supported benchmark entries found in {path}")
        return extracted

    raise ValueError(f"{path} must contain a JSON object")


def compare_benchmarks(
    baseline: dict[str, Benchmark],
    current: dict[str, Benchmark],
    threshold_percent: float,
    higher_is_better: bool,
) -> tuple[list[Comparison], list[str], list[str]]:
    """Match baseline and current benchmarks by name and classify each change.

    The change is computed relative to the absolute baseline value. A baseline
    of exactly zero is reported as a 0% change. A change counts as regressed or
    improved when its magnitude reaches ``threshold_percent`` in the
    respective direction.

    Args:
        baseline (dict[str, Benchmark]): Baseline benchmarks keyed by name.
        current (dict[str, Benchmark]): Current benchmarks keyed by name.
        threshold_percent (float): Threshold, in percent, for flagging a
            change.
        higher_is_better (bool): ``True`` if a drop in value is a regression;
            ``False`` if a rise in value is a regression.

    Returns:
        tuple[list[Comparison], list[str], list[str]]: Comparisons for names
        present in both inputs, names only present in ``baseline``, and names
        only present in ``current``. All three lists are sorted by name.
    """

    missing_in_current = sorted(name for name in baseline if name not in current)
    new_in_current = sorted(name for name in current if name not in baseline)

    comparisons: list[Comparison] = []
    for name in sorted(baseline.keys() & current.keys()):
        before = baseline[name]
        after = current[name]

        # Avoid dividing by zero: a zero baseline is reported as "no change".
        change = (
            0.0
            if before.value == 0
            else (after.value - before.value) / abs(before.value) * 100
        )

        # Flip the sign when higher is better, so that a positive value always
        # means "got worse" and one pair of checks covers both directions.
        worsening = -change if higher_is_better else change

        comparisons.append(
            Comparison(
                name=name,
                baseline=before.value,
                current=after.value,
                delta_percent=change,
                unit=after.unit or before.unit,
                metric=after.metric,
                regressed=worsening >= threshold_percent,
                improved=worsening <= -threshold_percent,
            )
        )

    return comparisons, missing_in_current, new_in_current


def _format_value(value: float, unit: str) -> str:
    """Render a benchmark value for the Markdown summary.

    Args:
        value (float): Number to render.
        unit (str): Unit to append; an empty string means no unit.

    Returns:
        str: The value with up to six significant digits, followed by a space
        and the unit when one is given.
    """

    number = format(value, ".6g")
    if not unit:
        return number
    return f"{number} {unit}"


def _format_status(comparison: Comparison) -> str:
    """Return the Markdown status label for a comparison.

    A regression takes precedence over an improvement; a comparison that is
    neither is labelled ``"ok"``.
    """

    if not (comparison.regressed or comparison.improved):
        return "ok"
    return ":red_circle: regressed" if comparison.regressed else ":green_circle: improved"


def _escape_table_cell(text: str) -> str:
    """Make text safe to place inside one Markdown table cell.

    Args:
        text (str): Raw cell content.

    Returns:
        str: Content with ``|`` backslash-escaped and line breaks replaced by
        spaces, so the cell cannot split or terminate its table row.
    """

    return text.replace("|", "\\|").replace("\r", " ").replace("\n", " ")


def _comparison_row(comparison: Comparison, *, with_status: bool = False) -> str:
    """Render one comparison as a Markdown table row.

    Args:
        comparison (Comparison): Comparison to render.
        with_status (bool): Whether to append the status column.

    Returns:
        str: Row with name, baseline, current, and change cells, plus the
        status cell when requested.
    """

    cells = [
        comparison.name,
        _format_value(comparison.baseline, comparison.unit),
        _format_value(comparison.current, comparison.unit),
        f"{comparison.delta_percent:+.2f}%",
    ]
    if with_status:
        cells.append(_format_status(comparison))
    return "| " + " | ".join(_escape_table_cell(cell) for cell in cells) + " |"


def _threshold_section(matches: list[Comparison], verb: str, noun: str) -> list[str]:
    """Render the regression or improvement section of the summary.

    Args:
        matches (list[Comparison]): Comparisons that belong in the section.
        verb (str): Past-tense verb used in the heading, e.g. ``"regressed"``.
        noun (str): Noun used when the section is empty, e.g. ``"regression"``.

    Returns:
        list[str]: A count line followed by a table, or a single "none found"
        line when ``matches`` is empty.
    """

    if not matches:
        return [f"No benchmark {noun} exceeded the configured threshold."]
    return [
        f"{len(matches)} benchmark(s) {verb} beyond the configured threshold.",
        "",
        "| Benchmark | Baseline | Current | Change |",
        "| --- | ---: | ---: | ---: |",
        *(_comparison_row(comparison) for comparison in matches),
    ]


def write_summary(
    path: Path,
    comparisons: list[Comparison],
    missing_in_current: list[str],
    new_in_current: list[str],
    threshold_percent: float,
    higher_is_better: bool,
) -> None:
    """Write a Markdown benchmark comparison summary.

    The summary contains a header with the threshold and result counts, a
    regression section, an improvement section, a collapsible table of all
    comparisons sorted by name, and lists of missing and new benchmarks.
    Table cells are escaped so that benchmark names containing ``|`` or line
    breaks (for example shell pipelines used as hyperfine commands) do not
    break the table layout.

    Args:
        path (Path): Path where the summary should be written. Missing parent
            directories are created.
        comparisons (list[Comparison]): Comparison rows for matching benchmarks.
        missing_in_current (list[str]): Baseline benchmark names missing from the
            current result.
        new_in_current (list[str]): Current benchmark names not present in the
            baseline result.
        threshold_percent (float): Regression threshold in percent.
        higher_is_better (bool): Whether higher benchmark values are considered
            better.
    """

    regressions = [comparison for comparison in comparisons if comparison.regressed]
    improvements = [comparison for comparison in comparisons if comparison.improved]
    direction = "higher is better" if higher_is_better else "lower is better"
    sorted_comparisons = sorted(comparisons, key=lambda comparison: comparison.name)

    lines = [
        # Hidden HTML marker placed as the first line of the summary.
        "<!-- bw-benchmark-comment -->",
        "## Benchmark comparison",
        "",
        f"Threshold: {threshold_percent:g}% ({direction}).",
        f"Result: {len(regressions)} regression(s), {len(improvements)} improvement(s) beyond threshold.",
        "",
        *_threshold_section(regressions, "regressed", "regression"),
        "",
        *_threshold_section(improvements, "improved", "improvement"),
    ]

    if sorted_comparisons:
        lines.extend(
            [
                "",
                "<details>",
                "<summary>All benchmark results</summary>",
                "",
                "| Benchmark | Baseline | Current | Change | Status |",
                "| --- | ---: | ---: | ---: | --- |",
            ]
        )
        lines.extend(
            _comparison_row(comparison, with_status=True) for comparison in sorted_comparisons
        )
        lines.extend(["", "</details>"])

    if missing_in_current:
        lines.extend(["", "Missing benchmarks in the current run:"])
        lines.extend(f"- `{name}`" for name in missing_in_current)

    if new_in_current:
        lines.extend(["", "New benchmarks in the current run:"])
        lines.extend(f"- `{name}`" for name in new_in_current)

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")


def main(argv: list[str] | None = None) -> int:
    """Run the benchmark comparison command line interface.

    Args:
        argv (list[str] | None): Command line arguments without the program
            name. ``None`` (the default) makes argparse read ``sys.argv``.

    Returns:
        int: ``1`` when a regression exceeds the threshold, otherwise ``0``.

    Raises:
        SystemExit: Raised by argparse with status ``2`` when arguments are
            missing or invalid, including a NaN or negative threshold.
        ValueError: Propagated from ``extract_benchmarks`` when an input file
            contains no supported benchmark data.
    """

    parser = argparse.ArgumentParser(
        description="Compare benchmark JSON files and write a Markdown summary."
    )
    parser.add_argument("--baseline", required=True, type=Path)
    parser.add_argument("--current", required=True, type=Path)
    parser.add_argument("--summary", required=True, type=Path)
    parser.add_argument("--threshold-percent", required=True, type=float)
    parser.add_argument("--higher-is-better", action="store_true")
    args = parser.parse_args(argv)

    # A NaN threshold makes every comparison false, so nothing would ever be
    # flagged; a negative one swaps which direction is reported as a regression.
    if math.isnan(args.threshold_percent) or args.threshold_percent < 0:
        parser.error("--threshold-percent must be a non-negative number")

    baseline = extract_benchmarks(args.baseline)
    current = extract_benchmarks(args.current)
    comparisons, missing_in_current, new_in_current = compare_benchmarks(
        baseline=baseline,
        current=current,
        threshold_percent=args.threshold_percent,
        higher_is_better=args.higher_is_better,
    )

    write_summary(
        path=args.summary,
        comparisons=comparisons,
        missing_in_current=missing_in_current,
        new_in_current=new_in_current,
        threshold_percent=args.threshold_percent,
        higher_is_better=args.higher_is_better,
    )

    return 1 if any(comparison.regressed for comparison in comparisons) else 0


# Run the CLI only when this file is executed as a script; the integer
# returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())