#!/usr/bin/env python3 """Compare benchmark JSON files and write a GitHub Actions summary. The script supports JSON emitted by hyperfine, JSON emitted by pytest-benchmark, and a compact mapping format generated by ``aggregate_benchmarks.py``. Timing formats prefer median values and fall back to mean values when median values are not present. """ from __future__ import annotations import argparse import json import math from dataclasses import dataclass from pathlib import Path from typing import Any @dataclass(frozen=True) class Benchmark: """Normalized benchmark result. Attributes: name (str): Stable benchmark name used to match baseline and current results. value (float): Numeric benchmark value used for comparison. unit (str): Display unit for the value, for example ``"s"``. metric (str): Source metric name, for example ``"median"`` or ``"mean"``. """ name: str value: float unit: str metric: str = "value" @dataclass(frozen=True) class Comparison: """Comparison between one baseline benchmark and one current benchmark. Attributes: name (str): Benchmark name. baseline (float): Baseline benchmark value. current (float): Current benchmark value. delta_percent (float): Percent change from baseline to current. unit (str): Display unit for both values. metric (str): Current result metric used for comparison. regressed (bool): Whether the change exceeds the configured threshold in the worse direction. improved (bool): Whether the change exceeds the configured threshold in the better direction. """ name: str baseline: float current: float delta_percent: float unit: str metric: str regressed: bool improved: bool def _read_json(path: Path) -> Any: """Read JSON data from a file. Args: path (Path): Path to the JSON file. Returns: Any: Parsed JSON value. """ with path.open("r", encoding="utf-8") as stream: return json.load(stream) def _as_float(value: Any) -> float | None: """Convert a value to a finite float. Args: value (Any): Value to convert. Returns: float | None: Converted finite float, or ``None`` if conversion fails. """ try: result = float(value) except (TypeError, ValueError): return None if math.isfinite(result): return result return None def _extract_hyperfine(data: dict[str, Any]) -> dict[str, Benchmark]: """Extract normalized benchmarks from hyperfine JSON. Args: data (dict[str, Any]): Parsed hyperfine JSON object. Returns: dict[str, Benchmark]: Benchmarks keyed by command name. """ benchmarks: dict[str, Benchmark] = {} for result in data.get("results", []): if not isinstance(result, dict): continue name = str(result.get("command") or result.get("name") or "").strip() metric = "median" value = _as_float(result.get(metric)) if value is None: metric = "mean" value = _as_float(result.get(metric)) if name and value is not None: benchmarks[name] = Benchmark(name=name, value=value, unit="s", metric=metric) return benchmarks def _extract_pytest_benchmark(data: dict[str, Any]) -> dict[str, Benchmark]: """Extract normalized benchmarks from pytest-benchmark JSON. Args: data (dict[str, Any]): Parsed pytest-benchmark JSON object. Returns: dict[str, Benchmark]: Benchmarks keyed by full benchmark name. """ benchmarks: dict[str, Benchmark] = {} for benchmark in data.get("benchmarks", []): if not isinstance(benchmark, dict): continue name = str(benchmark.get("fullname") or benchmark.get("name") or "").strip() stats = benchmark.get("stats", {}) value = None metric = "median" if isinstance(stats, dict): value = _as_float(stats.get(metric)) if value is None: metric = "mean" value = _as_float(stats.get(metric)) if name and value is not None: benchmarks[name] = Benchmark(name=name, value=value, unit="s", metric=metric) return benchmarks def _extract_simple_mapping(data: dict[str, Any]) -> dict[str, Benchmark]: """Extract normalized benchmarks from a compact mapping JSON object. Args: data (dict[str, Any]): Parsed mapping where each benchmark is either a raw number or an object containing ``value``, ``unit``, and ``metric``. Returns: dict[str, Benchmark]: Benchmarks keyed by mapping key. """ benchmarks: dict[str, Benchmark] = {} for name, raw_value in data.items(): if name in {"version", "context", "commit", "timestamp"}: continue value = _as_float(raw_value) unit = "" metric = "value" if value is None and isinstance(raw_value, dict): value = _as_float(raw_value.get("value")) unit = str(raw_value.get("unit") or "") metric = str(raw_value.get("metric") or "value") if value is not None: benchmarks[str(name)] = Benchmark(name=str(name), value=value, unit=unit, metric=metric) return benchmarks def extract_benchmarks(path: Path) -> dict[str, Benchmark]: """Extract normalized benchmarks from a supported JSON file. Args: path (Path): Path to a hyperfine, pytest-benchmark, or compact mapping JSON file. Returns: dict[str, Benchmark]: Normalized benchmarks keyed by name. Raises: ValueError: If the JSON root is not an object or no supported benchmark entries can be extracted. """ data = _read_json(path) if not isinstance(data, dict): raise ValueError(f"{path} must contain a JSON object") extractors = (_extract_hyperfine, _extract_pytest_benchmark, _extract_simple_mapping) for extractor in extractors: benchmarks = extractor(data) if benchmarks: return benchmarks raise ValueError(f"No supported benchmark entries found in {path}") def compare_benchmarks( baseline: dict[str, Benchmark], current: dict[str, Benchmark], threshold_percent: float, higher_is_better: bool, ) -> tuple[list[Comparison], list[str], list[str]]: """Compare baseline benchmarks with current benchmarks. Args: baseline (dict[str, Benchmark]): Baseline benchmarks keyed by name. current (dict[str, Benchmark]): Current benchmarks keyed by name. threshold_percent (float): Regression threshold in percent. higher_is_better (bool): If ``True``, lower current values are treated as regressions. If ``False``, higher current values are treated as regressions. Returns: tuple[list[Comparison], list[str], list[str]]: Comparisons for common benchmark names, names missing from current results, and names newly present in current results. """ comparisons: list[Comparison] = [] missing_in_current: list[str] = [] new_in_current: list[str] = [] for name, baseline_benchmark in sorted(baseline.items()): current_benchmark = current.get(name) if current_benchmark is None: missing_in_current.append(name) continue if baseline_benchmark.value == 0: delta_percent = 0.0 else: delta_percent = ( (current_benchmark.value - baseline_benchmark.value) / abs(baseline_benchmark.value) * 100 ) if higher_is_better: regressed = delta_percent <= -threshold_percent improved = delta_percent >= threshold_percent else: regressed = delta_percent >= threshold_percent improved = delta_percent <= -threshold_percent comparisons.append( Comparison( name=name, baseline=baseline_benchmark.value, current=current_benchmark.value, delta_percent=delta_percent, unit=current_benchmark.unit or baseline_benchmark.unit, metric=current_benchmark.metric, regressed=regressed, improved=improved, ) ) for name in sorted(set(current) - set(baseline)): new_in_current.append(name) return comparisons, missing_in_current, new_in_current def _format_value(value: float, unit: str) -> str: """Format a benchmark value for Markdown output. Args: value (float): Numeric benchmark value. unit (str): Display unit. Returns: str: Formatted value with optional unit suffix. """ suffix = f" {unit}" if unit else "" return f"{value:.6g}{suffix}" def _format_status(comparison: Comparison) -> str: """Format a comparison status for Markdown output.""" if comparison.regressed: return ":red_circle: regressed" if comparison.improved: return ":green_circle: improved" return "ok" def write_summary( path: Path, comparisons: list[Comparison], missing_in_current: list[str], new_in_current: list[str], threshold_percent: float, higher_is_better: bool, ) -> None: """Write a Markdown benchmark comparison summary. Args: path (Path): Path where the summary should be written. comparisons (list[Comparison]): Comparison rows for matching benchmarks. missing_in_current (list[str]): Baseline benchmark names missing from the current result. new_in_current (list[str]): Current benchmark names not present in the baseline result. threshold_percent (float): Regression threshold in percent. higher_is_better (bool): Whether higher benchmark values are considered better. """ regressions = [comparison for comparison in comparisons if comparison.regressed] improvements = [comparison for comparison in comparisons if comparison.improved] direction = "higher is better" if higher_is_better else "lower is better" sorted_comparisons = sorted(comparisons, key=lambda comparison: comparison.name) lines = [ "", "## Benchmark comparison", "", f"Threshold: {threshold_percent:g}% ({direction}).", f"Result: {len(regressions)} regression(s), {len(improvements)} improvement(s) beyond threshold.", ] lines.append("") if regressions: lines.extend( [ f"{len(regressions)} benchmark(s) regressed beyond the configured threshold.", "", "| Benchmark | Baseline | Current | Change |", "| --- | ---: | ---: | ---: |", ] ) for comparison in regressions: lines.append( "| " f"{comparison.name} | " f"{_format_value(comparison.baseline, comparison.unit)} | " f"{_format_value(comparison.current, comparison.unit)} | " f"{comparison.delta_percent:+.2f}% |" ) else: lines.append("No benchmark regression exceeded the configured threshold.") lines.append("") if improvements: lines.extend( [ f"{len(improvements)} benchmark(s) improved beyond the configured threshold.", "", "| Benchmark | Baseline | Current | Change |", "| --- | ---: | ---: | ---: |", ] ) for comparison in improvements: lines.append( "| " f"{comparison.name} | " f"{_format_value(comparison.baseline, comparison.unit)} | " f"{_format_value(comparison.current, comparison.unit)} | " f"{comparison.delta_percent:+.2f}% |" ) else: lines.append("No benchmark improvement exceeded the configured threshold.") if sorted_comparisons: lines.extend( [ "", "
", "All benchmark results", "", "| Benchmark | Baseline | Current | Change | Status |", "| --- | ---: | ---: | ---: | --- |", ] ) for comparison in sorted_comparisons: lines.append( "| " f"{comparison.name} | " f"{_format_value(comparison.baseline, comparison.unit)} | " f"{_format_value(comparison.current, comparison.unit)} | " f"{comparison.delta_percent:+.2f}% | " f"{_format_status(comparison)} |" ) lines.extend(["", "
"]) if missing_in_current: lines.extend(["", "Missing benchmarks in the current run:"]) lines.extend(f"- `{name}`" for name in missing_in_current) if new_in_current: lines.extend(["", "New benchmarks in the current run:"]) lines.extend(f"- `{name}`" for name in new_in_current) path.parent.mkdir(parents=True, exist_ok=True) path.write_text("\n".join(lines) + "\n", encoding="utf-8") def main() -> int: """Run the benchmark comparison command line interface. Returns: int: ``1`` when a regression exceeds the threshold, otherwise ``0``. """ parser = argparse.ArgumentParser() parser.add_argument("--baseline", required=True, type=Path) parser.add_argument("--current", required=True, type=Path) parser.add_argument("--summary", required=True, type=Path) parser.add_argument("--threshold-percent", required=True, type=float) parser.add_argument("--higher-is-better", action="store_true") args = parser.parse_args() baseline = extract_benchmarks(args.baseline) current = extract_benchmarks(args.current) comparisons, missing_in_current, new_in_current = compare_benchmarks( baseline=baseline, current=current, threshold_percent=args.threshold_percent, higher_is_better=args.higher_is_better, ) write_summary( path=args.summary, comparisons=comparisons, missing_in_current=missing_in_current, new_in_current=new_in_current, threshold_percent=args.threshold_percent, higher_is_better=args.higher_is_better, ) return 1 if any(comparison.regressed for comparison in comparisons) else 0 if __name__ == "__main__": raise SystemExit(main())