mirror of
https://github.com/bec-project/bec_widgets.git
synced 2026-04-23 16:50:44 +02:00
452 lines
14 KiB
Python
452 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""Compare benchmark JSON files and write a GitHub Actions summary.
|
|
|
|
The script supports JSON emitted by hyperfine, JSON emitted by pytest-benchmark,
|
|
and a compact mapping format generated by ``aggregate_benchmarks.py``. Timing
|
|
formats prefer median values and fall back to mean values when median values are
|
|
not present.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import math
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
@dataclass(frozen=True)
class Benchmark:
    """A single benchmark result in a tool-agnostic form.

    Attributes:
        name (str): Stable benchmark name used to match baseline and current
            results.
        value (float): Numeric benchmark value used for comparison.
        unit (str): Display unit for the value, for example ``"s"``.
        metric (str): Source metric name, for example ``"median"`` or
            ``"mean"``; defaults to ``"value"`` for plain-number inputs.
    """

    name: str
    value: float
    unit: str
    metric: str = "value"
|
|
|
|
|
|
@dataclass(frozen=True)
class Comparison:
    """Pairing of one baseline benchmark with its current counterpart.

    Attributes:
        name (str): Benchmark name.
        baseline (float): Baseline benchmark value.
        current (float): Current benchmark value.
        delta_percent (float): Percent change from baseline to current.
        unit (str): Display unit shared by both values.
        metric (str): Current result metric used for comparison.
        regressed (bool): ``True`` when the change exceeds the configured
            threshold in the worse direction.
        improved (bool): ``True`` when the change exceeds the configured
            threshold in the better direction.
    """

    name: str
    baseline: float
    current: float
    delta_percent: float
    unit: str
    metric: str
    regressed: bool
    improved: bool
|
|
|
|
|
|
def _read_json(path: Path) -> Any:
|
|
"""Read JSON data from a file.
|
|
|
|
Args:
|
|
path (Path): Path to the JSON file.
|
|
|
|
Returns:
|
|
Any: Parsed JSON value.
|
|
"""
|
|
|
|
with path.open("r", encoding="utf-8") as stream:
|
|
return json.load(stream)
|
|
|
|
|
|
def _as_float(value: Any) -> float | None:
|
|
"""Convert a value to a finite float.
|
|
|
|
Args:
|
|
value (Any): Value to convert.
|
|
|
|
Returns:
|
|
float | None: Converted finite float, or ``None`` if conversion fails.
|
|
"""
|
|
|
|
try:
|
|
result = float(value)
|
|
except (TypeError, ValueError):
|
|
return None
|
|
if math.isfinite(result):
|
|
return result
|
|
return None
|
|
|
|
|
|
def _extract_hyperfine(data: dict[str, Any]) -> dict[str, Benchmark]:
    """Extract normalized benchmarks from hyperfine JSON.

    Args:
        data (dict[str, Any]): Parsed hyperfine JSON object.

    Returns:
        dict[str, Benchmark]: Benchmarks keyed by command name.
    """

    extracted: dict[str, Benchmark] = {}
    for entry in data.get("results", []):
        if not isinstance(entry, dict):
            continue
        label = str(entry.get("command") or entry.get("name") or "").strip()
        # Prefer the median timing; fall back to the mean when it is absent.
        for metric in ("median", "mean"):
            timing = _as_float(entry.get(metric))
            if timing is not None:
                break
        if label and timing is not None:
            extracted[label] = Benchmark(name=label, value=timing, unit="s", metric=metric)
    return extracted
|
|
|
|
|
|
def _extract_pytest_benchmark(data: dict[str, Any]) -> dict[str, Benchmark]:
    """Extract normalized benchmarks from pytest-benchmark JSON.

    Args:
        data (dict[str, Any]): Parsed pytest-benchmark JSON object.

    Returns:
        dict[str, Benchmark]: Benchmarks keyed by full benchmark name.
    """

    extracted: dict[str, Benchmark] = {}
    for entry in data.get("benchmarks", []):
        if not isinstance(entry, dict):
            continue

        label = str(entry.get("fullname") or entry.get("name") or "").strip()
        stats = entry.get("stats", {})
        timing = None
        metric = "median"
        if isinstance(stats, dict):
            # Prefer the median timing; fall back to the mean when it is absent.
            for metric in ("median", "mean"):
                timing = _as_float(stats.get(metric))
                if timing is not None:
                    break
        if label and timing is not None:
            extracted[label] = Benchmark(name=label, value=timing, unit="s", metric=metric)
    return extracted
|
|
|
|
|
|
def _extract_simple_mapping(data: dict[str, Any]) -> dict[str, Benchmark]:
    """Extract normalized benchmarks from a compact mapping JSON object.

    Args:
        data (dict[str, Any]): Parsed mapping where each benchmark is either a
            raw number or an object containing ``value``, ``unit``, and
            ``metric``.

    Returns:
        dict[str, Benchmark]: Benchmarks keyed by mapping key.
    """

    # Keys that carry run metadata rather than benchmark values.
    reserved = {"version", "context", "commit", "timestamp"}
    extracted: dict[str, Benchmark] = {}

    for key, entry in data.items():
        if key in reserved:
            continue

        numeric = _as_float(entry)
        unit = ""
        metric = "value"
        if numeric is None and isinstance(entry, dict):
            # Structured entry: read the value plus optional display metadata.
            numeric = _as_float(entry.get("value"))
            unit = str(entry.get("unit") or "")
            metric = str(entry.get("metric") or "value")

        if numeric is not None:
            label = str(key)
            extracted[label] = Benchmark(name=label, value=numeric, unit=unit, metric=metric)

    return extracted
|
|
|
|
|
|
def extract_benchmarks(path: Path) -> dict[str, Benchmark]:
    """Extract normalized benchmarks from a supported JSON file.

    Args:
        path (Path): Path to a hyperfine, pytest-benchmark, or compact mapping
            JSON file.

    Returns:
        dict[str, Benchmark]: Normalized benchmarks keyed by name.

    Raises:
        ValueError: If the JSON root is not an object or no supported benchmark
            entries can be extracted.
    """

    data = _read_json(path)
    if not isinstance(data, dict):
        raise ValueError(f"{path} must contain a JSON object")

    # Try each known format in turn; the first extractor that yields
    # entries wins.
    for extract in (_extract_hyperfine, _extract_pytest_benchmark, _extract_simple_mapping):
        found = extract(data)
        if found:
            return found

    raise ValueError(f"No supported benchmark entries found in {path}")
|
|
|
|
|
|
def compare_benchmarks(
    baseline: dict[str, Benchmark],
    current: dict[str, Benchmark],
    threshold_percent: float,
    higher_is_better: bool,
) -> tuple[list[Comparison], list[str], list[str]]:
    """Compare baseline benchmarks with current benchmarks.

    Args:
        baseline (dict[str, Benchmark]): Baseline benchmarks keyed by name.
        current (dict[str, Benchmark]): Current benchmarks keyed by name.
        threshold_percent (float): Regression threshold in percent.
        higher_is_better (bool): If ``True``, lower current values are treated
            as regressions. If ``False``, higher current values are treated as
            regressions.

    Returns:
        tuple[list[Comparison], list[str], list[str]]: Comparisons for common
            benchmark names, names missing from current results, and names
            newly present in current results.
    """

    comparisons: list[Comparison] = []
    missing_in_current: list[str] = []

    for name in sorted(baseline):
        before = baseline[name]
        after = current.get(name)
        if after is None:
            missing_in_current.append(name)
            continue

        # Percent change relative to baseline; a zero baseline makes the
        # ratio undefined, so report no change in that case.
        if before.value == 0:
            delta_percent = 0.0
        else:
            delta_percent = (after.value - before.value) / abs(before.value) * 100

        # Map the signed delta onto regression/improvement depending on the
        # configured direction of "better".
        beyond_up = delta_percent >= threshold_percent
        beyond_down = delta_percent <= -threshold_percent
        regressed = beyond_down if higher_is_better else beyond_up
        improved = beyond_up if higher_is_better else beyond_down

        comparisons.append(
            Comparison(
                name=name,
                baseline=before.value,
                current=after.value,
                delta_percent=delta_percent,
                unit=after.unit or before.unit,
                metric=after.metric,
                regressed=regressed,
                improved=improved,
            )
        )

    new_in_current = sorted(set(current) - set(baseline))
    return comparisons, missing_in_current, new_in_current
|
|
|
|
|
|
def _format_value(value: float, unit: str) -> str:
|
|
"""Format a benchmark value for Markdown output.
|
|
|
|
Args:
|
|
value (float): Numeric benchmark value.
|
|
unit (str): Display unit.
|
|
|
|
Returns:
|
|
str: Formatted value with optional unit suffix.
|
|
"""
|
|
|
|
suffix = f" {unit}" if unit else ""
|
|
return f"{value:.6g}{suffix}"
|
|
|
|
|
|
def _format_status(comparison: Comparison) -> str:
|
|
"""Format a comparison status for Markdown output."""
|
|
|
|
if comparison.regressed:
|
|
return ":red_circle: regressed"
|
|
if comparison.improved:
|
|
return ":green_circle: improved"
|
|
return "ok"
|
|
|
|
|
|
def write_summary(
    path: Path,
    comparisons: list[Comparison],
    missing_in_current: list[str],
    new_in_current: list[str],
    threshold_percent: float,
    higher_is_better: bool,
) -> None:
    """Write a Markdown benchmark comparison summary.

    Args:
        path (Path): Path where the summary should be written.
        comparisons (list[Comparison]): Comparison rows for matching benchmarks.
        missing_in_current (list[str]): Baseline benchmark names missing from the
            current result.
        new_in_current (list[str]): Current benchmark names not present in the
            baseline result.
        threshold_percent (float): Regression threshold in percent.
        higher_is_better (bool): Whether higher benchmark values are considered
            better.
    """

    def _row(comparison: Comparison, with_status: bool = False) -> str:
        """Render one Markdown table row for a comparison."""
        cells = [
            comparison.name,
            _format_value(comparison.baseline, comparison.unit),
            _format_value(comparison.current, comparison.unit),
            f"{comparison.delta_percent:+.2f}%",
        ]
        if with_status:
            cells.append(_format_status(comparison))
        return "| " + " | ".join(cells) + " |"

    # Shared header rows for the four-column change tables.
    change_table_header = [
        "",
        "| Benchmark | Baseline | Current | Change |",
        "| --- | ---: | ---: | ---: |",
    ]

    regressions = [comparison for comparison in comparisons if comparison.regressed]
    improvements = [comparison for comparison in comparisons if comparison.improved]
    direction = "higher is better" if higher_is_better else "lower is better"
    sorted_comparisons = sorted(comparisons, key=lambda comparison: comparison.name)

    # The HTML comment marks the summary so CI can find and update it later.
    lines = [
        "<!-- bw-benchmark-comment -->",
        "## Benchmark comparison",
        "",
        f"Threshold: {threshold_percent:g}% ({direction}).",
        f"Result: {len(regressions)} regression(s), {len(improvements)} improvement(s) beyond threshold.",
        "",
    ]

    if regressions:
        lines.append(f"{len(regressions)} benchmark(s) regressed beyond the configured threshold.")
        lines.extend(change_table_header)
        lines.extend(_row(comparison) for comparison in regressions)
    else:
        lines.append("No benchmark regression exceeded the configured threshold.")

    lines.append("")

    if improvements:
        lines.append(f"{len(improvements)} benchmark(s) improved beyond the configured threshold.")
        lines.extend(change_table_header)
        lines.extend(_row(comparison) for comparison in improvements)
    else:
        lines.append("No benchmark improvement exceeded the configured threshold.")

    if sorted_comparisons:
        # Full result listing is collapsed so the summary stays compact.
        lines.extend(
            [
                "",
                "<details>",
                "<summary>All benchmark results</summary>",
                "",
                "| Benchmark | Baseline | Current | Change | Status |",
                "| --- | ---: | ---: | ---: | --- |",
            ]
        )
        lines.extend(_row(comparison, with_status=True) for comparison in sorted_comparisons)
        lines.extend(["", "</details>"])

    if missing_in_current:
        lines.extend(["", "Missing benchmarks in the current run:"])
        lines.extend(f"- `{name}`" for name in missing_in_current)

    if new_in_current:
        lines.extend(["", "New benchmarks in the current run:"])
        lines.extend(f"- `{name}`" for name in new_in_current)

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
|
|
|
|
def main() -> int:
    """Run the benchmark comparison command line interface.

    Returns:
        int: ``1`` when a regression exceeds the threshold, otherwise ``0``.
    """

    parser = argparse.ArgumentParser(
        description="Compare benchmark JSON files and write a GitHub Actions summary."
    )
    parser.add_argument(
        "--baseline", required=True, type=Path, help="Path to the baseline benchmark JSON file."
    )
    parser.add_argument(
        "--current", required=True, type=Path, help="Path to the current benchmark JSON file."
    )
    parser.add_argument(
        "--summary", required=True, type=Path, help="Path where the Markdown summary is written."
    )
    parser.add_argument(
        "--threshold-percent", required=True, type=float, help="Regression threshold in percent."
    )
    parser.add_argument(
        "--higher-is-better",
        action="store_true",
        help="Treat higher benchmark values as better (default: lower is better).",
    )
    args = parser.parse_args()

    baseline = extract_benchmarks(args.baseline)
    current = extract_benchmarks(args.current)
    comparisons, missing_in_current, new_in_current = compare_benchmarks(
        baseline=baseline,
        current=current,
        threshold_percent=args.threshold_percent,
        higher_is_better=args.higher_is_better,
    )

    write_summary(
        path=args.summary,
        comparisons=comparisons,
        missing_in_current=missing_in_current,
        new_in_current=new_in_current,
        threshold_percent=args.threshold_percent,
        higher_is_better=args.higher_is_better,
    )

    # Non-zero exit fails the CI job when any benchmark regressed.
    return 1 if any(comparison.regressed for comparison in comparisons) else 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s status code (1 on regression, 0 otherwise) as the
    # process exit code.
    raise SystemExit(main())
|