diff --git a/.github/scripts/aggregate_benchmarks.py b/.github/scripts/aggregate_benchmarks.py
index 9baa581a..02fecd13 100644
--- a/.github/scripts/aggregate_benchmarks.py
+++ b/.github/scripts/aggregate_benchmarks.py
@@ -1,5 +1,11 @@
 #!/usr/bin/env python3
-"""Aggregate benchmark JSON files by taking the median across runner attempts."""
+"""Aggregate benchmark JSON files by taking the median across runner attempts.
+
+The workflow runs the same benchmark suite on multiple independent runners.
+This script reads every JSON file produced by those attempts, normalizes the
+contained benchmark values, and writes a compact mapping JSON where each value is
+the median across attempts.
+"""
 
 from __future__ import annotations
 
@@ -12,6 +18,16 @@ from compare_benchmarks import Benchmark, extract_benchmarks
 
 
 def collect_benchmarks(paths: list[Path]) -> dict[str, list[Benchmark]]:
+    """Collect benchmarks from multiple JSON files.
+
+    Args:
+        paths (list[Path]): Paths to hyperfine, pytest-benchmark, or compact
+            mapping JSON files.
+
+    Returns:
+        dict[str, list[Benchmark]]: Benchmarks grouped by benchmark name.
+    """
+
     collected: dict[str, list[Benchmark]] = {}
     for path in paths:
         for name, benchmark in extract_benchmarks(path).items():
@@ -20,6 +36,18 @@ def collect_benchmarks(paths: list[Path]) -> dict[str, list[Benchmark]]:
 
 
 def aggregate(collected: dict[str, list[Benchmark]]) -> dict[str, dict[str, object]]:
+    """Aggregate grouped benchmarks using the median value.
+
+    Args:
+        collected (dict[str, list[Benchmark]]): Benchmarks grouped by benchmark
+            name.
+
+    Returns:
+        dict[str, dict[str, object]]: Compact mapping JSON data. Each benchmark
+            contains ``value``, ``unit``, ``metric``, ``attempts``, and
+            ``attempt_values``.
+    """
+
     aggregated: dict[str, dict[str, object]] = {}
     for name, benchmarks in sorted(collected.items()):
         values = [benchmark.value for benchmark in benchmarks]
@@ -36,6 +64,19 @@ def aggregate(collected: dict[str, list[Benchmark]]) -> dict[str, dict[str, obje
 
 
 def main_from_paths(input_dir: Path, output: Path) -> int:
+    """Aggregate all JSON files in a directory and write the result.
+
+    Args:
+        input_dir (Path): Directory containing benchmark JSON files.
+        output (Path): Path where the aggregate JSON should be written.
+
+    Returns:
+        int: Always ``0`` on success.
+
+    Raises:
+        ValueError: If no JSON files are found in ``input_dir``.
+    """
+
     paths = sorted(input_dir.rglob("*.json"))
     if not paths:
         raise ValueError(f"No benchmark JSON files found in {input_dir}")
@@ -49,6 +90,12 @@ def main_from_paths(input_dir: Path, output: Path) -> int:
 
 
 def main() -> int:
+    """Run the benchmark aggregation command line interface.
+
+    Returns:
+        int: Always ``0`` on success.
+    """
+
     parser = argparse.ArgumentParser()
     parser.add_argument("--input-dir", required=True, type=Path)
     parser.add_argument("--output", required=True, type=Path)
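The ``aggregate`` docstring above pins down the shape of each compact mapping
entry. As a minimal sketch of that shape (the benchmark name, timings, and unit
are invented for illustration, not taken from the repository):

    from statistics import median

    # Three hypothetical timing attempts for one benchmark, one per runner.
    attempt_values = [1.92, 2.05, 1.98]

    entry = {
        "value": median(attempt_values),  # 1.98, the median across attempts
        "unit": "s",
        "metric": "median",
        "attempts": len(attempt_values),
        "attempt_values": attempt_values,
    }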
+""" from __future__ import annotations @@ -13,6 +19,15 @@ from typing import Any @dataclass(frozen=True) class Benchmark: + """Normalized benchmark result. + + Attributes: + name (str): Stable benchmark name used to match baseline and current results. + value (float): Numeric benchmark value used for comparison. + unit (str): Display unit for the value, for example ``"s"``. + metric (str): Source metric name, for example ``"median"`` or ``"mean"``. + """ + name: str value: float unit: str @@ -21,6 +36,18 @@ class Benchmark: @dataclass(frozen=True) class Comparison: + """Comparison between one baseline benchmark and one current benchmark. + + Attributes: + name (str): Benchmark name. + baseline (float): Baseline benchmark value. + current (float): Current benchmark value. + delta_percent (float): Percent change from baseline to current. + unit (str): Display unit for both values. + metric (str): Current result metric used for comparison. + regressed (bool): Whether the change exceeds the configured threshold. + """ + name: str baseline: float current: float @@ -31,11 +58,29 @@ class Comparison: def _read_json(path: Path) -> Any: + """Read JSON data from a file. + + Args: + path (Path): Path to the JSON file. + + Returns: + Any: Parsed JSON value. + """ + with path.open("r", encoding="utf-8") as stream: return json.load(stream) def _as_float(value: Any) -> float | None: + """Convert a value to a finite float. + + Args: + value (Any): Value to convert. + + Returns: + float | None: Converted finite float, or ``None`` if conversion fails. + """ + try: result = float(value) except (TypeError, ValueError): @@ -46,6 +91,15 @@ def _as_float(value: Any) -> float | None: def _extract_hyperfine(data: dict[str, Any]) -> dict[str, Benchmark]: + """Extract normalized benchmarks from hyperfine JSON. + + Args: + data (dict[str, Any]): Parsed hyperfine JSON object. + + Returns: + dict[str, Benchmark]: Benchmarks keyed by command name. + """ + benchmarks: dict[str, Benchmark] = {} for result in data.get("results", []): if not isinstance(result, dict): @@ -62,6 +116,15 @@ def _extract_hyperfine(data: dict[str, Any]) -> dict[str, Benchmark]: def _extract_pytest_benchmark(data: dict[str, Any]) -> dict[str, Benchmark]: + """Extract normalized benchmarks from pytest-benchmark JSON. + + Args: + data (dict[str, Any]): Parsed pytest-benchmark JSON object. + + Returns: + dict[str, Benchmark]: Benchmarks keyed by full benchmark name. + """ + benchmarks: dict[str, Benchmark] = {} for benchmark in data.get("benchmarks", []): if not isinstance(benchmark, dict): @@ -82,6 +145,16 @@ def _extract_pytest_benchmark(data: dict[str, Any]) -> dict[str, Benchmark]: def _extract_simple_mapping(data: dict[str, Any]) -> dict[str, Benchmark]: + """Extract normalized benchmarks from a compact mapping JSON object. + + Args: + data (dict[str, Any]): Parsed mapping where each benchmark is either a + raw number or an object containing ``value``, ``unit``, and ``metric``. + + Returns: + dict[str, Benchmark]: Benchmarks keyed by mapping key. + """ + benchmarks: dict[str, Benchmark] = {} for name, raw_value in data.items(): @@ -103,6 +176,20 @@ def _extract_simple_mapping(data: dict[str, Any]) -> dict[str, Benchmark]: def extract_benchmarks(path: Path) -> dict[str, Benchmark]: + """Extract normalized benchmarks from a supported JSON file. + + Args: + path (Path): Path to a hyperfine, pytest-benchmark, or compact mapping + JSON file. + + Returns: + dict[str, Benchmark]: Normalized benchmarks keyed by name. 
@@ -103,6 +176,20 @@ def _extract_simple_mapping(data: dict[str, Any]) -> dict[str, Benchmark]:
 
 
 def extract_benchmarks(path: Path) -> dict[str, Benchmark]:
+    """Extract normalized benchmarks from a supported JSON file.
+
+    Args:
+        path (Path): Path to a hyperfine, pytest-benchmark, or compact mapping
+            JSON file.
+
+    Returns:
+        dict[str, Benchmark]: Normalized benchmarks keyed by name.
+
+    Raises:
+        ValueError: If the JSON root is not an object or no supported benchmark
+            entries can be extracted.
+    """
+
     data = _read_json(path)
     if not isinstance(data, dict):
         raise ValueError(f"{path} must contain a JSON object")
@@ -122,6 +209,22 @@ def compare_benchmarks(
     threshold_percent: float,
     higher_is_better: bool,
 ) -> tuple[list[Comparison], list[str], list[str]]:
+    """Compare baseline benchmarks with current benchmarks.
+
+    Args:
+        baseline (dict[str, Benchmark]): Baseline benchmarks keyed by name.
+        current (dict[str, Benchmark]): Current benchmarks keyed by name.
+        threshold_percent (float): Regression threshold in percent.
+        higher_is_better (bool): If ``True``, lower current values are treated as
+            regressions. If ``False``, higher current values are treated as
+            regressions.
+
+    Returns:
+        tuple[list[Comparison], list[str], list[str]]: Comparisons for common
+            benchmark names, names missing from current results, and names newly
+            present in current results.
+    """
+
     comparisons: list[Comparison] = []
     missing_in_current: list[str] = []
     new_in_current: list[str] = []
@@ -165,6 +268,16 @@ def compare_benchmarks(
 
 
 def _format_value(value: float, unit: str) -> str:
+    """Format a benchmark value for Markdown output.
+
+    Args:
+        value (float): Numeric benchmark value.
+        unit (str): Display unit.
+
+    Returns:
+        str: Formatted value with optional unit suffix.
+    """
+
     suffix = f" {unit}" if unit else ""
     return f"{value:.6g}{suffix}"
 
@@ -177,6 +290,20 @@ def write_summary(
     threshold_percent: float,
     higher_is_better: bool,
 ) -> None:
+    """Write a Markdown benchmark comparison summary.
+
+    Args:
+        path (Path): Path where the summary should be written.
+        comparisons (list[Comparison]): Comparison rows for matching benchmarks.
+        missing_in_current (list[str]): Baseline benchmark names missing from the
+            current result.
+        new_in_current (list[str]): Current benchmark names not present in the
+            baseline result.
+        threshold_percent (float): Regression threshold in percent.
+        higher_is_better (bool): Whether higher benchmark values are considered
+            better.
+    """
+
     regressions = [comparison for comparison in comparisons if comparison.regressed]
     direction = "higher is better" if higher_is_better else "lower is better"
     sorted_comparisons = sorted(comparisons, key=lambda comparison: comparison.name)
@@ -245,6 +372,12 @@ def write_summary(
 
 
 def main() -> int:
+    """Run the benchmark comparison command line interface.
+
+    Returns:
+        int: ``1`` when a regression exceeds the threshold, otherwise ``0``.
+    """
+
     parser = argparse.ArgumentParser()
     parser.add_argument("--baseline", required=True, type=Path)
     parser.add_argument("--current", required=True, type=Path)
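Taken together, the documented API suggests a usage pattern like the following.
This is a sketch only, with invented names and values, and it assumes the
scripts' directory is on the import path:

    from compare_benchmarks import Benchmark, compare_benchmarks

    baseline = {"import_time": Benchmark("import_time", 2.00, "s", "median")}
    current = {"import_time": Benchmark("import_time", 2.12, "s", "median")}

    comparisons, missing, new = compare_benchmarks(
        baseline, current, threshold_percent=5.0, higher_is_better=False
    )
    # With lower-is-better timings, the +6% change here should be flagged as
    # a regression against the 5% threshold.
    for comparison in comparisons:
        print(comparison.name, f"{comparison.delta_percent:+.1f}%", comparison.regressed)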