From 9ebfebd17638fb9f44d636b94aaaedcb1ab97bf1 Mon Sep 17 00:00:00 2001 From: Michael Simacek Date: Fri, 24 Apr 2026 09:58:38 +0200 Subject: [PATCH 1/7] Add script for checking regressions on a PR --- scripts/compare_bench_regressions.py | 1149 ++++++++++++++++++++++++++ 1 file changed, 1149 insertions(+) create mode 100755 scripts/compare_bench_regressions.py diff --git a/scripts/compare_bench_regressions.py b/scripts/compare_bench_regressions.py new file mode 100755 index 0000000000..febf32b62a --- /dev/null +++ b/scripts/compare_bench_regressions.py @@ -0,0 +1,1149 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# The Universal Permissive License (UPL), Version 1.0 +# +# Subject to the condition set forth below, permission is hereby granted to any +# person obtaining a copy of this software, associated documentation and/or +# data (collectively the "Software"), free of charge and under any and all +# copyright rights in the Software, and any and all patent rights owned or +# freely licensable by each licensor hereunder covering either (i) the +# unmodified Software as contributed to or provided by such licensor, or (ii) +# the Larger Works (as defined below), to deal in both +# +# (a) the Software, and +# +# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +# one is included with the Software each a "Larger Work" to which the Software +# is contributed by such licensors), +# +# without restriction, including without limitation the rights to copy, create +# derivative works of, display, perform, and distribute the Software and make, +# use, sell, offer for sale, import, export, have made, and have sold the +# Software and the Larger Work(s), and to sublicense the foregoing rights on +# either these or other terms. +# +# This license is subject to the following condition: +# +# The above copyright notice and either this complete permission notice or at a +# minimum a reference to the UPL must be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
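+
+# Illustrative module docstring: the flag names below come from parse_args() in
+# this file, but the PR number and output path in the examples are placeholders.
+"""Classify benchmark regressions on a PR as plausible, inconclusive, or flaky.
+
+Example invocations (placeholder values):
+
+    scripts/compare_bench_regressions.py --current-pr
+    scripts/compare_bench_regressions.py --pr 1234 --show-improvements --json-out report.json
+"""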
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import os
+import re
+import shutil
+import statistics
+import subprocess
+import sys
+
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+
+
+DEFAULT_HISTORY_DAYS = 60
+DEFAULT_THRESHOLD_PERCENT = 5.0
+DEFAULT_ABSOLUTE_TIME_DELTA_SECONDS = 0.002
+DEFAULT_ANCESTOR_SEARCH_COMMITS = 30
+DEFAULT_MIN_HISTORY_POINTS = 8
+DEFAULT_RECENT_HISTORY_POINTS = 10
+PREEXISTING_SHIFT_TAIL_POINTS = 3
+MAD_SCALE_FACTOR = 1.4826
+IQR_SIGMA_FACTOR = 1.349
+REAL_MAD_Z_THRESHOLD = 4.5
+FLAKY_MAD_Z_THRESHOLD = 2.5
+DEFAULT_PROJECT = "G"
+DEFAULT_REPOSITORY = "graalpython"
+PRIMARY_TIME_METRIC = "time"
+PRIMARY_HEAP_METRICS = ("memory", "allocated-memory")
+PR_NUMBER_RE = re.compile(r"\[(?P<pr>\d+)\]")
+
+
+@dataclass(frozen=True)
+class BuildInfo:
+    key: str
+    state: str
+    url: str
+    build_number: int
+    description: str | None
+
+
+@dataclass(frozen=True)
+class JobPair:
+    job_name: str
+    current_build_number: int
+    baseline_build_number: int
+    baseline_commit: str
+
+
+@dataclass(frozen=True)
+class Measurement:
+    suite: str
+    benchmark: str
+    job_name: str
+    machine_name: str
+    host_vm: str
+    host_vm_config: str
+    guest_vm: str
+    guest_vm_config: str
+    branch: str
+    commit_rev: str
+    build_number: int
+    metric_name: str
+    metric_unit: str
+    metric_better: str
+    avg: float
+    stddev: float
+    count: int
+
+
+@dataclass(frozen=True)
+class HistoryStats:
+    n: int
+    minimum: float
+    p10: float
+    median: float
+    p90: float
+    maximum: float
+    mean: float
+    cv: float
+    points_at_or_worse_than_current: int
+    typical_adjacent_change_pct: float
+    recent_n: int
+    recent_median: float
+    recent_mad: float
+    recent_iqr: float
+    recent_sigma: float
+    recent_p90: float
+    recent_maximum: float
+    recent_points_at_or_worse_than_current: int
+    recent_tail_n: int
+    recent_tail_minimum: float
+    recent_tail_maximum: float
+    recent_tail_points_at_or_worse_than_current: int
+
+
+@dataclass(frozen=True)
+class RegressionFinding:
+    direction: str
+    benchmark: str
+    suite: str
+    machine_name: str
+    host_vm: str
+    host_vm_config: str
+    guest_vm: str
+    guest_vm_config: str
+    metric_name: str
+    metric_unit: str
+    baseline_commit: str
+    baseline_build_number: int
+    baseline_value: float
+    current_commit: str
+    current_build_number: int
+    current_value: float
+    delta_pct: float
+    abs_delta: float
+    classification: str
+    reason: str
+    history: HistoryStats | None
+
+
+class ScriptError(RuntimeError):
+    pass
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description=(
+            "Compare GraalPy benchmark results for a PR against merge-base baseline jobs and classify "
+            "regressions (and optionally, improvements) as plausible, inconclusive, flaky."
+        )
+    )
+    parser.add_argument("--pr", type=int, help="Bitbucket PR number to inspect.")
+    parser.add_argument("--current-pr", action="store_true", help="Resolve the PR from the current HEAD commit.")
+    parser.add_argument("--merge-commit", help="Override the PR merge commit used for current benchmark jobs.")
+    parser.add_argument(
+        "--head-commit",
+        help="Commit used to compute the merge-base against the baseline branch. 
Defaults to local HEAD.", + ) + parser.add_argument("--baseline-branch", default="master", help="Baseline branch to compare against.") + parser.add_argument( + "--baseline-branch-local-name", + help="Local git name for the baseline branch when it differs from the bench-server branch name.", + ) + parser.add_argument("--history-days", type=int, default=DEFAULT_HISTORY_DAYS, help="History window in days.") + parser.add_argument( + "--threshold", + type=float, + default=DEFAULT_THRESHOLD_PERCENT, + help="Only inspect regressions strictly above this percent threshold.", + ) + parser.add_argument( + "--absolute-time-delta-floor", + type=float, + default=DEFAULT_ABSOLUTE_TIME_DELTA_SECONDS, + help="Treat time regressions below this absolute delta as flaky tiny benchmarks.", + ) + parser.add_argument( + "--ancestor-search-commits", + type=int, + default=DEFAULT_ANCESTOR_SEARCH_COMMITS, + help="How many first-parent baseline ancestors to search for missing benchmark jobs.", + ) + parser.add_argument( + "--min-history-points", + type=int, + default=DEFAULT_MIN_HISTORY_POINTS, + help="Minimum history points required before a regression can be labeled plausible.", + ) + parser.add_argument( + "--recent-history-points", + type=int, + default=DEFAULT_RECENT_HISTORY_POINTS, + help="Number of most recent baseline points to treat as recent history for regression classification.", + ) + parser.add_argument("--project", default=DEFAULT_PROJECT, help="Bitbucket project key.") + parser.add_argument("--repo", default=DEFAULT_REPOSITORY, help="Bitbucket repository key.") + parser.add_argument("--repo-dir", default=".", help="Local repository directory for git operations.") + parser.add_argument("--bench-cli", help="Path to bench-cli. Defaults to bench-cli on PATH or standard local build.") + parser.add_argument("--show-improvements", action="store_true", help="Also show possible improvements.") + parser.add_argument("--json-out", help="Write a machine-readable JSON report to this file.") + return parser.parse_args() + + +def run_command(command: list[str], *, cwd: str | None = None, input_text: str | None = None) -> str: + process = subprocess.run( + command, + cwd=cwd, + input=input_text, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + ) + if process.returncode != 0: + raise ScriptError( + "Command failed with exit code {}:\n{}\n{}".format( + process.returncode, + " ".join(command), + process.stderr.strip(), + ) + ) + return process.stdout + + +def resolve_bench_cli(explicit_path: str | None) -> str: + candidates = [] + if explicit_path: + candidates.append(explicit_path) + env_path = os.environ.get("BENCH_CLI") + if env_path: + candidates.append(env_path) + which_bench_cli = shutil.which("bench-cli") + if which_bench_cli: + candidates.append(which_bench_cli) + for candidate in candidates: + if candidate and os.path.isfile(candidate) and os.access(candidate, os.X_OK): + return candidate + raise ScriptError( + "Could not find bench-cli. Use --bench-cli or put bench-cli on PATH." 
+ ) + + +def get_local_head_commit(repo_dir: str) -> str: + return run_command(["git", "rev-parse", "HEAD"], cwd=repo_dir).strip() + + +def resolve_pr_number(args: argparse.Namespace, head_commit: str, repo_dir: str) -> int: + if args.pr: + return args.pr + if not args.current_pr and args.merge_commit: + raise ScriptError("--merge-commit requires --pr or --current-pr so the script can report context.") + output = run_command(["gdev-cli", "bitbucket", "get-prs", "-c", head_commit], cwd=repo_dir) + match = PR_NUMBER_RE.search(output) + if not match: + raise ScriptError("Could not resolve a PR for HEAD commit {}.\n{}".format(head_commit, output.strip())) + return int(match.group("pr")) + + +def get_merge_base(repo_dir: str, baseline_branch_local_name: str, head_commit: str) -> str: + return run_command(["git", "merge-base", baseline_branch_local_name, head_commit], cwd=repo_dir).strip() + + +def get_first_parent_ancestors(repo_dir: str, start_commit: str, limit: int) -> list[str]: + output = run_command( + ["git", "rev-list", "--first-parent", "--max-count", str(limit), start_commit], + cwd=repo_dir, + ).strip() + return [line for line in output.splitlines() if line] + + +def get_gate_overview(project: str, repo: str, pr: int, repo_dir: str) -> dict[str, Any]: + output = run_command( + ["gdev-cli", "buildbot", "gate-overview", "-p", project, "-r", repo, "-pr", str(pr), "--json"], + cwd=repo_dir, + ) + return json.loads(output) + + +def get_builds_for_commit(commit: str, repo_dir: str) -> dict[str, BuildInfo]: + output = run_command(["gdev-cli", "bitbucket", "get-builds", "-c", commit, "--all", "--json"], cwd=repo_dir) + data = json.loads(output) + values = data.get("values") or [] + builds = {} + for value in values: + url = value["url"] + build_number_text = url.rstrip("/").split("/")[-1] + if not build_number_text.isdigit(): + continue + builds[value["key"]] = BuildInfo( + key=value["key"], + state=value["state"], + url=url, + build_number=int(build_number_text), + description=value.get("description"), + ) + return builds + + +def select_current_pybench_builds(builds: dict[str, BuildInfo]) -> dict[str, BuildInfo]: + selected = {} + for key, build in builds.items(): + if not key.startswith("pybench-"): + continue + if build.state != "SUCCESSFUL": + continue + if "/builders/ci_executor/builds/" not in build.url: + continue + selected[key] = build + if not selected: + raise ScriptError("No successful pybench builds found for the current merge commit.") + return selected + + +def pair_jobs_with_baseline( + current_builds: dict[str, BuildInfo], + baseline_start_commit: str, + repo_dir: str, + ancestor_search_commits: int, +) -> tuple[list[JobPair], list[str]]: + unresolved = set(current_builds) + paired: list[JobPair] = [] + ancestor_commits = get_first_parent_ancestors(repo_dir, baseline_start_commit, ancestor_search_commits) + seen_commits: set[str] = set() + for commit in ancestor_commits: + if commit in seen_commits: + continue + seen_commits.add(commit) + baseline_builds = get_builds_for_commit(commit, repo_dir) + for job_name in list(unresolved): + baseline_build = baseline_builds.get(job_name) + if baseline_build is None or baseline_build.state != "SUCCESSFUL": + continue + paired.append( + JobPair( + job_name=job_name, + current_build_number=current_builds[job_name].build_number, + baseline_build_number=baseline_build.build_number, + baseline_commit=commit, + ) + ) + unresolved.remove(job_name) + if not unresolved: + break + paired.sort(key=lambda pair: pair.job_name) + return paired, 
sorted(unresolved) + + +def metric_names_for_job(job_name: str) -> tuple[str, ...]: + if "heap" in job_name: + return PRIMARY_HEAP_METRICS + return (PRIMARY_TIME_METRIC,) + + +def build_measurement_query(current_commit: str, job_pairs: list[JobPair]) -> dict[str, Any]: + current_build_numbers = [str(pair.current_build_number) for pair in job_pairs] + baseline_build_numbers = [str(pair.baseline_build_number) for pair in job_pairs] + commit_revs = sorted({current_commit} | {pair.baseline_commit for pair in job_pairs}) + return { + "queries": [ + { + "format": {"type": "application/bench-results+json"}, + "selectors": [ + "bench-suite", + "benchmark", + "build.job-name", + "machine.name", + "host-vm", + "host-vm-config", + "guest-vm", + "guest-vm-config", + "branch", + "commit.rev", + "build.number", + "metric.unit", + "metric.name", + "metric.better", + {"avg": "metric.value"}, + {"stddev": "metric.value"}, + {"count": "metric.value"}, + ], + "filters": { + "commit.rev": {"enum": commit_revs}, + "build.number": {"enum": current_build_numbers + baseline_build_numbers}, + "metric.name": {"enum": [PRIMARY_TIME_METRIC, *PRIMARY_HEAP_METRICS]}, + }, + "grouping": [ + "bench-suite", + "benchmark", + "build.job-name", + "machine.name", + "host-vm", + "host-vm-config", + "guest-vm", + "guest-vm-config", + "branch", + "commit.rev", + "build.number", + "metric.unit", + "metric.name", + "metric.better", + ], + "clustering": [ + {"build.job-name": "asc"}, + {"bench-suite": "asc"}, + {"benchmark": "asc"}, + {"metric.name": "asc"}, + ], + "samplers": [], + } + ] + } + + +def run_bench_query(bench_cli: str, query: dict[str, Any], repo_dir: str) -> dict[str, Any]: + output = run_command([bench_cli, "run", "-"], cwd=repo_dir, input_text=json.dumps(query)) + return json.loads(output) + + +def parse_measurements(query_result: dict[str, Any]) -> list[Measurement]: + selectors = query_result["metadata"]["selectors"] + selector_to_index = {name: index for index, name in enumerate(selectors)} + measurements = [] + for cluster in query_result["results"]: + for row in cluster["data"]: + measurements.append( + Measurement( + suite=row[selector_to_index["bench-suite"]], + benchmark=row[selector_to_index["benchmark"]], + job_name=row[selector_to_index["build.job-name"]], + machine_name=row[selector_to_index["machine.name"]], + host_vm=row[selector_to_index["host-vm"]], + host_vm_config=row[selector_to_index["host-vm-config"]], + guest_vm=row[selector_to_index["guest-vm"]], + guest_vm_config=row[selector_to_index["guest-vm-config"]], + branch=row[selector_to_index["branch"]], + commit_rev=row[selector_to_index["commit.rev"]], + build_number=int(row[selector_to_index["build.number"]]), + metric_unit=row[selector_to_index["metric.unit"]], + metric_name=row[selector_to_index["metric.name"]], + metric_better=row[selector_to_index["metric.better"]], + avg=float(row[selector_to_index["metric.value:AVG"]]), + stddev=float(row[selector_to_index["metric.value:STDDEV"]]), + count=int(row[selector_to_index["metric.value:COUNT"]]), + ) + ) + return measurements + + +def pick_preferred_measurements( + measurements: list[Measurement], + current_commit: str, + job_pairs: list[JobPair], +) -> list[tuple[Measurement, Measurement]]: + comparison_pairs = [] + for pair in job_pairs: + current_rows = [ + measurement + for measurement in measurements + if measurement.job_name == pair.job_name + and measurement.build_number == pair.current_build_number + and measurement.commit_rev == current_commit + ] + baseline_rows = [ + measurement + for 
measurement in measurements + if measurement.job_name == pair.job_name + and measurement.build_number == pair.baseline_build_number + and measurement.commit_rev == pair.baseline_commit + ] + grouped_current = { + ( + measurement.machine_name, + measurement.host_vm, + measurement.host_vm_config, + measurement.guest_vm, + measurement.guest_vm_config, + measurement.benchmark, + measurement.metric_name, + ): measurement + for measurement in current_rows + } + grouped_baseline = { + ( + measurement.machine_name, + measurement.host_vm, + measurement.host_vm_config, + measurement.guest_vm, + measurement.guest_vm_config, + measurement.benchmark, + measurement.metric_name, + ): measurement + for measurement in baseline_rows + } + machines_and_benchmarks = { + (machine_name, host_vm, host_vm_config, guest_vm, guest_vm_config, benchmark) + for machine_name, host_vm, host_vm_config, guest_vm, guest_vm_config, benchmark, _metric_name in grouped_current + } + for machine_name, host_vm, host_vm_config, guest_vm, guest_vm_config, benchmark in sorted(machines_and_benchmarks): + chosen_current = None + chosen_baseline = None + for metric_name in metric_names_for_job(pair.job_name): + candidate_current = grouped_current.get( + (machine_name, host_vm, host_vm_config, guest_vm, guest_vm_config, benchmark, metric_name) + ) + candidate_baseline = grouped_baseline.get( + (machine_name, host_vm, host_vm_config, guest_vm, guest_vm_config, benchmark, metric_name) + ) + if candidate_current is None or candidate_baseline is None: + continue + chosen_current = candidate_current + chosen_baseline = candidate_baseline + break + if chosen_current is None or chosen_baseline is None: + continue + comparison_pairs.append((chosen_current, chosen_baseline)) + return comparison_pairs + + +def compute_regression_percent(current_value: float, baseline_value: float, lower_is_better: bool) -> float: + if baseline_value == 0: + return 0.0 + if lower_is_better: + return (current_value / baseline_value - 1.0) * 100.0 + return (1.0 - current_value / baseline_value) * 100.0 + + +def worse_direction_delta(observed_value: float, reference_value: float, lower_is_better: bool) -> float: + if lower_is_better: + return observed_value - reference_value + return reference_value - observed_value + + +def directional_delta(observed_value: float, reference_value: float, lower_is_better: bool, direction: str) -> float: + worse_delta = worse_direction_delta(observed_value, reference_value, lower_is_better) + if direction == "regression": + return worse_delta + if direction == "improvement": + return -worse_delta + raise ValueError("Unsupported direction: {}".format(direction)) + + +def directional_percent(observed_value: float, reference_value: float, lower_is_better: bool, direction: str) -> float: + regression_pct = compute_regression_percent(observed_value, reference_value, lower_is_better) + if direction == "regression": + return regression_pct + if direction == "improvement": + return -regression_pct + raise ValueError("Unsupported direction: {}".format(direction)) + + +def directional_extreme_value(values: list[float], lower_is_better: bool, direction: str) -> float: + if direction == "regression": + return max(values) if lower_is_better else min(values) + if direction == "improvement": + return min(values) if lower_is_better else max(values) + raise ValueError("Unsupported direction: {}".format(direction)) + + +def percentile(values: list[float], fraction: float) -> float: + if not values: + raise ValueError("percentile() requires at least 
one value") + if len(values) == 1: + return values[0] + index = (len(values) - 1) * fraction + lower_index = math.floor(index) + upper_index = math.ceil(index) + if lower_index == upper_index: + return values[int(index)] + return values[lower_index] * (upper_index - index) + values[upper_index] * (index - lower_index) + + +def build_history_query( + baseline_branch: str, + history_days: int, + candidates: list[tuple[Measurement, Measurement]], +) -> dict[str, Any]: + jobs = sorted({current.job_name for current, _baseline in candidates}) + benchmarks = sorted({current.benchmark for current, _baseline in candidates}) + machines = sorted({current.machine_name for current, _baseline in candidates}) + metric_names = sorted({current.metric_name for current, _baseline in candidates}) + return { + "queries": [ + { + "format": {"type": "application/bench-results+json"}, + "selectors": [ + "bench-suite", + "benchmark", + "build.job-name", + "machine.name", + "host-vm", + "host-vm-config", + "guest-vm", + "guest-vm-config", + "branch", + "commit.rev", + "commit.committer-ts", + "build.number", + "metric.unit", + "metric.name", + "metric.better", + {"avg": "metric.value"}, + ], + "filters": { + "build.job-name": {"enum": jobs}, + "benchmark": {"enum": benchmarks}, + "machine.name": {"enum": machines}, + "metric.name": {"enum": metric_names}, + "branch": {"enum": [baseline_branch]}, + "commit.committer-ts": {"last-n": history_days, "unit": "D"}, + }, + "grouping": [ + "bench-suite", + "benchmark", + "build.job-name", + "machine.name", + "host-vm", + "host-vm-config", + "guest-vm", + "guest-vm-config", + "branch", + "commit.rev", + "commit.committer-ts", + "build.number", + "metric.unit", + "metric.name", + "metric.better", + ], + "clustering": [ + {"build.job-name": "asc"}, + {"benchmark": "asc"}, + {"commit.committer-ts": "asc"}, + ], + "samplers": [], + } + ] + } + + +def history_key_for_measurement(measurement: Measurement) -> tuple[str, str, str, str, str, str, str, str]: + return ( + measurement.job_name, + measurement.machine_name, + measurement.host_vm, + measurement.host_vm_config, + measurement.guest_vm, + measurement.guest_vm_config, + measurement.benchmark, + measurement.metric_name, + ) + + +def parse_history_values(query_result: dict[str, Any]) -> dict[tuple[str, str, str, str, str, str, str, str], list[float]]: + selectors = query_result["metadata"]["selectors"] + selector_to_index = {name: index for index, name in enumerate(selectors)} + history: dict[tuple[str, str, str, str, str, str, str, str], list[float]] = {} + for cluster in query_result["results"]: + for row in cluster["data"]: + key = ( + row[selector_to_index["build.job-name"]], + row[selector_to_index["machine.name"]], + row[selector_to_index["host-vm"]], + row[selector_to_index["host-vm-config"]], + row[selector_to_index["guest-vm"]], + row[selector_to_index["guest-vm-config"]], + row[selector_to_index["benchmark"]], + row[selector_to_index["metric.name"]], + ) + history.setdefault(key, []).append(float(row[selector_to_index["metric.value:AVG"]])) + return history + + +def calculate_history_stats( + values: list[float], + current_value: float, + lower_is_better: bool, + recent_history_points: int, + direction: str, +) -> HistoryStats: + ordered_values = list(values) + sorted_values = sorted(ordered_values) + recent_values = ordered_values[-min(len(ordered_values), recent_history_points) :] + recent_sorted_values = sorted(recent_values) + recent_p25 = percentile(recent_sorted_values, 0.25) + recent_p75 = 
percentile(recent_sorted_values, 0.75) + recent_median = statistics.median(recent_sorted_values) + recent_absolute_deviations = [abs(value - recent_median) for value in recent_values] + recent_mad = statistics.median(recent_absolute_deviations) + recent_iqr = recent_p75 - recent_p25 + recent_mad_sigma = MAD_SCALE_FACTOR * recent_mad + recent_iqr_sigma = recent_iqr / IQR_SIGMA_FACTOR if recent_iqr else 0.0 + # Pure MAD is too optimistic for multi-modal recent windows. Use the broader robust scale. + recent_sigma = max(recent_mad_sigma, recent_iqr_sigma) + mean_value = statistics.mean(sorted_values) + stdev_value = statistics.pstdev(sorted_values) if len(sorted_values) > 1 else 0.0 + if mean_value == 0: + cv = 0.0 + else: + cv = stdev_value / mean_value + points_at_or_worse = 0 + for value in ordered_values: + if directional_percent(value, current_value, lower_is_better, direction) >= 0: + points_at_or_worse += 1 + adjacent_changes = [] + for previous, current in zip(ordered_values, ordered_values[1:]): + if previous != 0: + adjacent_changes.append(abs(compute_regression_percent(current, previous, lower_is_better))) + recent_points_at_or_worse = 0 + for value in recent_values: + if directional_percent(value, current_value, lower_is_better, direction) >= 0: + recent_points_at_or_worse += 1 + recent_tail_values = ordered_values[-min(len(ordered_values), PREEXISTING_SHIFT_TAIL_POINTS) :] + recent_tail_points_at_or_worse = 0 + for value in recent_tail_values: + if directional_percent(value, current_value, lower_is_better, direction) >= 0: + recent_tail_points_at_or_worse += 1 + return HistoryStats( + n=len(sorted_values), + minimum=sorted_values[0], + p10=percentile(sorted_values, 0.10), + median=statistics.median(sorted_values), + p90=percentile(sorted_values, 0.90), + maximum=sorted_values[-1], + mean=mean_value, + cv=cv, + points_at_or_worse_than_current=points_at_or_worse, + typical_adjacent_change_pct=statistics.median(adjacent_changes) if adjacent_changes else 0.0, + recent_n=len(recent_sorted_values), + recent_median=recent_median, + recent_mad=recent_mad, + recent_iqr=recent_iqr, + recent_sigma=recent_sigma, + recent_p90=percentile(recent_sorted_values, 0.90), + recent_maximum=recent_sorted_values[-1], + recent_points_at_or_worse_than_current=recent_points_at_or_worse, + recent_tail_n=len(recent_tail_values), + recent_tail_minimum=min(recent_tail_values), + recent_tail_maximum=max(recent_tail_values), + recent_tail_points_at_or_worse_than_current=recent_tail_points_at_or_worse, + ) + + +def classify_change( + current: Measurement, + baseline: Measurement, + history_values: list[float] | None, + threshold_percent: float, + absolute_time_delta_floor: float, + min_history_points: int, + recent_history_points: int, + direction: str, +) -> tuple[str, str, HistoryStats | None]: + lower_is_better = current.metric_better == "lower" + delta_pct = directional_percent(current.avg, baseline.avg, lower_is_better, direction) + abs_delta = abs(current.avg - baseline.avg) + if current.metric_name == PRIMARY_TIME_METRIC and abs_delta < absolute_time_delta_floor: + return "flaky", "tiny absolute time delta", None + if not history_values: + return "inconclusive", "no baseline history available", None + stats = calculate_history_stats(history_values, current.avg, lower_is_better, recent_history_points, direction) + current_vs_recent_median = directional_percent(current.avg, stats.recent_median, lower_is_better, direction) + if stats.n < min_history_points: + return "inconclusive", "too little baseline 
history to run the recent median/robust-scale check", stats + if stats.recent_n < min_history_points: + return "inconclusive", "too little recent baseline history to run the recent median/robust-scale check", stats + + change_delta = directional_delta(current.avg, stats.recent_median, lower_is_better, direction) + if change_delta <= 0: + return "flaky", "current value is not {} than the recent median".format("better" if direction == "improvement" else "worse"), stats + + if stats.recent_sigma == 0: + if current_vs_recent_median > max(threshold_percent / 2.0, 5.0): + if stats.recent_tail_points_at_or_worse_than_current > 0: + tail_extreme = directional_extreme_value( + [stats.recent_tail_minimum, stats.recent_tail_maximum], lower_is_better, direction + ) + return ( + "inconclusive", + "one of the last {} baseline points already reached {:.6g}".format(stats.recent_tail_n, tail_extreme), + stats, + ) + return "plausible", "current exceeds a perfectly stable recent baseline", stats + return "inconclusive", "recent baseline is perfectly stable but effect size is still small", stats + + mad_z_score = change_delta / stats.recent_sigma + if mad_z_score >= REAL_MAD_Z_THRESHOLD and current_vs_recent_median > max(threshold_percent / 2.0, 5.0): + if stats.recent_tail_points_at_or_worse_than_current > 0: + tail_extreme = directional_extreme_value( + [stats.recent_tail_minimum, stats.recent_tail_maximum], lower_is_better, direction + ) + return ( + "inconclusive", + "one of the last {} baseline points already reached {:.6g}".format(stats.recent_tail_n, tail_extreme), + stats, + ) + return "plausible", "current is {:.1f} robust-sigmas {} than the recent median".format(mad_z_score, "better" if direction == "improvement" else "worse"), stats + if mad_z_score <= FLAKY_MAD_Z_THRESHOLD: + return "flaky", "current is only {:.1f} robust-sigmas {} than the recent median".format(mad_z_score, "better" if direction == "improvement" else "worse"), stats + return "inconclusive", "current is {:.1f} robust-sigmas {} than the recent median".format(mad_z_score, "better" if direction == "improvement" else "worse"), stats + + +def make_finding( + current: Measurement, + baseline: Measurement, + classification: str, + reason: str, + history: HistoryStats | None, + direction: str, +) -> RegressionFinding: + lower_is_better = current.metric_better == "lower" + delta_pct = directional_percent(current.avg, baseline.avg, lower_is_better, direction) + return RegressionFinding( + direction=direction, + benchmark=current.benchmark, + suite=current.suite, + machine_name=current.machine_name, + host_vm=current.host_vm, + host_vm_config=current.host_vm_config, + guest_vm=current.guest_vm, + guest_vm_config=current.guest_vm_config, + metric_name=current.metric_name, + metric_unit=current.metric_unit, + baseline_commit=baseline.commit_rev, + baseline_build_number=baseline.build_number, + baseline_value=baseline.avg, + current_commit=current.commit_rev, + current_build_number=current.build_number, + current_value=current.avg, + delta_pct=delta_pct, + abs_delta=abs(current.avg - baseline.avg), + classification=classification, + reason=reason, + history=history, + ) + + +def collect_findings( + comparison_pairs: list[tuple[Measurement, Measurement]], + history_by_key: dict[tuple[str, str, str, str, str, str, str, str], list[float]], + threshold_percent: float, + absolute_time_delta_floor: float, + min_history_points: int, + recent_history_points: int, + direction: str, +) -> list[RegressionFinding]: + findings = [] + for current, baseline 
in comparison_pairs: + lower_is_better = current.metric_better == "lower" + delta_pct = directional_percent(current.avg, baseline.avg, lower_is_better, direction) + if delta_pct <= threshold_percent: + continue + history_key = history_key_for_measurement(current) + classification, reason, history = classify_change( + current, + baseline, + history_by_key.get(history_key), + threshold_percent, + absolute_time_delta_floor, + min_history_points, + recent_history_points, + direction, + ) + findings.append(make_finding(current, baseline, classification, reason, history, direction)) + findings.sort( + key=lambda finding: ( + -finding.delta_pct, + finding.host_vm, + finding.host_vm_config, + finding.guest_vm, + finding.guest_vm_config, + finding.benchmark, + ) + ) + return findings + + +def render_table(rows: list[list[str]], headers: list[str]) -> str: + widths = [len(header) for header in headers] + for row in rows: + for index, value in enumerate(row): + widths[index] = max(widths[index], len(value)) + header_line = " ".join(header.ljust(widths[index]) for index, header in enumerate(headers)) + separator_line = " ".join("-" * width for width in widths) + body_lines = [" ".join(value.ljust(widths[index]) for index, value in enumerate(row)) for row in rows] + return "\n".join([header_line, separator_line, *body_lines]) + + +def configuration_label(finding: RegressionFinding) -> str: + return ",".join( + [ + finding.host_vm, + finding.host_vm_config, + finding.guest_vm, + finding.guest_vm_config, + ] + ) + + +def append_direction_sections(lines: list[str], findings: list[RegressionFinding], direction: str) -> None: + plausible_findings = [finding for finding in findings if finding.direction == direction and finding.classification == "plausible"] + flaky_findings = [finding for finding in findings if finding.direction == direction and finding.classification == "flaky"] + inconclusive_findings = [finding for finding in findings if finding.direction == direction and finding.classification == "inconclusive"] + plural = "{}s".format(direction) + title = direction.capitalize() + if plausible_findings: + rows = [] + for finding in plausible_findings: + rows.append( + [ + finding.benchmark, + "{:+.1f}%".format(finding.delta_pct), + configuration_label(finding), + finding.reason, + ] + ) + lines.append("{}:".format(plural)) + lines.append(render_table(rows, ["benchmark", "delta", "configuration", "reason"])) + else: + lines.append("{}: none".format(plural)) + if inconclusive_findings: + lines.append("") + lines.append("Inconclusive {}:".format(plural)) + for finding in inconclusive_findings: + lines.append( + "- {} ({:+.1f}%, {}): {}".format( + finding.benchmark, + finding.delta_pct, + configuration_label(finding), + finding.reason, + ) + ) + else: + lines.append("") + lines.append("Inconclusive {}: none".format(plural)) + if flaky_findings: + lines.append("") + lines.append("Flaky {}:".format(plural)) + for finding in flaky_findings: + lines.append( + "- {} ({:+.1f}%, {}): {}".format( + finding.benchmark, + finding.delta_pct, + configuration_label(finding), + finding.reason, + ) + ) + else: + lines.append("") + lines.append("Flaky {}: none".format(plural)) + + +def build_warnings(job_pairs: list[JobPair], baseline_commit: str, unresolved_jobs: list[str]) -> list[str]: + warnings = [] + fallback_counts: dict[str, int] = {} + for pair in job_pairs: + if pair.baseline_commit == baseline_commit: + continue + fallback_counts[pair.baseline_commit] = fallback_counts.get(pair.baseline_commit, 0) + 1 + + if 
fallback_counts: + fallback_summary = ", ".join( + "{} job(s) used {}".format(count, commit[:12]) + for commit, count in sorted(fallback_counts.items(), key=lambda item: item[0]) + ) + warnings.append( + "Baseline was not uniform: {} job(s) used an earlier baseline than merge-base {}. {}".format( + sum(fallback_counts.values()), + baseline_commit[:12], + fallback_summary, + ) + ) + + if unresolved_jobs: + preview = ", ".join(unresolved_jobs[:5]) + if len(unresolved_jobs) > 5: + preview += ", ..." + warnings.append( + "WARNING: {} job(s) did not find any baseline within the ancestor search window and were skipped: {}".format( + len(unresolved_jobs), + preview, + ) + ) + + return warnings + + +def summarize_findings(findings: list[RegressionFinding], warnings: list[str]) -> str: + lines = [] + if warnings: + lines.append("Warnings:") + for warning in warnings: + lines.append("- {}".format(warning)) + lines.append("") + append_direction_sections(lines, findings, "regression") + if any(finding.direction == "improvement" for finding in findings): + lines.append("") + append_direction_sections(lines, findings, "improvement") + return "\n".join(lines) + + +def report_json_object( + pr_number: int, + head_commit: str, + merge_commit: str, + baseline_commit: str, + unresolved_jobs: list[str], + findings: list[RegressionFinding], + warnings: list[str], +) -> dict[str, Any]: + return { + "context": { + "pr": pr_number, + "head_commit": head_commit, + "merge_commit": merge_commit, + "baseline_commit": baseline_commit, + "unresolved_jobs": unresolved_jobs, + "warnings": warnings, + }, + "plausible": [finding_to_dict(finding) for finding in findings if finding.direction == "regression" and finding.classification == "plausible"], + "flaky": [finding_to_dict(finding) for finding in findings if finding.direction == "regression" and finding.classification == "flaky"], + "inconclusive": [finding_to_dict(finding) for finding in findings if finding.direction == "regression" and finding.classification == "inconclusive"], + "improvements": [finding_to_dict(finding) for finding in findings if finding.direction == "improvement"], + } + + +def finding_to_dict(finding: RegressionFinding) -> dict[str, Any]: + data = asdict(finding) + if finding.history is None: + data["history"] = None + return data + + +def main() -> int: + args = parse_args() + repo_dir = str(Path(args.repo_dir).resolve()) + bench_cli = resolve_bench_cli(args.bench_cli) + head_commit = args.head_commit or get_local_head_commit(repo_dir) + pr_number = resolve_pr_number(args, head_commit, repo_dir) + gate_overview = get_gate_overview(args.project, args.repo, pr_number, repo_dir) + merge_commit = args.merge_commit or gate_overview["summary"]["mergeCommit"] + baseline_branch_local_name = args.baseline_branch_local_name or args.baseline_branch + baseline_commit = get_merge_base(repo_dir, baseline_branch_local_name, head_commit) + + current_builds = select_current_pybench_builds(get_builds_for_commit(merge_commit, repo_dir)) + job_pairs, unresolved_jobs = pair_jobs_with_baseline( + current_builds, + baseline_commit, + repo_dir, + args.ancestor_search_commits, + ) + if not job_pairs: + raise ScriptError("Could not pair any current pybench jobs with baseline jobs.") + + measurement_result = run_bench_query(bench_cli, build_measurement_query(merge_commit, job_pairs), repo_dir) + measurements = parse_measurements(measurement_result) + comparison_pairs = pick_preferred_measurements(measurements, merge_commit, job_pairs) + if not comparison_pairs: + raise 
ScriptError("Bench Server query returned no comparable primary metrics for paired jobs.")
+
+    history_result = run_bench_query(
+        bench_cli,
+        build_history_query(args.baseline_branch, args.history_days, comparison_pairs),
+        repo_dir,
+    )
+    history_by_key = parse_history_values(history_result)
+    findings = collect_findings(
+        comparison_pairs,
+        history_by_key,
+        args.threshold,
+        args.absolute_time_delta_floor,
+        args.min_history_points,
+        args.recent_history_points,
+        "regression",
+    )
+    if args.show_improvements:
+        findings.extend(
+            collect_findings(
+                comparison_pairs,
+                history_by_key,
+                args.threshold,
+                args.absolute_time_delta_floor,
+                args.min_history_points,
+                args.recent_history_points,
+                "improvement",
+            )
+        )
+    warnings = build_warnings(job_pairs, baseline_commit, unresolved_jobs)
+
+    header = [
+        "PR {} | head {} | merge {} | baseline {}".format(
+            pr_number,
+            head_commit[:12],
+            merge_commit[:12],
+            baseline_commit[:12],
+        ),
+        "Paired pybench jobs: {} | unresolved baseline matches: {}".format(len(job_pairs), len(unresolved_jobs)),
+        "{} above {:.1f}%: {}".format(
+            "Candidate changes" if args.show_improvements else "Candidate regressions",
+            args.threshold,
+            len(findings),
+        ),
+        "",
+    ]
+    print("\n".join(header) + summarize_findings(findings, warnings))
+
+    if args.json_out:
+        json_path = Path(args.json_out)
+        json_path.write_text(
+            json.dumps(
+                report_json_object(pr_number, head_commit, merge_commit, baseline_commit, unresolved_jobs, findings, warnings),
+                indent=2,
+                sort_keys=True,
+            )
+            + "\n",
+            encoding="utf-8",
+        )
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())
+    except ScriptError as error:
+        print(str(error), file=sys.stderr)
+        raise SystemExit(1)

From 7ac2817bc20ec5d8bd978a40ad070789e0383b2d Mon Sep 17 00:00:00 2001
From: Michael Simacek
Date: Fri, 24 Apr 2026 13:44:06 +0200
Subject: [PATCH 2/7] Add a rota mode to the regression script

---
 scripts/compare_bench_regressions.py | 1348 ++++++++++++++++++++++++--
 1 file changed, 1260 insertions(+), 88 deletions(-)

diff --git a/scripts/compare_bench_regressions.py b/scripts/compare_bench_regressions.py
index febf32b62a..2ef40d373c 100755
--- a/scripts/compare_bench_regressions.py
+++ b/scripts/compare_bench_regressions.py
@@ -41,6 +41,7 @@
 from __future__ import annotations
 
 import argparse
+import ast
 import json
 import math
 import os
@@ -51,26 +52,31 @@
 import sys
 
 from dataclasses import asdict, dataclass
+from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from typing import Any
 
 
-DEFAULT_HISTORY_DAYS = 60
+DEFAULT_HISTORY_DAYS = 30
+DEFAULT_ROTA_DAYS = 14
 DEFAULT_THRESHOLD_PERCENT = 5.0
 DEFAULT_ABSOLUTE_TIME_DELTA_SECONDS = 0.002
 DEFAULT_ANCESTOR_SEARCH_COMMITS = 30
 DEFAULT_MIN_HISTORY_POINTS = 8
 DEFAULT_RECENT_HISTORY_POINTS = 10
 PREEXISTING_SHIFT_TAIL_POINTS = 3
+RECOVERY_TOLERANCE_THRESHOLD_FRACTION = 0.5
 MAD_SCALE_FACTOR = 1.4826
 IQR_SIGMA_FACTOR = 1.349
 REAL_MAD_Z_THRESHOLD = 4.5
 FLAKY_MAD_Z_THRESHOLD = 2.5
+ADJACENT_JUMP_OUTLIER_FACTOR = 2.0
 DEFAULT_PROJECT = "G"
 DEFAULT_REPOSITORY = "graalpython"
 PRIMARY_TIME_METRIC = "time"
 PRIMARY_HEAP_METRICS = ("memory", "allocated-memory")
 PR_NUMBER_RE = re.compile(r"\[(?P<pr>\d+)\]")
+SUITE_PY_PATH = "mx.graalpython/suite.py"
 
 
 @dataclass(frozen=True)
@@ -111,6 +117,26 @@ class Measurement:
     count: int
 
 
+@dataclass(frozen=True)
+class HistoryPoint:
+    suite: str
+    benchmark: str
+    job_name: str
+    machine_name: str
+    host_vm: str
+    host_vm_config: str
+    guest_vm: str
+    guest_vm_config: str
+    branch: str
+    commit_rev: str
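+    # commit_timestamp is normalized to an aware UTC datetime by
+    # parse_commit_timestamp(), which accepts epoch seconds, epoch
+    # milliseconds, and ISO-8601 strings from the bench server.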
+ commit_timestamp: datetime + build_number: int + metric_name: str + metric_unit: str + metric_better: str + avg: float + + @dataclass(frozen=True) class HistoryStats: n: int @@ -162,6 +188,34 @@ class RegressionFinding: history: HistoryStats | None +@dataclass(frozen=True) +class RotaChangePoint: + direction: str + classification: str + benchmark: str + suite: str + machine_name: str + host_vm: str + host_vm_config: str + guest_vm: str + guest_vm_config: str + metric_name: str + metric_unit: str + good_commit: str + bad_commit: str + delta_pct: float + exact: bool + + +@dataclass(frozen=True) +class RotaDirectSuspect: + good_commit: str + bad_commit: str + bad_author_email: str + bad_subject: str + change_points: tuple[RotaChangePoint, ...] + + class ScriptError(RuntimeError): pass @@ -170,12 +224,18 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=( - "Compare GraalPy benchmark results for a PR against merge-base baseline jobs and classify " - "regressions (and optionally, improvements) as plausible, inconclusive, flaky." + "Compare GraalPy benchmark results either for a PR against merge-base baseline jobs, or in --rota mode " + "scan recent baseline-branch history for unresolved regressions. Classify regressions " + "(and optionally, improvements) as plausible, inconclusive, flaky." ) ) parser.add_argument("--pr", type=int, help="Bitbucket PR number to inspect.") parser.add_argument("--current-pr", action="store_true", help="Resolve the PR from the current HEAD commit.") + parser.add_argument( + "--rota", + action="store_true", + help="Inspect baseline-branch history and find unresolved regressions introduced in the recent rota window.", + ) parser.add_argument("--merge-commit", help="Override the PR merge commit used for current benchmark jobs.") parser.add_argument( "--head-commit", @@ -187,6 +247,12 @@ def parse_args() -> argparse.Namespace: help="Local git name for the baseline branch when it differs from the bench-server branch name.", ) parser.add_argument("--history-days", type=int, default=DEFAULT_HISTORY_DAYS, help="History window in days.") + parser.add_argument( + "--rota-days", + type=int, + default=DEFAULT_ROTA_DAYS, + help="In --rota mode, inspect regressions introduced within this many recent days.", + ) parser.add_argument( "--threshold", type=float, @@ -222,6 +288,8 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--repo-dir", default=".", help="Local repository directory for git operations.") parser.add_argument("--bench-cli", help="Path to bench-cli. 
Defaults to bench-cli on PATH or standard local build.") parser.add_argument("--show-improvements", action="store_true", help="Also show possible improvements.") + parser.add_argument("--show-inconclusive", action="store_true", help="Show inconclusive findings in text output.") + parser.add_argument("--show-flaky", action="store_true", help="Show flaky findings in text output.") parser.add_argument("--json-out", help="Write a machine-readable JSON report to this file.") return parser.parse_args() @@ -269,6 +337,10 @@ def get_local_head_commit(repo_dir: str) -> str: return run_command(["git", "rev-parse", "HEAD"], cwd=repo_dir).strip() +def resolve_git_commit(repo_dir: str, revision: str) -> str: + return run_command(["git", "rev-parse", revision], cwd=repo_dir).strip() + + def resolve_pr_number(args: argparse.Namespace, head_commit: str, repo_dir: str) -> int: if args.pr: return args.pr @@ -281,6 +353,106 @@ def resolve_pr_number(args: argparse.Namespace, head_commit: str, repo_dir: str) return int(match.group("pr")) +def get_commit_parents(repo_dir: str, commit: str) -> list[str]: + output = run_command(["git", "rev-list", "--parents", "-n", "1", commit], cwd=repo_dir).strip() + fields = output.split() + if not fields: + raise ScriptError("Could not resolve parents for commit {}.".format(commit)) + if fields[0] != commit: + raise ScriptError("Unexpected parent listing for commit {}: {}".format(commit, output)) + return fields[1:] + + +def get_commit_metadata(repo_dir: str, commit: str) -> tuple[str, str]: + output = run_command(["git", "show", "--no-patch", "--format=%ae%n%s", commit], cwd=repo_dir).splitlines() + if len(output) < 2: + raise ScriptError("Could not read author email and subject for commit {}.".format(commit)) + return output[0], output[1] + + +def read_file_at_commit(repo_dir: str, commit: str, path: str) -> str: + return run_command(["git", "show", "{}:{}".format(commit, path)], cwd=repo_dir) + + +def parse_suite_definition(text: str, path: str) -> dict[str, Any]: + module = ast.parse(text, filename=path) + for node in module.body: + if isinstance(node, ast.Assign): + for target in node.targets: + if isinstance(target, ast.Name) and target.id == "suite": + value = ast.literal_eval(node.value) + if not isinstance(value, dict): + raise ScriptError("{} does not assign a dict to suite.".format(path)) + return value + raise ScriptError("Could not find a top-level suite assignment in {}.".format(path)) + + +def get_imports_suites_at_commit(repo_dir: str, commit: str) -> list[Any]: + suite_text = read_file_at_commit(repo_dir, commit, SUITE_PY_PATH) + suite_data = parse_suite_definition(suite_text, "{}:{}".format(commit, SUITE_PY_PATH)) + imports = suite_data.get("imports") + if not isinstance(imports, dict): + raise ScriptError("{} has no suite['imports'] dict.".format(SUITE_PY_PATH)) + suites = imports.get("suites") + if not isinstance(suites, list): + raise ScriptError("{} has no suite['imports']['suites'] list.".format(SUITE_PY_PATH)) + return suites + + +def resolve_import_suite_versions(imports_suites: list[Any]) -> dict[str, str]: + suites_by_name: dict[str, dict[str, Any]] = {} + for suite in imports_suites: + if not isinstance(suite, dict): + continue + name = suite.get("name") + if isinstance(name, str): + suites_by_name[name] = suite + + resolved: dict[str, str] = {} + + def resolve(name: str) -> str: + if name in resolved: + return resolved[name] + suite = suites_by_name.get(name) + if suite is None: + raise ScriptError("suite['imports']['suites'] references unknown 
suite '{}'.".format(name)) + version = suite.get("version") + if isinstance(version, str): + resolved[name] = version + return version + version_from = suite.get("versionFrom") + if isinstance(version_from, str): + version = resolve(version_from) + resolved[name] = version + return version + raise ScriptError("suite['imports']['suites'][{}] has neither version nor versionFrom.".format(name)) + + for name in suites_by_name: + resolve(name) + return resolved + + +def summarize_graal_import_version(imports_suites: list[Any]) -> str: + resolved_versions = resolve_import_suite_versions(imports_suites) + ordered_versions = [] + for suite_name in ("regex", "truffle", "sdk", "tools"): + version = resolved_versions.get(suite_name) + if version is not None and version not in ordered_versions: + ordered_versions.append(version) + for suite_name in sorted(resolved_versions): + version = resolved_versions[suite_name] + if version not in ordered_versions: + ordered_versions.append(version) + return ",".join(version[:12] for version in ordered_versions) + + +def format_graal_commit_range(good_imports_suites: list[Any], bad_imports_suites: list[Any]) -> str: + return "graal ({}:{}]".format( + summarize_graal_import_version(good_imports_suites), + summarize_graal_import_version(bad_imports_suites), + ) + + def get_merge_base(repo_dir: str, baseline_branch_local_name: str, head_commit: str) -> str: return run_command(["git", "merge-base", baseline_branch_local_name, head_commit], cwd=repo_dir).strip() @@ -321,7 +493,7 @@ def get_builds_for_commit(commit: str, repo_dir: str) -> dict[str, BuildInfo]: return builds -def select_current_pybench_builds(builds: dict[str, BuildInfo]) -> dict[str, BuildInfo]: +def filter_successful_pybench_builds(builds: dict[str, BuildInfo]) -> dict[str, BuildInfo]: selected = {} for key, build in builds.items(): if not key.startswith("pybench-"): @@ -331,11 +503,28 @@ def select_current_pybench_builds(builds: dict[str, BuildInfo]) -> dict[str, Bui if "/builders/ci_executor/builds/" not in build.url: continue selected[key] = build + return selected + + +def select_current_pybench_builds(builds: dict[str, BuildInfo]) -> dict[str, BuildInfo]: + selected = filter_successful_pybench_builds(builds) if not selected: raise ScriptError("No successful pybench builds found for the current merge commit.") return selected +def find_recent_pybench_builds(start_commit: str, repo_dir: str, ancestor_search_commits: int) -> tuple[str, dict[str, BuildInfo]]: + for commit in get_first_parent_ancestors(repo_dir, start_commit, ancestor_search_commits): + selected = filter_successful_pybench_builds(get_builds_for_commit(commit, repo_dir)) + if selected: + return commit, selected + raise ScriptError( + "Could not find any successful pybench builds on the baseline branch within {} first-parent commits.".format( + ancestor_search_commits + ) + ) + + def pair_jobs_with_baseline( current_builds: dict[str, BuildInfo], baseline_start_commit: str, @@ -661,6 +850,66 @@ def build_history_query( } +def build_rota_history_query( + baseline_branch: str, + history_days: int, +) -> dict[str, Any]: + return { + "queries": [ + { + "format": {"type": "application/bench-results+json"}, + "selectors": [ + "bench-suite", + "benchmark", + "build.job-name", + "machine.name", + "host-vm", + "host-vm-config", + "guest-vm", + "guest-vm-config", + "branch", + "commit.rev", + "commit.committer-ts", + "build.number", + "metric.unit", + "metric.name", + "metric.better", + {"avg": "metric.value"}, + ], + "filters": { + "metric.name": 
{"enum": [PRIMARY_TIME_METRIC, *PRIMARY_HEAP_METRICS]}, + "branch": {"enum": [baseline_branch]}, + "commit.committer-ts": {"last-n": history_days, "unit": "D"}, + }, + "grouping": [ + "bench-suite", + "benchmark", + "build.job-name", + "machine.name", + "host-vm", + "host-vm-config", + "guest-vm", + "guest-vm-config", + "branch", + "commit.rev", + "commit.committer-ts", + "build.number", + "metric.unit", + "metric.name", + "metric.better", + ], + "clustering": [ + {"build.job-name": "asc"}, + {"benchmark": "asc"}, + {"machine.name": "asc"}, + {"commit.committer-ts": "asc"}, + ], + "samplers": [], + } + ] + } + + def history_key_for_measurement(measurement: Measurement) -> tuple[str, str, str, str, str, str, str, str]: return ( measurement.job_name, @@ -674,26 +923,238 @@ def history_key_for_measurement(measurement: Measurement) -> tuple[str, str, str ) -def parse_history_values(query_result: dict[str, Any]) -> dict[tuple[str, str, str, str, str, str, str, str], list[float]]: +def history_base_key_for_point(point: HistoryPoint) -> tuple[str, str, str, str, str, str, str]: + return ( + point.job_name, + point.machine_name, + point.host_vm, + point.host_vm_config, + point.guest_vm, + point.guest_vm_config, + point.benchmark, + ) + + +def history_key_for_point(point: HistoryPoint) -> tuple[str, str, str, str, str, str, str, str]: + return history_base_key_for_point(point) + (point.metric_name,) + + +def parse_commit_timestamp(raw_value: Any) -> datetime: + if isinstance(raw_value, datetime): + value = raw_value + elif isinstance(raw_value, (int, float)): + timestamp = float(raw_value) + if timestamp > 1_000_000_000_000: + timestamp /= 1000.0 + value = datetime.fromtimestamp(timestamp, tz=timezone.utc) + elif isinstance(raw_value, str): + stripped = raw_value.strip() + if re.fullmatch(r"-?\d+(\.\d+)?", stripped): + timestamp = float(stripped) + if timestamp > 1_000_000_000_000: + timestamp /= 1000.0 + value = datetime.fromtimestamp(timestamp, tz=timezone.utc) + else: + normalized = stripped.replace("Z", "+00:00") + value = datetime.fromisoformat(normalized) + else: + raise ScriptError("Unsupported commit.committer-ts value: {!r}".format(raw_value)) + if value.tzinfo is None: + return value.replace(tzinfo=timezone.utc) + return value.astimezone(timezone.utc) + + +def parse_history_points(query_result: dict[str, Any]) -> list[HistoryPoint]: selectors = query_result["metadata"]["selectors"] selector_to_index = {name: index for index, name in enumerate(selectors)} - history: dict[tuple[str, str, str, str, str, str, str, str], list[float]] = {} + history_points = [] for cluster in query_result["results"]: for row in cluster["data"]: - key = ( - row[selector_to_index["build.job-name"]], - row[selector_to_index["machine.name"]], - row[selector_to_index["host-vm"]], - row[selector_to_index["host-vm-config"]], - row[selector_to_index["guest-vm"]], - row[selector_to_index["guest-vm-config"]], - row[selector_to_index["benchmark"]], - row[selector_to_index["metric.name"]], + history_points.append( + HistoryPoint( + suite=row[selector_to_index["bench-suite"]], + benchmark=row[selector_to_index["benchmark"]], + job_name=row[selector_to_index["build.job-name"]], + machine_name=row[selector_to_index["machine.name"]], + host_vm=row[selector_to_index["host-vm"]], + host_vm_config=row[selector_to_index["host-vm-config"]], + guest_vm=row[selector_to_index["guest-vm"]], + guest_vm_config=row[selector_to_index["guest-vm-config"]], + branch=row[selector_to_index["branch"]], + 
commit_rev=row[selector_to_index["commit.rev"]], + commit_timestamp=parse_commit_timestamp(row[selector_to_index["commit.committer-ts"]]), + build_number=int(row[selector_to_index["build.number"]]), + metric_unit=row[selector_to_index["metric.unit"]], + metric_name=row[selector_to_index["metric.name"]], + metric_better=row[selector_to_index["metric.better"]], + avg=float(row[selector_to_index["metric.value:AVG"]]), + ) ) - history.setdefault(key, []).append(float(row[selector_to_index["metric.value:AVG"]])) + return history_points + + +def parse_history_values(query_result: dict[str, Any]) -> dict[tuple[str, str, str, str, str, str, str, str], list[float]]: + history: dict[tuple[str, str, str, str, str, str, str, str], list[float]] = {} + for point in parse_history_points(query_result): + history.setdefault(history_key_for_point(point), []).append(point.avg) return history +def select_preferred_history_series( + history_points: list[HistoryPoint], +) -> dict[tuple[str, str, str, str, str, str, str, str], list[HistoryPoint]]: + grouped: dict[tuple[str, str, str, str, str, str, str], dict[str, list[HistoryPoint]]] = {} + for point in history_points: + metric_groups = grouped.setdefault(history_base_key_for_point(point), {}) + metric_groups.setdefault(point.metric_name, []).append(point) + + selected: dict[tuple[str, str, str, str, str, str, str, str], list[HistoryPoint]] = {} + for base_key, metric_groups in grouped.items(): + job_name = base_key[0] + chosen_metric = None + for metric_name in metric_names_for_job(job_name): + metric_points = metric_groups.get(metric_name) + if metric_points: + chosen_metric = metric_name + selected[base_key + (metric_name,)] = sorted( + metric_points, + key=lambda point: (point.commit_timestamp, point.build_number, point.commit_rev), + ) + break + if chosen_metric is None: + # Keep the existing job-specific ordering when the metric set changes unexpectedly. 
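+            # Assumption behind the fallback: sorting the metric names keeps the
+            # choice deterministic when none of the job's preferred metrics exist.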
+ metric_name = sorted(metric_groups)[0] + selected[base_key + (metric_name,)] = sorted( + metric_groups[metric_name], + key=lambda point: (point.commit_timestamp, point.build_number, point.commit_rev), + ) + return selected + + +def rota_history_base_key_for_point(point: HistoryPoint) -> tuple[str, str, str, str, str, str, str]: + return ( + point.machine_name, + point.host_vm, + point.host_vm_config, + point.guest_vm, + point.guest_vm_config, + point.benchmark, + point.suite, + ) + + +def select_preferred_rota_series( + history_points: list[HistoryPoint], +) -> dict[tuple[str, str, str, str, str, str, str, str], list[HistoryPoint]]: + grouped: dict[tuple[str, str, str, str, str, str, str], dict[str, list[HistoryPoint]]] = {} + base_key_points: dict[tuple[str, str, str, str, str, str, str], list[HistoryPoint]] = {} + for point in history_points: + base_key = rota_history_base_key_for_point(point) + metric_groups = grouped.setdefault(base_key, {}) + metric_groups.setdefault(point.metric_name, []).append(point) + base_key_points.setdefault(base_key, []).append(point) + + selected: dict[tuple[str, str, str, str, str, str, str, str], list[HistoryPoint]] = {} + for base_key, metric_groups in grouped.items(): + points = base_key_points[base_key] + if any("heap" in point.job_name for point in points): + preferred_metrics = (*PRIMARY_HEAP_METRICS, PRIMARY_TIME_METRIC) + else: + preferred_metrics = (PRIMARY_TIME_METRIC, *PRIMARY_HEAP_METRICS) + chosen_metric = None + for metric_name in preferred_metrics: + metric_points = metric_groups.get(metric_name) + if metric_points: + chosen_metric = metric_name + selected[base_key + (metric_name,)] = sorted( + metric_points, + key=lambda point: (point.commit_timestamp, point.build_number, point.commit_rev, point.job_name), + ) + break + if chosen_metric is None: + metric_name = sorted(metric_groups)[0] + selected[base_key + (metric_name,)] = sorted( + metric_groups[metric_name], + key=lambda point: (point.commit_timestamp, point.build_number, point.commit_rev, point.job_name), + ) + return selected + + +def measurement_from_history_point(point: HistoryPoint) -> Measurement: + return Measurement( + suite=point.suite, + benchmark=point.benchmark, + job_name=point.job_name, + machine_name=point.machine_name, + host_vm=point.host_vm, + host_vm_config=point.host_vm_config, + guest_vm=point.guest_vm, + guest_vm_config=point.guest_vm_config, + branch=point.branch, + commit_rev=point.commit_rev, + build_number=point.build_number, + metric_name=point.metric_name, + metric_unit=point.metric_unit, + metric_better=point.metric_better, + avg=point.avg, + stddev=0.0, + count=1, + ) + + +def select_reference_point(history_points: list[HistoryPoint]) -> HistoryPoint: + if not history_points: + raise ValueError("select_reference_point() requires at least one history point") + median_value = statistics.median(sorted(point.avg for point in history_points)) + return min( + history_points, + key=lambda point: ( + abs(point.avg - median_value), + -point.commit_timestamp.timestamp(), + -point.build_number, + ), + ) + + +def has_later_recovery( + future_points: list[HistoryPoint], + reference_value: float, + lower_is_better: bool, + direction: str, + tolerance: float, +) -> bool: + for point in future_points: + if directional_delta(point.avg, reference_value, lower_is_better, direction) <= tolerance: + return True + return False + + +def has_later_subthreshold_point( + future_points: list[HistoryPoint], + reference_value: float, + lower_is_better: bool, + direction: str, + 
threshold_percent: float, +) -> bool: + for point in future_points: + if directional_percent(point.avg, reference_value, lower_is_better, direction) <= threshold_percent: + return True + return False + + +def recovery_tolerance( + reference_value: float, + metric_name: str, + threshold_percent: float, + absolute_time_delta_floor: float, + history: HistoryStats | None, +) -> float: + relative_tolerance = abs(reference_value) * (threshold_percent / 100.0) * RECOVERY_TOLERANCE_THRESHOLD_FRACTION + sigma_tolerance = history.recent_sigma if history is not None else 0.0 + absolute_tolerance = absolute_time_delta_floor if metric_name == PRIMARY_TIME_METRIC else 0.0 + return max(relative_tolerance, sigma_tolerance, absolute_tolerance) + + def calculate_history_stats( values: list[float], current_value: float, @@ -898,6 +1359,98 @@ def collect_findings( return findings +def collect_rota_findings( + series_by_key: dict[tuple[str, str, str, str, str, str, str, str], list[HistoryPoint]], + recent_cutoff: datetime, + threshold_percent: float, + absolute_time_delta_floor: float, + min_history_points: int, + recent_history_points: int, + direction: str, +) -> list[RegressionFinding]: + classification_rank = {"plausible": 2, "inconclusive": 1, "flaky": 0} + findings = [] + for series_points in series_by_key.values(): + if not series_points: + continue + lower_is_better = series_points[0].metric_better == "lower" + series_candidates = [] + for index, point in enumerate(series_points): + if point.commit_timestamp < recent_cutoff: + continue + prior_points = series_points[:index] + if not prior_points: + continue + if directional_delta(point.avg, prior_points[-1].avg, lower_is_better, direction) <= 0: + continue + reference_window = prior_points[-min(len(prior_points), recent_history_points) :] + baseline_point = select_reference_point(reference_window) + current = measurement_from_history_point(point) + baseline = measurement_from_history_point(baseline_point) + delta_pct = directional_percent(current.avg, baseline.avg, lower_is_better, direction) + if delta_pct <= threshold_percent: + continue + classification, reason, history = classify_change( + current, + baseline, + [prior_point.avg for prior_point in prior_points], + threshold_percent, + absolute_time_delta_floor, + min_history_points, + recent_history_points, + direction, + ) + recovery_reference_value = history.recent_median if history is not None else baseline.avg + if ( + classification == "plausible" + and has_later_subthreshold_point( + series_points[index + 1 :], + recovery_reference_value, + lower_is_better, + direction, + threshold_percent, + ) + ): + classification = "flaky" + reason = "later point retreated below the reporting threshold" + if has_later_recovery( + series_points[index + 1 :], + recovery_reference_value, + lower_is_better, + direction, + recovery_tolerance( + recovery_reference_value, + current.metric_name, + threshold_percent, + absolute_time_delta_floor, + history, + ), + ): + continue + series_candidates.append(make_finding(current, baseline, classification, reason, history, direction)) + if not series_candidates: + continue + series_candidates.sort( + key=lambda finding: ( + -classification_rank[finding.classification], + -finding.delta_pct, + -finding.current_build_number, + ) + ) + findings.append(series_candidates[0]) + findings.sort( + key=lambda finding: ( + -finding.delta_pct, + finding.host_vm, + finding.host_vm_config, + finding.guest_vm, + finding.guest_vm_config, + finding.benchmark, + ) + ) + return findings + 
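+
+# Each series contributes at most one finding: candidates within a series are
+# ranked by classification (plausible over inconclusive over flaky), then by
+# larger delta, then by newer build, and only the top candidate is kept. The
+# findings from all series are then ordered by descending delta and
+# configuration so the report output is stable.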
+ def render_table(rows: list[list[str]], headers: list[str]) -> str: widths = [len(header) for header in headers] for row in rows: @@ -910,67 +1463,470 @@ def render_table(rows: list[list[str]], headers: list[str]) -> str: def configuration_label(finding: RegressionFinding) -> str: + return configuration_label_from_parts( + finding.host_vm, + finding.host_vm_config, + finding.guest_vm, + finding.guest_vm_config, + ) + + +def configuration_label_from_parts( + host_vm: str, + host_vm_config: str, + guest_vm: str, + guest_vm_config: str, +) -> str: return ",".join( [ - finding.host_vm, - finding.host_vm_config, - finding.guest_vm, - finding.guest_vm_config, + host_vm, + host_vm_config, + guest_vm, + guest_vm_config, ] ) -def append_direction_sections(lines: list[str], findings: list[RegressionFinding], direction: str) -> None: +def append_finding_section(lines: list[str], title: str, findings: list[RegressionFinding]) -> None: + if findings: + rows = [ + [ + finding.benchmark, + "{:+.1f}%".format(finding.delta_pct), + configuration_label(finding), + finding.reason, + ] + for finding in findings + ] + lines.append("{}:".format(title)) + lines.append(render_table(rows, ["benchmark", "delta", "configuration", "reason"])) + else: + lines.append("{}: none".format(title)) + + +def append_direction_sections( + lines: list[str], + findings: list[RegressionFinding], + direction: str, + *, + show_inconclusive: bool, + show_flaky: bool, +) -> None: plausible_findings = [finding for finding in findings if finding.direction == direction and finding.classification == "plausible"] flaky_findings = [finding for finding in findings if finding.direction == direction and finding.classification == "flaky"] inconclusive_findings = [finding for finding in findings if finding.direction == direction and finding.classification == "inconclusive"] plural = "{}s".format(direction) - title = direction.capitalize() - if plausible_findings: - rows = [] - for finding in plausible_findings: - rows.append( - [ - finding.benchmark, - "{:+.1f}%".format(finding.delta_pct), - configuration_label(finding), - finding.reason, - ] - ) - lines.append("{}:".format(plural)) - lines.append(render_table(rows, ["benchmark", "delta", "configuration", "reason"])) - else: - lines.append("{}: none".format(plural)) - if inconclusive_findings: - lines.append("") - lines.append("Inconclusive {}:".format(plural)) - for finding in inconclusive_findings: - lines.append( - "- {} ({:+.1f}%, {}): {}".format( - finding.benchmark, - finding.delta_pct, - configuration_label(finding), - finding.reason, - ) - ) - else: + append_finding_section(lines, plural.capitalize(), plausible_findings) + if show_inconclusive: lines.append("") - lines.append("Inconclusive {}: none".format(plural)) - if flaky_findings: + append_finding_section(lines, "Inconclusive {}".format(plural), inconclusive_findings) + if show_flaky: lines.append("") - lines.append("Flaky {}:".format(plural)) - for finding in flaky_findings: - lines.append( - "- {} ({:+.1f}%, {}): {}".format( - finding.benchmark, - finding.delta_pct, - configuration_label(finding), - finding.reason, - ) + append_finding_section(lines, "Flaky {}".format(plural), flaky_findings) + + +def finding_rota_series_key(finding: RegressionFinding) -> tuple[str, str, str, str, str, str, str, str]: + return ( + finding.machine_name, + finding.host_vm, + finding.host_vm_config, + finding.guest_vm, + finding.guest_vm_config, + finding.benchmark, + finding.suite, + finding.metric_name, + ) + + +def status_visible(classification: 
str, *, show_inconclusive: bool, show_flaky: bool) -> bool: + if classification == "plausible": + return True + if classification == "inconclusive": + return show_inconclusive + if classification == "flaky": + return show_flaky + raise ValueError("Unsupported classification: {}".format(classification)) + + +def find_series_point_index(series_points: list[HistoryPoint], finding: RegressionFinding) -> int | None: + for index, point in enumerate(series_points): + if ( + point.commit_rev == finding.current_commit + and point.build_number == finding.current_build_number + and point.avg == finding.current_value + ): + return index + for index, point in enumerate(series_points): + if point.commit_rev == finding.current_commit and point.build_number == finding.current_build_number: + return index + return None + + +def localize_rota_change_point( + series_points: list[HistoryPoint], + finding: RegressionFinding, + threshold_percent: float, + absolute_time_delta_floor: float, +) -> RotaChangePoint: + fallback = RotaChangePoint( + direction=finding.direction, + classification=finding.classification, + benchmark=finding.benchmark, + suite=finding.suite, + machine_name=finding.machine_name, + host_vm=finding.host_vm, + host_vm_config=finding.host_vm_config, + guest_vm=finding.guest_vm, + guest_vm_config=finding.guest_vm_config, + metric_name=finding.metric_name, + metric_unit=finding.metric_unit, + good_commit=finding.baseline_commit, + bad_commit=finding.current_commit, + delta_pct=finding.delta_pct, + exact=False, + ) + current_index = find_series_point_index(series_points, finding) + if current_index is None or current_index <= 0: + return fallback + + lower_is_better = series_points[0].metric_better == "lower" + reference_value = finding.history.recent_median if finding.history is not None else finding.baseline_value + tolerance = recovery_tolerance( + reference_value, + finding.metric_name, + threshold_percent, + absolute_time_delta_floor, + finding.history, + ) + statuses = [] + for point in series_points[: current_index + 1]: + delta = directional_delta(point.avg, reference_value, lower_is_better, finding.direction) + delta_pct = directional_percent(point.avg, reference_value, lower_is_better, finding.direction) + if delta <= tolerance: + statuses.append("good") + elif delta_pct > threshold_percent: + statuses.append("bad") + else: + statuses.append("unknown") + + suffix_start = current_index + while suffix_start > 0 and statuses[suffix_start - 1] != "good": + suffix_start -= 1 + first_bad_index = next((index for index in range(suffix_start, current_index + 1) if statuses[index] == "bad"), None) + if first_bad_index is None: + return fallback + last_good_index = suffix_start - 1 if suffix_start > 0 else None + if last_good_index is None: + return fallback + + good_point = series_points[last_good_index] + bad_point = series_points[first_bad_index] + delta_pct = directional_percent(bad_point.avg, good_point.avg, lower_is_better, finding.direction) + adjacent_threshold = threshold_percent + if finding.history is not None: + adjacent_threshold = max(adjacent_threshold, finding.history.typical_adjacent_change_pct * 2.0) + exact = ( + first_bad_index == last_good_index + 1 + and delta_pct > adjacent_threshold + and directional_delta(bad_point.avg, good_point.avg, lower_is_better, finding.direction) > tolerance + ) + if not exact and first_bad_index > 0: + adjacent_good_point = series_points[first_bad_index - 1] + adjacent_delta_pct = directional_percent( + bad_point.avg, + adjacent_good_point.avg, + 
lower_is_better, + finding.direction, + ) + prior_positive_adjacent_changes = [ + directional_percent(current.avg, previous.avg, lower_is_better, finding.direction) + for previous, current in zip(series_points[: first_bad_index - 1], series_points[1:first_bad_index]) + if directional_percent(current.avg, previous.avg, lower_is_better, finding.direction) > 0 + ] + prior_max_adjacent_change = max(prior_positive_adjacent_changes) if prior_positive_adjacent_changes else 0.0 + adjacent_outlier_threshold = max( + adjacent_threshold, + prior_max_adjacent_change * ADJACENT_JUMP_OUTLIER_FACTOR, + ) + if ( + adjacent_delta_pct > adjacent_outlier_threshold + and directional_delta(bad_point.avg, adjacent_good_point.avg, lower_is_better, finding.direction) > tolerance + ): + good_point = adjacent_good_point + delta_pct = adjacent_delta_pct + exact = True + return RotaChangePoint( + direction=finding.direction, + classification=finding.classification, + benchmark=finding.benchmark, + suite=finding.suite, + machine_name=finding.machine_name, + host_vm=finding.host_vm, + host_vm_config=finding.host_vm_config, + guest_vm=finding.guest_vm, + guest_vm_config=finding.guest_vm_config, + metric_name=finding.metric_name, + metric_unit=finding.metric_unit, + good_commit=good_point.commit_rev, + bad_commit=bad_point.commit_rev, + delta_pct=delta_pct, + exact=exact, + ) + + +def collect_rota_change_points( + series_by_key: dict[tuple[str, str, str, str, str, str, str, str], list[HistoryPoint]], + findings: list[RegressionFinding], + threshold_percent: float, + absolute_time_delta_floor: float, +) -> list[RotaChangePoint]: + change_points = [] + for finding in findings: + if finding.direction != "regression": + continue + series_points = series_by_key.get(finding_rota_series_key(finding)) + if not series_points: + continue + change_points.append( + localize_rota_change_point(series_points, finding, threshold_percent, absolute_time_delta_floor) + ) + change_points.sort( + key=lambda change_point: ( + change_point.good_commit, + change_point.bad_commit, + not change_point.exact, + -change_point.delta_pct, + change_point.host_vm, + change_point.host_vm_config, + change_point.guest_vm, + change_point.guest_vm_config, + change_point.benchmark, + ) + ) + return change_points + + +def append_rota_change_points_section( + lines: list[str], + change_points: list[RotaChangePoint], + *, + group_annotations: dict[tuple[str, str], str] | None = None, + title: str = "Regression change points", + show_inconclusive: bool, + show_flaky: bool, +) -> None: + visible_change_points = [ + change_point + for change_point in change_points + if change_point.direction == "regression" + and status_visible( + change_point.classification, + show_inconclusive=show_inconclusive, + show_flaky=show_flaky, + ) + ] + if not visible_change_points: + lines.append("{}: none".format(title)) + return + + grouped_change_points: dict[tuple[str, str], list[RotaChangePoint]] = {} + group_order: list[tuple[str, str]] = [] + for change_point in visible_change_points: + group_key = (change_point.good_commit, change_point.bad_commit) + if group_key not in grouped_change_points: + grouped_change_points[group_key] = [] + group_order.append(group_key) + grouped_change_points[group_key].append(change_point) + + lines.append("{}:".format(title)) + for index, group_key in enumerate(group_order): + if index > 0: + lines.append("") + group_change_points = grouped_change_points[group_key] + header_kind = "point" if any(change_point.exact for change_point in 
group_change_points) else "range" + header = "{} {}".format( + header_kind, + format_change_point_group_label(group_key[0], group_key[1], exact=(header_kind == "point")), + ) + if group_annotations is not None and group_key in group_annotations: + header += " | {}".format(group_annotations[group_key]) + lines.append(header) + rows = [ + [ + "point" if change_point.exact else "range", + "{:+.1f}%".format(change_point.delta_pct), + change_point.benchmark, + configuration_label_from_parts( + change_point.host_vm, + change_point.host_vm_config, + change_point.guest_vm, + change_point.guest_vm_config, + ), + change_point.classification, + ] + for change_point in group_change_points + ] + lines.append(render_table(rows, ["kind", "delta", "benchmark", "configuration", "class"])) + + +def collect_rota_direct_suspects( + change_points: list[RotaChangePoint], + repo_dir: str, +) -> list[RotaDirectSuspect]: + grouped_change_points: dict[tuple[str, str], list[RotaChangePoint]] = {} + group_order: list[tuple[str, str]] = [] + for change_point in change_points: + if change_point.direction != "regression": + continue + group_key = (change_point.good_commit, change_point.bad_commit) + if group_key not in grouped_change_points: + grouped_change_points[group_key] = [] + group_order.append(group_key) + grouped_change_points[group_key].append(change_point) + + imports_suites_cache: dict[str, list[Any]] = {} + metadata_cache: dict[str, tuple[str, str]] = {} + direct_suspects = [] + for good_commit, bad_commit in group_order: + try: + parents = get_commit_parents(repo_dir, bad_commit) + except ScriptError: + continue + if good_commit not in parents: + continue + try: + if good_commit not in imports_suites_cache: + imports_suites_cache[good_commit] = get_imports_suites_at_commit(repo_dir, good_commit) + if bad_commit not in imports_suites_cache: + imports_suites_cache[bad_commit] = get_imports_suites_at_commit(repo_dir, bad_commit) + except ScriptError: + continue + if imports_suites_cache[good_commit] != imports_suites_cache[bad_commit]: + continue + try: + if bad_commit not in metadata_cache: + metadata_cache[bad_commit] = get_commit_metadata(repo_dir, bad_commit) + except ScriptError: + continue + bad_author_email, bad_subject = metadata_cache[bad_commit] + direct_suspects.append( + RotaDirectSuspect( + good_commit=good_commit, + bad_commit=bad_commit, + bad_author_email=bad_author_email, + bad_subject=bad_subject, + change_points=tuple(grouped_change_points[(good_commit, bad_commit)]), ) - else: - lines.append("") - lines.append("Flaky {}: none".format(plural)) + ) + return direct_suspects + + +def append_rota_direct_suspects_section( + lines: list[str], + direct_suspects: list[RotaDirectSuspect], + *, + show_inconclusive: bool, + show_flaky: bool, +) -> None: + visible_suspects = [] + for suspect in direct_suspects: + visible_change_points = [ + change_point + for change_point in suspect.change_points + if status_visible( + change_point.classification, + show_inconclusive=show_inconclusive, + show_flaky=show_flaky, + ) + ] + if not visible_change_points: + continue + visible_suspects.append((suspect, visible_change_points)) + + if not visible_suspects: + lines.append("Direct bad-commit suspects: none") + return + + lines.append("Direct bad-commit suspects:") + for index, (suspect, visible_change_points) in enumerate(visible_suspects): + if index > 0: + lines.append("") + header_kind = "point" if any(change_point.exact for change_point in visible_change_points) else "range" + lines.append( + "{} {} | {} | 
{}".format( + header_kind, + format_change_point_group_label(suspect.good_commit, suspect.bad_commit, exact=(header_kind == "point")), + suspect.bad_author_email, + suspect.bad_subject, + ) + ) + rows = [ + [ + "point" if change_point.exact else "range", + "{:+.1f}%".format(change_point.delta_pct), + change_point.benchmark, + configuration_label_from_parts( + change_point.host_vm, + change_point.host_vm_config, + change_point.guest_vm, + change_point.guest_vm_config, + ), + change_point.classification, + ] + for change_point in visible_change_points + ] + lines.append(render_table(rows, ["kind", "delta", "benchmark", "configuration", "class"])) + + +def direct_suspect_group_keys(direct_suspects: list[RotaDirectSuspect]) -> set[tuple[str, str]]: + return {(suspect.good_commit, suspect.bad_commit) for suspect in direct_suspects} + + +def collect_rota_graal_update_annotations( + change_points: list[RotaChangePoint], + excluded_group_keys: set[tuple[str, str]], + repo_dir: str, +) -> dict[tuple[str, str], str]: + grouped_change_points: dict[tuple[str, str], list[RotaChangePoint]] = {} + for change_point in change_points: + if change_point.direction != "regression": + continue + group_key = (change_point.good_commit, change_point.bad_commit) + if group_key in excluded_group_keys: + continue + grouped_change_points.setdefault(group_key, []).append(change_point) + + imports_suites_cache: dict[str, list[Any]] = {} + annotations = {} + for (good_commit, bad_commit), group_change_points in grouped_change_points.items(): + if not any(change_point.exact for change_point in group_change_points): + continue + try: + parents = get_commit_parents(repo_dir, bad_commit) + except ScriptError: + continue + if good_commit not in parents: + continue + try: + if good_commit not in imports_suites_cache: + imports_suites_cache[good_commit] = get_imports_suites_at_commit(repo_dir, good_commit) + if bad_commit not in imports_suites_cache: + imports_suites_cache[bad_commit] = get_imports_suites_at_commit(repo_dir, bad_commit) + except ScriptError: + continue + if imports_suites_cache[good_commit] == imports_suites_cache[bad_commit]: + continue + annotations[(good_commit, bad_commit)] = format_graal_commit_range( + imports_suites_cache[good_commit], + imports_suites_cache[bad_commit], + ) + return annotations + + +def format_change_point_group_label(good_commit: str, bad_commit: str, *, exact: bool) -> str: + if exact: + return bad_commit[:12] + return "({}:{}]".format(good_commit[:12], bad_commit[:12]) def build_warnings(job_pairs: list[JobPair], baseline_commit: str, unresolved_jobs: list[str]) -> list[str]: @@ -1008,31 +1964,113 @@ def build_warnings(job_pairs: list[JobPair], baseline_commit: str, unresolved_jo return warnings -def summarize_findings(findings: list[RegressionFinding], warnings: list[str]) -> str: +def summarize_findings( + findings: list[RegressionFinding], + warnings: list[str], + *, + show_inconclusive: bool, + show_flaky: bool, + rota_direct_suspects: list[RotaDirectSuspect] | None = None, + rota_change_points: list[RotaChangePoint] | None = None, + rota_change_point_annotations: dict[tuple[str, str], str] | None = None, +) -> str: lines = [] if warnings: lines.append("Warnings:") for warning in warnings: lines.append("- {}".format(warning)) lines.append("") - append_direction_sections(lines, findings, "regression") + append_direction_sections( + lines, + findings, + "regression", + show_inconclusive=show_inconclusive, + show_flaky=show_flaky, + ) + remaining_rota_change_points = 
rota_change_points + remaining_rota_change_point_annotations = rota_change_point_annotations + if rota_direct_suspects is not None: + lines.append("") + append_rota_direct_suspects_section( + lines, + rota_direct_suspects, + show_inconclusive=show_inconclusive, + show_flaky=show_flaky, + ) + if rota_change_points is not None: + attributed_group_keys = direct_suspect_group_keys(rota_direct_suspects) + remaining_rota_change_points = [ + change_point + for change_point in rota_change_points + if (change_point.good_commit, change_point.bad_commit) not in attributed_group_keys + ] + if remaining_rota_change_point_annotations is not None: + remaining_rota_change_point_annotations = { + group_key: annotation + for group_key, annotation in remaining_rota_change_point_annotations.items() + if group_key not in attributed_group_keys + } + if rota_change_points is not None: + lines.append("") + append_rota_change_points_section( + lines, + remaining_rota_change_points or [], + group_annotations=remaining_rota_change_point_annotations, + title="Unattributed regression change points", + show_inconclusive=show_inconclusive, + show_flaky=show_flaky, + ) if any(finding.direction == "improvement" for finding in findings): lines.append("") - append_direction_sections(lines, findings, "improvement") + append_direction_sections( + lines, + findings, + "improvement", + show_inconclusive=show_inconclusive, + show_flaky=show_flaky, + ) return "\n".join(lines) +def findings_json_payload(findings: list[RegressionFinding]) -> dict[str, Any]: + return { + "plausible": [finding_to_dict(finding) for finding in findings if finding.direction == "regression" and finding.classification == "plausible"], + "flaky": [finding_to_dict(finding) for finding in findings if finding.direction == "regression" and finding.classification == "flaky"], + "inconclusive": [finding_to_dict(finding) for finding in findings if finding.direction == "regression" and finding.classification == "inconclusive"], + "improvements": [finding_to_dict(finding) for finding in findings if finding.direction == "improvement"], + } + + +def change_point_to_dict(change_point: RotaChangePoint) -> dict[str, Any]: + return asdict(change_point) + + +def direct_suspect_to_dict(suspect: RotaDirectSuspect) -> dict[str, Any]: + return { + "good_commit": suspect.good_commit, + "bad_commit": suspect.bad_commit, + "bad_author_email": suspect.bad_author_email, + "bad_subject": suspect.bad_subject, + "change_points": [change_point_to_dict(change_point) for change_point in suspect.change_points], + } + + def report_json_object( - pr_number: int, + pr_number: int | None, head_commit: str, merge_commit: str, baseline_commit: str, unresolved_jobs: list[str], findings: list[RegressionFinding], warnings: list[str], + *, + mode: str, + direct_suspects: list[RotaDirectSuspect] | None = None, + change_points: list[RotaChangePoint] | None = None, ) -> dict[str, Any]: - return { + payload = { "context": { + "mode": mode, "pr": pr_number, "head_commit": head_commit, "merge_commit": merge_commit, @@ -1040,11 +2078,13 @@ def report_json_object( "unresolved_jobs": unresolved_jobs, "warnings": warnings, }, - "plausible": [finding_to_dict(finding) for finding in findings if finding.direction == "regression" and finding.classification == "plausible"], - "flaky": [finding_to_dict(finding) for finding in findings if finding.direction == "regression" and finding.classification == "flaky"], - "inconclusive": [finding_to_dict(finding) for finding in findings if finding.direction == "regression" 
and finding.classification == "inconclusive"], - "improvements": [finding_to_dict(finding) for finding in findings if finding.direction == "improvement"], } + payload.update(findings_json_payload(findings)) + if direct_suspects is not None: + payload["direct_suspects"] = [direct_suspect_to_dict(suspect) for suspect in direct_suspects] + if change_points is not None: + payload["change_points"] = [change_point_to_dict(change_point) for change_point in change_points] + return payload def finding_to_dict(finding: RegressionFinding) -> dict[str, Any]: @@ -1054,10 +2094,23 @@ def finding_to_dict(finding: RegressionFinding) -> dict[str, Any]: return data -def main() -> int: - args = parse_args() - repo_dir = str(Path(args.repo_dir).resolve()) - bench_cli = resolve_bench_cli(args.bench_cli) +def validate_args(args: argparse.Namespace) -> None: + if args.rota and (args.pr or args.current_pr or args.merge_commit or args.head_commit): + raise ScriptError("--rota cannot be combined with PR-specific arguments such as --pr, --current-pr, --merge-commit, or --head-commit.") + if args.history_days <= 0: + raise ScriptError("--history-days must be positive.") + if args.rota_days <= 0: + raise ScriptError("--rota-days must be positive.") + + +def emit_json_report(args: argparse.Namespace, payload: dict[str, Any]) -> None: + if not args.json_out: + return + json_path = Path(args.json_out) + json_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") + + +def run_pr_mode(args: argparse.Namespace, repo_dir: str, bench_cli: str) -> int: head_commit = args.head_commit or get_local_head_commit(repo_dir) pr_number = resolve_pr_number(args, head_commit, repo_dir) gate_overview = get_gate_overview(args.project, args.repo, pr_number, repo_dir) @@ -1125,22 +2178,141 @@ def main() -> int: ), "", ] - print("\n".join(header) + summarize_findings(findings, warnings)) - - if args.json_out: - json_path = Path(args.json_out) - json_path.write_text( - json.dumps( - report_json_object(pr_number, head_commit, merge_commit, baseline_commit, unresolved_jobs, findings, warnings), - indent=2, - sort_keys=True, + print( + "\n".join(header) + + summarize_findings( + findings, + warnings, + show_inconclusive=args.show_inconclusive, + show_flaky=args.show_flaky, + ) + ) + + emit_json_report( + args, + report_json_object( + pr_number, + head_commit, + merge_commit, + baseline_commit, + unresolved_jobs, + findings, + warnings, + mode="pr", + ), + ) + return 0 + + +def run_rota_mode(args: argparse.Namespace, repo_dir: str, bench_cli: str) -> int: + baseline_branch_local_name = args.baseline_branch_local_name or args.baseline_branch + branch_head_commit = resolve_git_commit(repo_dir, baseline_branch_local_name) + history_result = run_bench_query( + bench_cli, + build_rota_history_query(args.baseline_branch, args.history_days), + repo_dir, + ) + history_points = [ + point + for point in parse_history_points(history_result) + if point.job_name.startswith("pybench-") and point.guest_vm == "graalpython" + ] + series_by_key = select_preferred_rota_series(history_points) + if not series_by_key: + raise ScriptError("Bench Server query returned no pybench primary metrics on {}.".format(args.baseline_branch)) + recent_cutoff = datetime.now(timezone.utc) - timedelta(days=args.rota_days) + findings = collect_rota_findings( + series_by_key, + recent_cutoff, + args.threshold, + args.absolute_time_delta_floor, + args.min_history_points, + args.recent_history_points, + "regression", + ) + if args.show_improvements: + 
findings.extend( + collect_rota_findings( + series_by_key, + recent_cutoff, + args.threshold, + args.absolute_time_delta_floor, + args.min_history_points, + args.recent_history_points, + "improvement", ) - + "\n", - encoding="utf-8", ) + warnings = [] + change_points = collect_rota_change_points( + series_by_key, + findings, + args.threshold, + args.absolute_time_delta_floor, + ) + direct_suspects = collect_rota_direct_suspects(change_points, repo_dir) + change_point_annotations = collect_rota_graal_update_annotations( + change_points, + direct_suspect_group_keys(direct_suspects), + repo_dir, + ) + header = [ + "ROTA {} | head {} | recent window {}d | history {}d".format( + args.baseline_branch, + branch_head_commit[:12], + args.rota_days, + args.history_days, + ), + "Tracked pybench jobs in history: {} | benchmark series: {}".format( + len({point.job_name for point in history_points}), + len(series_by_key), + ), + "{} above {:.1f}%: {}".format( + "Candidate changes" if args.show_improvements else "Candidate regressions", + args.threshold, + len(findings), + ), + "", + ] + print( + "\n".join(header) + + summarize_findings( + findings, + warnings, + show_inconclusive=args.show_inconclusive, + show_flaky=args.show_flaky, + rota_direct_suspects=direct_suspects, + rota_change_points=change_points, + rota_change_point_annotations=change_point_annotations, + ) + ) + emit_json_report( + args, + report_json_object( + None, + branch_head_commit, + branch_head_commit, + branch_head_commit, + [], + findings, + warnings, + mode="rota", + direct_suspects=direct_suspects, + change_points=change_points, + ), + ) return 0 +def main() -> int: + args = parse_args() + validate_args(args) + repo_dir = str(Path(args.repo_dir).resolve()) + bench_cli = resolve_bench_cli(args.bench_cli) + if args.rota: + return run_rota_mode(args, repo_dir, bench_cli) + return run_pr_mode(args, repo_dir, bench_cli) + + if __name__ == "__main__": try: raise SystemExit(main()) From 2769f60e11ee42a1803c25ffd057326833067d65 Mon Sep 17 00:00:00 2001 From: Michael Simacek Date: Fri, 24 Apr 2026 14:53:45 +0200 Subject: [PATCH 3/7] Add a skill for rota benchmark analysis --- .../rota-bench-regression-analysis/SKILL.md | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 .agents/skills/rota-bench-regression-analysis/SKILL.md diff --git a/.agents/skills/rota-bench-regression-analysis/SKILL.md b/.agents/skills/rota-bench-regression-analysis/SKILL.md new file mode 100644 index 0000000000..e9a035abfd --- /dev/null +++ b/.agents/skills/rota-bench-regression-analysis/SKILL.md @@ -0,0 +1,76 @@ +--- +name: rota-bench-regression-analysis +description: Analyze recent GraalPy benchmark regressions on `master` as part of the weekly rota. Use when asked to analyze benchmarks for rota. +--- + +# Bench Regression Analysis + +## Use This Skill For +- Recent regression summaries from `scripts/compare_bench_regressions.py --rota`. +- Follow-up inspection of unattributed plausible change points. + +## Core Workflow +1. Run: +```bash +scripts/compare_bench_regressions.py --rota --json-out /tmp/compare_bench_regressions_rota.json +``` +2. Use the text output for the human summary and the JSON for precise inspection. +3. Focus on `plausible` regressions. Ignore `flaky` and `inconclusive` items unless they help explain a plausible shift. +4. 
Split the summary into:
+- `Attributed`
+- `Unattributed`
+
+## Useful JSON Queries
+```bash
+jq '.direct_suspects[] | {good_commit, bad_commit, bad_author_email, bad_subject}' \
+  /tmp/compare_bench_regressions_rota.json
+
+jq '[.change_points[] | select(.classification == "plausible")]' \
+  /tmp/compare_bench_regressions_rota.json
+```
+
+## Attributed Regressions
+- Start from `direct_suspects` in `/tmp/compare_bench_regressions_rota.json`.
+- For each suspect, keep the abbreviated bad commit ID, full author email, full commit subject, and the worst example benchmarks per suite, not the full list.
+- Prefer one worst example per affected suite, such as `micro`, `meso`, or `macro`.
+
+## Unattributed Regressions
+- Start from plausible `change_points` whose `(good_commit, bad_commit]` pair is not already covered by `direct_suspects`.
+- Inspect the range with:
+```bash
+git log --first-parent --reverse --format='%H%x09%ae%x09%s' GOOD..BAD
+git show --stat --summary --format=fuller BAD
+git diff --stat GOOD..BAD
+```
+- If needed, inspect individual commits in the range with `git show --stat --summary --format=fuller COMMIT`.
+
+## Attribution Rules
+- If the change point is an exact single-parent GraalPy commit and `mx.graalpython/suite.py` imports did not change, it can usually be attributed to that commit.
+- Changes to imports in `mx.graalpython/suite.py` can never be confidently attributed without bisecting Graal. Keep those unattributed and say so explicitly (including the Graal commit range).
+- If an unattributed first-parent range contains one plausible GraalPy code change and the rest are documentation, tests, retags, or other non-performance changes, attribute it to that one code change.
+- If the series is already shifted by an earlier attributed commit and a later unattributed range only preserves the new level, fold the later item into the earlier attribution.
+- Cross-configuration correlation matters.
+- If `native` shows an exact jump on one commit and `jvm` later shows the same benchmark shifted upward through a range containing that commit, treat them as likely the same cause unless the later range has a better candidate.
+- If both `jvm` and `native` jump at or immediately after a Graal import update, keep both under the same unattributed Graal-side cause.
+
+## Flakiness Check
+- Use Bench Server data when the unattributed item is small or suspicious.
+- Query the benchmark series with `bench-cli run -` and check whether the change is a clean step up that stays high, a one-point spike that immediately falls back, or a shift that was already present before the reported range.
+- A stable step change is a real regression candidate.
+- An isolated last-point bump with no supporting related regressions is usually watch-and-rerun material, not a strong attribution.
+
+## Bench Server Checks
+- Prefer querying only the specific benchmark and configuration under investigation.
+- Typical filters: `branch = master`, target benchmark, `host-vm = graalvm-ee`, target `host-vm-config`, `guest-vm = graalpython`, target `guest-vm-config`, `metric.name = time`, `commit.committer-ts last-n 30d`.
+- Reduce output to `commit.rev` and average metric value so the step pattern is easy to inspect.
+- `bench-cli` sometimes fails with 404 when the server is overloaded. If that happens, wait for a minute and try again.
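+
+A minimal query sketch, assuming a line-per-filter input format; the `nbody` benchmark and the `default` configurations are placeholders, and the exact syntax accepted by `bench-cli run -` may differ:
+```bash
+# Hypothetical query body; substitute the real benchmark and configuration names.
+bench-cli run - <<'EOF'
+branch = master
+benchmark = nbody
+host-vm = graalvm-ee
+host-vm-config = default
+guest-vm = graalpython
+guest-vm-config = default
+metric.name = time
+commit.committer-ts last-n 30d
+EOF
+```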
+
+## Output Contract
+- List findings first, not process notes.
+- Keep two top-level sections: `Attributed` and `Unattributed`.
+- In the attributed section, use this header format: `abcd1234efgh | author@oracle.com | Full subject`
+- In the unattributed section, say whether the item looks real, flaky, or likely the same cause as another attributed item.
+- Do not abbreviate commit subjects.
+- Keep author emails.
+- Abbreviate commit IDs to 12 characters.
+- Do not list every benchmark; only the worst examples from each affected suite.

From fa890b916caf0560f95f9bf793f4ed0eb68a76a9 Mon Sep 17 00:00:00 2001
From: Michael Simacek
Date: Sat, 25 Apr 2026 01:08:41 +0200
Subject: [PATCH 4/7] Automate bisecting benchmarks

---
 .../rota-bench-regression-analysis/SKILL.md |  21 +-
 bisect-benchmark.ini                        |   2 +-
 ci/python-bench.libsonnet                   |   4 +-
 mx.graalpython/mx_graalpython_bisect.py     |  38 +-
 scripts/bisect_benchmark_regression.py      | 640 ++++++++++++++++++
 5 files changed, 698 insertions(+), 7 deletions(-)
 create mode 100644 scripts/bisect_benchmark_regression.py

diff --git a/.agents/skills/rota-bench-regression-analysis/SKILL.md b/.agents/skills/rota-bench-regression-analysis/SKILL.md
index e9a035abfd..4f84da770d 100644
--- a/.agents/skills/rota-bench-regression-analysis/SKILL.md
+++ b/.agents/skills/rota-bench-regression-analysis/SKILL.md
@@ -18,7 +18,14 @@ scripts/compare_bench_regressions.py --rota --json-out /tmp/compare_bench_regressions_rota.json
 3. Focus on `plausible` regressions. Ignore `flaky` and `inconclusive` items unless they help explain a plausible shift.
 4. Split the summary into:
 - `Attributed`
-- `Unattributed`
+- `To bisect`
+- `To watch`
+5. Show the current summary.
+6. Execute the bisect script for each "to bisect" entry in parallel, then wait for all of them to finish.
+   The builds can take many hours without the script showing any output, so make sure you wait for them with a long timeout.
+   If running in codex, round-robin poll the processes with `write_stdin` and a one-hour timeout (the configuration might cap this at a lower timeout in practice).
+7. Collect the bisect results and move any benchmarks that the bisections attributed into the `Attributed` section.
+8. Show the final summary. Note any failed bisects.
 
 ## Useful JSON Queries
 ```bash
 jq '.direct_suspects[] | {good_commit, bad_commit, bad_author_email, bad_subject}' \
@@ -67,10 +74,16 @@ git diff --stat GOOD..BAD
 ## Output Contract
 - List findings first, not process notes.
-- Keep two top-level sections: `Attributed` and `Unattributed`.
+- Keep three top-level sections (omitting empty ones): `Attributed`, `To bisect`, and `To watch`.
 - In the attributed section, use this header format: `abcd1234efgh | author@oracle.com | Full subject`
-- In the unattributed section, say whether the item looks real, flaky, or likely the same cause as another attributed item.
+- Unattributed changes that look plausible go to "to bisect"; flaky ones go to "to watch".
+- In the "to bisect" section, add an invocation (don't execute it yet) of `scripts/bisect_benchmark_regression.py` that can bisect the item (use unabbreviated commits in this case).
+- In the "to watch" section, say whether the item looks flaky or is likely the same cause as another attributed item.
 - Do not abbreviate commit subjects.
 - Keep author emails.
 - Abbreviate commit IDs to 12 characters.
-- Do not list every benchmark; only the worst examples from each affected suite.
+- Do not list every benchmark if there are many; only the worst examples from each affected suite. If you didn't list all, say "and X others".
+
+## Guardrails
+- If neither the script nor you can find `bench-cli`, ask the user to provide it from the `bench-server` repo.
+- Don't submit more than 5 bisect jobs.
If there are more in the "to bisect" list, pick 5 that look the most serious and leave the rest as "to bisect". diff --git a/bisect-benchmark.ini b/bisect-benchmark.ini index c118098366..fc01ba2188 100644 --- a/bisect-benchmark.ini +++ b/bisect-benchmark.ini @@ -3,7 +3,7 @@ # Usage: # - Create a temporary branch based on the main branch (or the bad commit) # - Fill in this configuration file, preferably using the automated script -# graal-enterprise/graalpython-enterprise/scripts/create-bisect-config +# scripts/create_bisect_config.py # - Commit and push the file # - The push command output should give you a link to create a PR. Open it, but # don't create a PR. Instead, you should execute the job on your commit using diff --git a/ci/python-bench.libsonnet b/ci/python-bench.libsonnet index 81aba0fca1..a56bc066a2 100644 --- a/ci/python-bench.libsonnet +++ b/ci/python-bench.libsonnet @@ -240,7 +240,9 @@ downloads: downloads(self.os, self.arch), name: "bisect-benchmark", targets: ['bench'], - logs +: logs(self.os, self.arch), + logs +: logs(self.os, self.arch) + [ + "bisect-benchmark-result.json", + ], deploysArtifacts: true, packages +: packages(self.os, self.arch) + { "apache/ant": ">=1.9.4", diff --git a/mx.graalpython/mx_graalpython_bisect.py b/mx.graalpython/mx_graalpython_bisect.py index 6dcb78cf4b..b5033ac3ba 100644 --- a/mx.graalpython/mx_graalpython_bisect.py +++ b/mx.graalpython/mx_graalpython_bisect.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved. # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. # # The Universal Permissive License (UPL), Version 1.0 @@ -71,6 +71,8 @@ def print_line(l): GRAAL_DIR: GRAAL_ENTERPRISE_DIR, } +RESULTS_JSON_PATH = 'bisect-benchmark-result.json' + def get_commit(repo_path, ref='HEAD'): if repo_path: @@ -178,6 +180,22 @@ def summarize(self): .format(self.repo_name, self.bad_commit, get_message(self.repo_path, self.bad_commit))) return '' + def to_dict(self): + return { + 'repo_name': self.repo_name, + 'repo_path': str(self.repo_path), + 'commits': self.commits, + 'results': [result.to_dict() if result is not None else None for result in self.results], + 'good_index': self.good_index, + 'bad_index': self.bad_index, + 'good_commit': self.good_commit, + 'bad_commit': self.bad_commit, + 'dependency_results': { + str(index): dependency_result.to_dict() + for index, dependency_result in self.dependency_results.items() + }, + } + class BenchmarkResult(abc.ABC): def __init__(self, value, unit=None): @@ -201,6 +219,14 @@ def bound_is_significant(self, bad_result, epsilon): def is_good(self, good_result, bad_result): pass + def to_dict(self): + return { + 'kind': type(self).__name__, + 'value': self.value, + 'unit': self.unit, + 'display': str(self), + } + class LowerIsBetterResult(BenchmarkResult): def is_good(self, good_result, bad_result): @@ -358,6 +384,16 @@ def benchmark_callback(repo_path: Path, commit, bench_command=args.benchmark_com print() print(summary) + with open(RESULTS_JSON_PATH, 'w', encoding='utf-8') as result_file: + json.dump({ + 'bisect_id': bisect_id, + 'summary': summary, + 'visualization': visualization, + 'result': result.to_dict(), + 'build_url': os.environ.get('BUILD_URL'), + }, result_file, indent=2, sort_keys=True) + result_file.write('\n') + if args.rerun_with_commands: print('\n\nRerunning the good and bad commits with extra benchmark commands:') repo_path = DIR diff --git 
a/scripts/bisect_benchmark_regression.py b/scripts/bisect_benchmark_regression.py new file mode 100644 index 0000000000..6192c6218d --- /dev/null +++ b/scripts/bisect_benchmark_regression.py @@ -0,0 +1,640 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# The Universal Permissive License (UPL), Version 1.0 +# +# Subject to the condition set forth below, permission is hereby granted to any +# person obtaining a copy of this software, associated documentation and/or +# data (collectively the "Software"), free of charge and under any and all +# copyright rights in the Software, and any and all patent rights owned or +# freely licensable by each licensor hereunder covering either (i) the +# unmodified Software as contributed to or provided by such licensor, or (ii) +# the Larger Works (as defined below), to deal in both +# +# (a) the Software, and +# +# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +# one is included with the Software each a "Larger Work" to which the Software +# is contributed by such licensors), +# +# without restriction, including without limitation the rights to copy, create +# derivative works of, display, perform, and distribute the Software and make, +# use, sell, offer for sale, import, export, have made, and have sold the +# Software and the Larger Work(s), and to sublicense the foregoing rights on +# either these or other terms. +# +# This license is subject to the following condition: +# +# The above copyright notice and either this complete permission notice or at a +# minimum a reference to the UPL must be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
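+
+# Usage sketch (argument order as defined by parse_args() below; the job name is
+# the example from its help text and "meso:nbody" is a placeholder benchmark,
+# with GOOD_COMMIT/BAD_COMMIT standing in for real unabbreviated commits):
+#
+#   scripts/bisect_benchmark_regression.py \
+#       pybench-micro-graalvm_ee_default-post_merge-linux-amd64-jdk-latest \
+#       meso:nbody time GOOD_COMMIT BAD_COMMIT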
+ +from __future__ import annotations + +import argparse +import base64 +import json +import re +import shlex +import subprocess +import sys +import tempfile +import time + +from dataclasses import dataclass +from pathlib import Path +from urllib.error import HTTPError, URLError +from urllib.parse import parse_qs, quote, urlencode, urlparse +from urllib.request import Request, urlopen + + +CONFIG_FILENAME = "bisect-benchmark.ini" +RESULT_FILENAME = "bisect-benchmark-result.json" +BISECT_JOB_NAME = "bisect-benchmark" +JOB_SUBMISSION_REGEX = BISECT_JOB_NAME +BRANCH_SUPPORT_FILES = ( + Path("mx.graalpython") / "mx_graalpython_bisect.py", + Path("ci") / "python-bench.libsonnet", +) +DEFAULT_PROJECT = "G" +DEFAULT_REPOSITORY = "graalpython" +AUTH_FILE = Path.home() / ".ol" / "ola_auth.json" +ENUMERATION_POLL_SECONDS = 5 +BUILD_POLL_SECONDS = 15 +ENUMERATION_TIMEOUT_SECONDS = 20 * 60 +BUILD_TIMEOUT_SECONDS = 24 * 60 * 60 +CONFIG_VIEW_BITBUCKET_RE = re.compile(r"^\s*[•*]?\s*bitbucket:\s*(\S+)\s*$", re.MULTILINE) + +DEBUG = False + + +class ScriptError(RuntimeError): + pass + + +@dataclass(frozen=True) +class BuildRecord: + key: str + state: str + url: str + date_added: int + build_number: int | None + request_id: int | None + + @property + def sort_key(self) -> tuple[int, int, int]: + return ( + self.date_added, + self.build_number if self.build_number is not None else -1, + self.request_id if self.request_id is not None else -1, + ) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Generate and optionally submit a bisect-benchmark CI workflow for a benchmark regression.", + ) + parser.add_argument("benchmark_job_name", help="Benchmark job key, for example pybench-micro-graalvm_ee_default-post_merge-linux-amd64-jdk-latest.") + parser.add_argument("benchmark_name", help="Benchmark selector to narrow the benchmark command to a single benchmark.") + parser.add_argument("metric", help="Benchmark metric name, or WORKS.") + parser.add_argument("good_commit", help="Known good GraalPy commit or ref.") + parser.add_argument("bad_commit", help="Known bad GraalPy commit or ref.") + parser.add_argument("--config-only", action="store_true", help="Print the generated bisect config and exit.") + parser.add_argument("--force-rebuild", action="store_true", help="Submit a fresh bisect job even if one already exists.") + parser.add_argument("--debug", action="store_true", help="Print progress information to stderr.") + parser.add_argument("--repo-dir", default=".", help=argparse.SUPPRESS) + return parser.parse_args() + + +def debug(message: str) -> None: + if DEBUG: + print(message, file=sys.stderr, flush=True) + + +def run_command(command: list[str], *, cwd: str | Path | None = None) -> str: + process = subprocess.run( + command, + cwd=str(cwd) if cwd is not None else None, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=False, + ) + if process.returncode != 0: + raise ScriptError( + "Command failed with exit code {}:\n{}\n{}".format( + process.returncode, + " ".join(command), + process.stderr.strip(), + ) + ) + return process.stdout + + +def get_repo_root(repo_dir: str | Path) -> Path: + return Path(run_command(["git", "rev-parse", "--show-toplevel"], cwd=repo_dir).strip()) + + +def resolve_commit(repo_dir: str | Path, revision: str) -> str: + return run_command(["git", "rev-parse", revision], cwd=repo_dir).strip() + + +def build_branch_name(job_name: str, benchmark_name: str, metric: 
str, good_commit: str, bad_commit: str) -> str: + slug = "_".join( + [ + job_name, + benchmark_name, + metric, + good_commit, + bad_commit, + ] + ) + return "bisect/{}".format(slug) + + +def parse_build_number(url: str) -> int | None: + if "/builders/ci_executor/builds/" not in url: + return None + tail = url.rstrip("/").split("/")[-1] + return int(tail) if tail.isdigit() else None + + +def parse_request_id(url: str) -> int | None: + parsed = urlparse(url) + request_id = parse_qs(parsed.query).get("brid", [None])[0] + return int(request_id) if request_id and request_id.isdigit() else None + + +def parse_build_record(value: dict[str, object]) -> BuildRecord: + url = str(value["url"]) + return BuildRecord( + key=str(value["key"]), + state=str(value["state"]), + url=url, + date_added=int(value.get("dateAdded") or 0), + build_number=parse_build_number(url), + request_id=parse_request_id(url), + ) + + +def get_matching_builds(repo_dir: str | Path, commit: str, job_name: str) -> list[BuildRecord]: + output = run_command(["gdev-cli", "bitbucket", "get-builds", "-c", commit, "--all", "--json"], cwd=repo_dir) + data = json.loads(output) + return [ + parse_build_record(value) + for value in data.get("values", []) + if value.get("key") == job_name + ] + + +def pick_newest_build(builds: list[BuildRecord]) -> BuildRecord | None: + if not builds: + return None + return max(builds, key=lambda build: build.sort_key) + + +def read_text_from_url(url: str, headers: dict[str, str] | None = None) -> str: + request_headers = {"User-Agent": "bisect-benchmark-regression"} + if headers: + request_headers.update(headers) + request = Request(url, headers=request_headers) + try: + with urlopen(request, timeout=60) as response: + return response.read().decode("utf-8") + except (HTTPError, URLError) as exc: + raise ScriptError("Failed to fetch {}: {}".format(url, exc)) from exc + + +def fetch_uploaded_log_text(build_url: str, filename: str) -> str: + artifact_url = "{}/steps/LogfileUploader/logs/{}/text".format(build_url.rstrip("/"), quote(filename, safe="")) + return read_text_from_url(artifact_url) + + +def get_reference_build(repo_dir: str | Path, job_name: str, bad_commit: str, good_commit: str) -> BuildRecord: + for commit in (bad_commit, good_commit): + build = pick_newest_build( + [record for record in get_matching_builds(repo_dir, commit, job_name) if record.build_number is not None] + ) + if build is not None: + return build + raise ScriptError( + "Could not find any completed '{}' builds for {} or {}.".format(job_name, bad_commit, good_commit) + ) + + +def benchmark_selector_for_command(benchmark_name: str) -> str: + if ":" in benchmark_name: + return benchmark_name.rsplit(":", 1)[-1] + if "." 
in benchmark_name: + return benchmark_name.rsplit(".", 1)[-1] + return benchmark_name + + +def narrow_command(benchmark_command: str, benchmark_name: str) -> str: + components = shlex.split(benchmark_command) + try: + benchmark_args_start = components.index("benchmark") + 1 + benchmark_args_end = components.index("--") + before = components[:benchmark_args_start] + after = components[benchmark_args_end:] + benchmark_args = components[benchmark_args_start:benchmark_args_end] + benchmark_arg = next(arg for arg in benchmark_args if not arg.startswith("-")) + suite = re.sub(r":.*", "", benchmark_arg) + benchmark_args[benchmark_args.index(benchmark_arg)] = "{}:{}".format( + suite, + benchmark_selector_for_command(benchmark_name), + ) + return shlex.join(before + benchmark_args + after) + except (StopIteration, ValueError) as exc: + raise ScriptError("Could not determine how to narrow benchmark command '{}'.".format(benchmark_command)) from exc + + +def extract_commands(log: str, benchmark_name: str) -> tuple[str, str]: + build_commands = re.findall(r"\bRunning (mx\b.*\bbuild\b.*)", log) + benchmark_commands = re.findall(r"\bRunning (mx\b.*\bbenchmark\b.*)", log) + if not build_commands: + raise ScriptError("Could not find a build command in the benchmark build log.") + if not benchmark_commands: + raise ScriptError("Could not find a benchmark command in the benchmark build log.") + return build_commands[-1], narrow_command(benchmark_commands[-1], benchmark_name) + + +def benchmark_match_score(candidate: str, selector: str) -> int: + if candidate == selector: + return 100 + if candidate.rsplit(".", 1)[-1] == selector: + return 90 + if candidate.rsplit(":", 1)[-1] == selector: + return 80 + if candidate.endswith(".{}".format(selector)): + return 70 + if candidate.endswith(":{}".format(selector)): + return 60 + return 0 + + +def resolve_results_benchmark_name(build_url: str, selector: str, metric: str) -> str | None: + if metric == "WORKS": + return None + data = json.loads(fetch_uploaded_log_text(build_url, "bench-results.json")) + candidates: list[tuple[int, int, str]] = [] + for index, document in enumerate(data.get("queries", [])): + if document.get("metric.name") != metric: + continue + benchmark = document.get("benchmark") + if not isinstance(benchmark, str): + continue + score = benchmark_match_score(benchmark, selector) + if score > 0: + candidates.append((score, index, benchmark)) + if not candidates: + return None + candidates.sort() + best_score = candidates[-1][0] + best_matches = [benchmark for score, _index, benchmark in candidates if score == best_score] + if len(set(best_matches)) != 1: + return None + best_match = best_matches[-1] + return best_match if best_match != selector else None + + +def build_config_text( + build_command: str, + benchmark_command: str, + good_commit: str, + bad_commit: str, + metric: str, + enterprise: bool, + benchmark_name: str | None, +) -> str: + lines = [ + "[bisect-benchmark]", + "build_command = {}".format(build_command), + "benchmark_command = {}".format(benchmark_command), + ] + if benchmark_name: + lines.append("benchmark_name = {}".format(benchmark_name)) + lines.extend( + [ + "bad = {}".format(bad_commit), + "good = {}".format(good_commit), + "enterprise = {}".format("true" if enterprise else "false"), + "benchmark_metric = {}".format(metric), + ] + ) + return "\n".join(lines) + "\n" + + +def get_bitbucket_base_url(repo_dir: str | Path) -> str: + output = run_command(["gdev-cli", "config-view"], cwd=repo_dir) + match = 
CONFIG_VIEW_BITBUCKET_RE.search(output) + if not match: + raise ScriptError("Could not determine Bitbucket base URL from gdev-cli config-view.") + return match.group(1) + + +def get_bitbucket_token(bitbucket_base_url: str) -> str: + with AUTH_FILE.open(encoding="utf-8") as auth_file: + data = json.load(auth_file) + try: + encoded_token = data["auths"][bitbucket_base_url]["token"] + except KeyError as exc: + raise ScriptError("Could not find a token for {} in {}.".format(bitbucket_base_url, AUTH_FILE)) from exc + try: + return base64.b64decode(encoded_token).decode("utf-8") + except Exception as exc: # pylint: disable=broad-except + raise ScriptError("Could not decode the Bitbucket token from {}.".format(AUTH_FILE)) from exc + + +def enumerate_commit( + bitbucket_base_url: str, + bitbucket_token: str, + project: str, + repository: str, + commit: str, + branch_name: str, + force: bool, +) -> dict[str, object]: + query = urlencode( + { + "branch": branch_name, + "force": "true" if force else "false", + "toBranch": "null", + } + ) + url = ( + "{}/rest/ci/1.0/base/projects/{}/repos/{}/enumerate/{}?{}".format( + bitbucket_base_url.rstrip("/"), + project, + repository, + quote(commit, safe=""), + query, + ) + ) + debug("Enumerate {}".format(url)) + return json.loads(read_text_from_url(url, headers={"Authorization": "Bearer {}".format(bitbucket_token)})) + + +def get_remote_branch_head(repo_dir: str | Path, branch_name: str) -> str | None: + output = run_command(["git", "ls-remote", "--heads", "origin", branch_name], cwd=repo_dir).strip() + if not output: + return None + return output.split()[0] + + +def wait_for_enumeration( + repo_dir: str | Path, + project: str, + repository: str, + branch_name: str, + commit: str, + required_job: str, +) -> None: + bitbucket_base_url = get_bitbucket_base_url(repo_dir) + bitbucket_token = get_bitbucket_token(bitbucket_base_url) + deadline = time.monotonic() + ENUMERATION_TIMEOUT_SECONDS + last_status: tuple[object, ...] 
| None = None
+    while time.monotonic() < deadline:
+        data = enumerate_commit(bitbucket_base_url, bitbucket_token, project, repository, commit, branch_name, False)
+        enumeration = data.get("enumeration") or []
+        status = (
+            data.get("finished"),
+            data.get("successful"),
+            data.get("buildnumber"),
+            data.get("url"),
+            len(enumeration),
+        )
+        if status != last_status:
+            debug(
+                "Enumeration state: finished={} successful={} buildnumber={} url={} jobs={}".format(
+                    data.get("finished"),
+                    data.get("successful"),
+                    data.get("buildnumber"),
+                    data.get("url"),
+                    len(enumeration),
+                )
+            )
+            last_status = status
+        if any(job.get("name") == required_job for job in enumeration):
+            return
+        time.sleep(ENUMERATION_POLL_SECONDS)
+    raise ScriptError(
+        "Commit {} was not enumerated with '{}' within {} seconds.".format(
+            commit, required_job, ENUMERATION_TIMEOUT_SECONDS
+        )
+    )
+
+
+def submit_bisect_job(repo_dir: str | Path, branch_name: str, commit: str) -> None:
+    debug("Submitting {} on {} ({})".format(BISECT_JOB_NAME, branch_name, commit))
+    run_command(
+        [
+            "gdev-cli",
+            "bitbucket",
+            "run-gates",
+            "-p",
+            DEFAULT_PROJECT,
+            "-r",
+            DEFAULT_REPOSITORY,
+            "-b",
+            branch_name,
+            "-c",
+            commit,
+            "-rf",
+            JOB_SUBMISSION_REGEX,
+        ],
+        cwd=repo_dir,
+    )
+
+
+def is_terminal_state(state: str) -> bool:
+    return state not in {"INPROGRESS", "PENDING", "QUEUED", "SCHEDULED"}
+
+
+def wait_for_bisect_build(
+    repo_dir: str | Path,
+    commit: str,
+    previous_marker: tuple[int, int, int] | None = None,
+) -> BuildRecord:
+    deadline = time.monotonic() + BUILD_TIMEOUT_SECONDS
+    printed_url = False
+    last_status: tuple[object, ...] | None = None
+    while time.monotonic() < deadline:
+        builds = get_matching_builds(repo_dir, commit, BISECT_JOB_NAME)
+        if previous_marker is not None:
+            builds = [build for build in builds if build.sort_key > previous_marker]
+        build = pick_newest_build(builds)
+        if build is not None:
+            status = (build.state, build.url, build.build_number, build.request_id)
+            if status != last_status:
+                debug(
+                    "Bisect build state: state={} build_number={} request_id={} url={}".format(
+                        build.state,
+                        build.build_number,
+                        build.request_id,
+                        build.url,
+                    )
+                )
+                last_status = status
+            if build.build_number is not None and not printed_url:
+                print(build.url, flush=True)
+                printed_url = True
+            if build.build_number is not None and is_terminal_state(build.state):
+                return build
+        time.sleep(BUILD_POLL_SECONDS)
+    raise ScriptError("Timed out waiting for '{}' on commit {}.".format(BISECT_JOB_NAME, commit))
+
+
+def write_temp_branch(repo_dir: Path, branch_name: str, config_text: str) -> str:
+    origin_url = run_command(["git", "remote", "get-url", "origin"], cwd=repo_dir).strip()
+    base_commit = resolve_commit(repo_dir, "HEAD")
+    commit_message = "Add bisect benchmark config"
+    debug("Creating branch {} from {}".format(branch_name, base_commit))
+    with tempfile.TemporaryDirectory(prefix="bisect-benchmark-") as temp_root:
+        clone_dir = Path(temp_root) / "repo"
+        run_command(["git", "clone", str(repo_dir), str(clone_dir)], cwd=repo_dir)
+        run_command(["git", "remote", "set-url", "origin", origin_url], cwd=clone_dir)
+        for key in ("user.name", "user.email"):
+            try:
+                value = run_command(["git", "config", "--get", key], cwd=repo_dir).strip()
+            except ScriptError:
+                continue
+            if value:
+                run_command(["git", "config", key, value], cwd=clone_dir)
+        run_command(["git", "checkout", "-b", branch_name, base_commit], cwd=clone_dir)
+        (clone_dir / CONFIG_FILENAME).write_text(config_text, encoding="utf-8")
+        for relative_path in BRANCH_SUPPORT_FILES:
+            source_path = repo_dir / relative_path
+            destination_path = clone_dir / relative_path
+            destination_path.parent.mkdir(parents=True, exist_ok=True)
+            destination_path.write_text(source_path.read_text(encoding="utf-8"), encoding="utf-8")
+        run_command(["git", "add", CONFIG_FILENAME], cwd=clone_dir)
+        run_command(["git", "add", *[str(path) for path in BRANCH_SUPPORT_FILES]], cwd=clone_dir)
+        run_command(["git", "commit", "-m", commit_message], cwd=clone_dir)
+        commit = resolve_commit(clone_dir, "HEAD")
+        run_command(["git", "push", "origin", "HEAD:refs/heads/{}".format(branch_name)], cwd=clone_dir)
+        debug("Pushed branch {} at {}".format(branch_name, commit))
+        return commit
+
+
+def render_bisect_results(build: BuildRecord) -> str:
+    data = json.loads(fetch_uploaded_log_text(build.url, RESULT_FILENAME))
+    summary = data.get("summary") or ""
+    visualization = data.get("visualization") or ""
+    if summary and visualization:
+        return "{}\n\n{}".format(summary, visualization)
+    if summary:
+        return summary
+    if visualization:
+        return visualization
+    return json.dumps(data, indent=2, sort_keys=True)
+
+
+def generate_config(
+    repo_dir: Path,
+    benchmark_job_name: str,
+    benchmark_name: str,
+    metric: str,
+    good_commit: str,
+    bad_commit: str,
+) -> str:
+    reference_build = get_reference_build(repo_dir, benchmark_job_name, bad_commit, good_commit)
+    debug("Using reference build {} ({})".format(reference_build.build_number, reference_build.url))
+    build_log = run_command(["gdev-cli", "buildbot", "get-log", str(reference_build.build_number)], cwd=repo_dir)
+    build_command, benchmark_command = extract_commands(build_log, benchmark_name)
+    results_benchmark_name = resolve_results_benchmark_name(reference_build.url, benchmark_name, metric)
+    enterprise = "enterprise" in build_command
+    return build_config_text(
+        build_command=build_command,
+        benchmark_command=benchmark_command,
+        good_commit=good_commit,
+        bad_commit=bad_commit,
+        metric=metric,
+        enterprise=enterprise,
+        benchmark_name=results_benchmark_name,
+    )
+
+
+def main() -> int:
+    args = parse_args()
+    global DEBUG
+    DEBUG = args.debug
+    repo_dir = get_repo_root(args.repo_dir)
+    debug("Repo root: {}".format(repo_dir))
+    good_commit = resolve_commit(repo_dir, args.good_commit)
+    bad_commit = resolve_commit(repo_dir, args.bad_commit)
+    debug("Resolved good={} bad={}".format(good_commit, bad_commit))
+    config_text = generate_config(
+        repo_dir=repo_dir,
+        benchmark_job_name=args.benchmark_job_name,
+        benchmark_name=args.benchmark_name,
+        metric=args.metric,
+        good_commit=good_commit,
+        bad_commit=bad_commit,
+    )
+
+    if args.config_only:
+        print(config_text, end="")
+        return 0
+
+    branch_name = build_branch_name(
+        args.benchmark_job_name,
+        args.benchmark_name,
+        args.metric,
+        good_commit,
+        bad_commit,
+    )
+    debug("Branch name: {}".format(branch_name))
+
+    branch_head = get_remote_branch_head(repo_dir, branch_name)
+    if branch_head is None:
+        branch_head = write_temp_branch(repo_dir, branch_name, config_text)
+        wait_for_enumeration(
+            repo_dir,
+            DEFAULT_PROJECT,
+            DEFAULT_REPOSITORY,
+            branch_name,
+            branch_head,
+            BISECT_JOB_NAME,
+        )
+        submit_bisect_job(repo_dir, branch_name, branch_head)
+        build = wait_for_bisect_build(repo_dir, branch_head)
+    else:
+        debug("Remote branch head: {}".format(branch_head))
+        existing_builds = get_matching_builds(repo_dir, branch_head, BISECT_JOB_NAME)
+        if existing_builds and not args.force_rebuild:
+            build = wait_for_bisect_build(repo_dir, branch_head)
+        else:
+            wait_for_enumeration(
+                repo_dir,
+                DEFAULT_PROJECT,
+                DEFAULT_REPOSITORY,
+                branch_name,
+                branch_head,
+                BISECT_JOB_NAME,
+            )
+            previous_build = pick_newest_build(existing_builds)
+            previous_marker = previous_build.sort_key if previous_build is not None else None
+            submit_bisect_job(repo_dir, branch_name, branch_head)
+            build = wait_for_bisect_build(repo_dir, branch_head, previous_marker=previous_marker)
+
+    if build.state != "SUCCESSFUL":
+        print("Job failed.")
+        return 1
+
+    print(render_bisect_results(build))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From ec7fbb2dc8aa4c66afc35d473b63c9f495f3601e Mon Sep 17 00:00:00 2001
From: Michael Simacek
Date: Wed, 29 Apr 2026 11:36:05 +0200
Subject: [PATCH 5/7] Refactor rota skills

---
 .agents/skills/github-pr-mirror/SKILL.md      |  89 ++-----------
 .../skills/graalpython-bitbucket-pr/SKILL.md  | 124 ++++++++++++++++++
 .agents/skills/graalpython-rota/SKILL.md      |  67 ----------
 .../skills/rota-check-periodic-jobs/SKILL.md  |  53 ++++++++
 .agents/skills/rota-update-import/SKILL.md    |  34 +++++
 5 files changed, 224 insertions(+), 143 deletions(-)
 create mode 100644 .agents/skills/graalpython-bitbucket-pr/SKILL.md
 delete mode 100644 .agents/skills/graalpython-rota/SKILL.md
 create mode 100644 .agents/skills/rota-check-periodic-jobs/SKILL.md
 create mode 100644 .agents/skills/rota-update-import/SKILL.md

diff --git a/.agents/skills/github-pr-mirror/SKILL.md b/.agents/skills/github-pr-mirror/SKILL.md
index 82a0962abd..e04c567727 100644
--- a/.agents/skills/github-pr-mirror/SKILL.md
+++ b/.agents/skills/github-pr-mirror/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: github-pr-mirror
-description: Mirror an external GitHub pull request into the internal GraalPython Bitbucket review flow, including OCA label checks, Jira creation or reuse, preserving PR commits, pre-commit cleanup, Bitbucket PR creation, Graal Bot task handling, and gate follow-up.
+description: Mirror an external GitHub pull request into the internal GraalPython Bitbucket review flow, including OCA label checks, Jira creation or reuse, preserving PR commits, pre-commit cleanup, and handoff to the shared GraalPython Bitbucket PR flow for PR creation, Graal Bot tasks, gates, and fixes.
 ---
 
 # GitHub PR Mirror
@@ -12,8 +12,7 @@ This workflow intentionally preserves the original GitHub PR commits. Do not reb
 
 ## Companion Skills
 
 - Use the `jira` skill when creating or transitioning Jira issues.
-- Use the `bitbucket` skill when creating the Bitbucket PR or managing comments/tasks.
-- Use the `buildbot` skill when starting, watching, or investigating gates.
+- Use the `graalpython-bitbucket-pr` skill after the mirror branch is ready, to create the Bitbucket PR, handle Graal Bot tasks, start/watch gates, and fix or report gate failures.
 
 ## Inputs
 
@@ -129,80 +128,18 @@ git commit -m "[GR-<number>] Apply pre-commit fixes for GitHub PR#<number>"
 
 Only commit mechanical pre-commit output here. If pre-commit reveals non-mechanical problems, fix them in a separate follow-up commit when trivial; otherwise stop and ask the user.
 
-### 5. Push to Bitbucket and create the PR
+### 5. Create the Bitbucket PR and follow gates
 
-Push the branch to the Bitbucket remote:
+Use the `graalpython-bitbucket-pr` skill with these inputs:
+- Source branch: `github-pr/<number>`
+- Bitbucket remote: `<bitbucket-remote>`
+- Target branch: `<target-branch>`
+- Title: `[<GR-key>] GitHub PR#<number>: <title>`
+- Description: `Mirrors GitHub <PR-URL> for internal review.`
+- Project/repository: `G` / `graalpython`
+- Reviewers: repo-level default reviewers
 
-```bash
-git push <bitbucket-remote> github-pr/<number>
-```
-
-Create the Bitbucket PR with default reviewers (listed in repo-level AGENTS.md):
-
-```bash
-gdev-cli bitbucket create-pr \
-  -p G \
-  -r graalpython \
-  -fb github-pr/<number> \
-  -tb <target-branch> \
-  -t "[<GR-key>] GitHub PR#<number>: <title>" \
-  -rv "<default-reviewers>" \
-  -d "Mirrors GitHub <PR-URL> for internal review." \
-  --gate
-```
-
-Capture and print the Bitbucket PR URL. If `--gate` is not accepted or fails transiently, create the PR first, then start gates with:
-
-```bash
-gdev-cli buildbot start-gate -p G -r graalpython -pr <PR-ID>
-```
-
-### 6. Handle Graal Bot comments and tasks
-
-Wait about 30 seconds after PR creation, then list comments and tasks:
-
-```bash
-gdev-cli bitbucket comment list -p G -r graalpython -pr <PR-ID> --all --json
-gdev-cli bitbucket task list -p G -r graalpython -pr <PR-ID> --json
-```
-
-For tasks or comments authored by Graal Bot:
-- Resolve tasks that are clearly administrative and already satisfied by this mirror workflow.
-- Changelog tasks are normally resolvable for bugfixes or small compatibility additions. Ask the user before resolving them for user-facing API/option changes or large features.
-
-Resolve a task only when justified:
-
-```bash
-gdev-cli bitbucket task resolve -p G -r graalpython -pr <PR-ID> -cm "<comment>"
-```
-
-### 7. Watch gates
-
-Use sparse polling because full gates can take about an hour. Prefer structured output for status decisions:
-
-```bash
-gdev-cli buildbot gate-overview -p G -r graalpython -pr <PR-ID> --json
-gdev-cli buildbot gate-builds -p G -r graalpython -pr <PR-ID> --json
-```
-
-Human-readable output is fine for reporting, but avoid parsing it when JSON is available.
-
-Suggested cadence:
-- First check after PR creation and bot handling.
-- Then every 10 minutes while many jobs are running.
-- When fewer than about 5 jobs remain, poll every 5 minutes.
-- There's no need to send progress updates while gates are still running. Poll silently
-- If there are failures, stop polling and move to fixing/reporting the failure.
-
-Treat the gate as successful only when there are no running and no failed gate builds. Other PR vetoes, such as reviewer approval, merge queue state, or GitHub mirroring consideration, are not gate failures.
-
-If gates fail:
-- Inspect failing build logs with the `buildbot` skill.
-- Fix trivial issues yourself, such as style output, generated pre-commit fallout, missing `@TruffleBoundary`, obvious test-selector mistakes, or small import/order problems.
-- Commit fixes on top of the mirrored branch, push again, and restart or rerun gates as appropriate.
-- Report non-trivial semantic failures, broad compatibility failures, or failures that require product judgment to the user with the Bitbucket PR URL, Jira key, and concise failure summary.
-
-Do not leave a long sleep process running after an interruption or handoff. If monitoring is interrupted, clean up any background sleep/polling process you started, then resume with a fresh status check.
+Let that skill create the PR, handle Graal Bot comments/tasks, start or verify gates, watch gates, and fix or report failures.
 
 ## Final Report
@@ -213,4 +150,4 @@ Always report:
 
 ## Guardrails
 - Don't comment on the github PR unless asked. Never mention the internal bitbucket/buildbot/etc URLs in comments on github.
-- Do not stop monitoring the gates after they are created until they finish, fail, or become blocked by tooling. Sleep in 10 minute intervals by default.
+- After handing off to `graalpython-bitbucket-pr`, do not stop monitoring the gates until they finish, fail, or become blocked by tooling.
diff --git a/.agents/skills/graalpython-bitbucket-pr/SKILL.md b/.agents/skills/graalpython-bitbucket-pr/SKILL.md
new file mode 100644
index 0000000000..af57a9e896
--- /dev/null
+++ b/.agents/skills/graalpython-bitbucket-pr/SKILL.md
@@ -0,0 +1,124 @@
+---
+name: graalpython-bitbucket-pr
+description: Create or continue a GraalPython Bitbucket pull request and drive it through Graal Bot tasks, gate start, gate monitoring, failure investigation, fixes, pushes, and gate reruns. Use after a branch is ready for internal GraalPython review, or when an automation command has already created the PR and the remaining work is task cleanup and gate follow-up.
+---
+
+# GraalPython Bitbucket PR
+
+## Overview
+Create or continue a GraalPython Bitbucket PR, resolve administrative Graal Bot tasks, start or verify gates, watch gates, and fix or report failures.
+
+## Companion Skills
+- Use the `bitbucket` skill for PR creation, comments, and tasks.
+- Use the `buildbot` skill for starting, watching, rerunning, and investigating gates.
+
+## Inputs
+Required for new PRs:
+- Source branch already pushed or ready to push.
+- Target branch.
+- PR title.
+- PR description.
+- Jira key in the PR title.
+
+Required for existing PRs:
+- Bitbucket PR ID or URL.
+
+Defaults:
+- Project/repository: `G` / `graalpython`.
+- Reviewers: use the repo-level default reviewers unless the caller supplies a different list.
+- Target branch: `master`.
+
+## Workflow
+
+### 1. Push and create the PR, if needed
+
+If the source branch is not pushed yet:
+
+```bash
+git push <remote> <source-branch>
+```
+
+Create the Bitbucket PR:
+
+```bash
+gdev-cli bitbucket create-pr \
+  -p G \
+  -r graalpython \
+  -fb <source-branch> \
+  -tb <target-branch> \
+  -t "<title>" \
+  -rv "<reviewers>" \
+  -d "<description>" \
+  --gate
+```
+
+Capture the Bitbucket PR URL and ID. If `--gate` is not accepted or fails transiently, create the PR first, then start gates with:
+
+```bash
+gdev-cli buildbot start-gate -p G -r graalpython -pr <PR-ID>
+```
+
+If the PR already exists, capture the PR URL and ID, then continue with task handling and gate checks.
+
+### 2. Handle Graal Bot comments and tasks
+
+Wait about 15 seconds after PR creation, then list comments and tasks:
+
+```bash
+gdev-cli bitbucket comment list -p G -r graalpython -pr <PR-ID> --all --json
+gdev-cli bitbucket task list -p G -r graalpython -pr <PR-ID> --json
+```
+
+For tasks or comments authored by Graal Bot:
+- Resolve tasks that are clearly administrative and already satisfied by the PR workflow.
+- There should be a task regarding the changelog. Changelog items are usually not needed for bugfixes and compatibility improvements.
+  They are needed for added context options, changed public APIs in the `polyglot` module, or large features. If a changelog item is needed
+  but not present, suggest one and notify the user.
+
+Resolve a task only when justified:
+
+```bash
+gdev-cli bitbucket task resolve -p G -r graalpython -pr <PR-ID> -cm "<comment>"
+```
+
+### 3. Watch gates
+
+Use sparse polling because full gates can take about an hour. Prefer structured output for status decisions:
+
+```bash
+gdev-cli buildbot gate-overview -p G -r graalpython -pr <PR-ID> --json
+gdev-cli buildbot gate-builds -p G -r graalpython -pr <PR-ID> --json
+```
+
+Human-readable output is fine for reporting, but avoid parsing it when JSON is available.
+
+Suggested cadence:
+- First check after PR creation and bot handling.
+- Then every 10 minutes while many jobs are running.
+- When fewer than about 5 jobs remain, poll every 5 minutes.
+- Poll silently while gates are still running.
+- If there are failures, try to diagnose them as soon as they appear.
+
+Treat the gate as successful only when there are no running and no failed gate builds. Other PR vetoes, such as reviewer approval, merge queue state, or GitHub mirroring consideration, are not gate failures.
+
+### 4. Fix or report gate failures
+
+If gates fail:
+- Inspect failing build logs with the `buildbot` skill.
+- Fix trivial issues yourself, such as style output, generated pre-commit fallout, missing `@TruffleBoundary`, obvious test-selector mistakes, or small import/order problems.
+- Commit fixes on top of the PR branch, push again, and restart or rerun gates as appropriate.
+- Report non-trivial semantic failures, broad compatibility failures, or failures that require product judgment to the user with the Bitbucket PR URL, Jira key, and concise failure summary.
+- Many gate failures are transient. Use your judgment to determine whether a failure might be transient; if so, restart the gate and go back to polling.
+- Typical signs of a transient failure:
+  - Infrastructure issues in the worker setup, git checkout, or cleanup.
+  - A single failed test on one platform that succeeded on the other platforms.
+  - Test failures in tests involving weakrefs, subprocesses, or multi-threading in PRs that didn't touch those subsystems.
+
+Do not leave a long sleep process running after an interruption or handoff. If monitoring is interrupted, clean up any background sleep/polling process you started, then resume with a fresh status check.
+
+## Final Report
+Always report:
+- Bitbucket PR URL
+- Gate status
+- Any unresolved failures or Graal Bot tasks
+- Any transient issues encountered, with links to failed builds
diff --git a/.agents/skills/graalpython-rota/SKILL.md b/.agents/skills/graalpython-rota/SKILL.md
deleted file mode 100644
index 8ad580a6d2..0000000000
--- a/.agents/skills/graalpython-rota/SKILL.md
+++ /dev/null
@@ -1,67 +0,0 @@
----
-name: graalpython-rota
-description: Run GraalPy ROTA maintenance workflows for (1) import update pull requests and (2) triage of recent periodic job failures in Jira. Use when asked to perform or guide recurring ROTA tasks from `docs/contributor/ROTA.md`, including branch setup, `mx` update commands, PR creation with reviewers/gates via `gdev-cli bitbucket`, and date-bounded periodic-failure issue triage via `gdev-cli jira`.
----
-
-# GraalPy ROTA
-
-## Overview
-Execute recurring GraalPy ROTA tasks with exact commands and strict output structure. Prefer the procedures in this skill.
-
-## Choose Workflow
-- Use `Import update` when asked to refresh imports and open the standard PR.
-- Use `Recent periodic issues` when asked to triage periodic job failures in Jira.
-
-## Import Update Workflow
-1. Run the automated branch setup, import update, GitHub unittest-tag refresh, enterprise unittest-tag refresh, push, and standard PR creation:
-```bash
-mx python-update-import --rota
-```
-2. If the command reports that `../graal-enterprise/graalpython-enterprise` is missing, stop and ask the user to provide that checkout.
-3. Inspect the two generated commits and the created PR for plausibility. Expect mostly additions, not removals in the combined unittest-tag commit.
-4. Use `gdev-cli bitbucket` to start gates on the created PR. Reviewer assignment comes from the default `gdev-cli` configuration.
-5. Fix gate failures and push updates until gates pass.
-
-## Recent Periodic Issues Workflow
-1. Verify creator identity mapping:
-- Treat `ol-automation_ww` as Jira username `olauto`.
-- If query returns zero results, test both identities, then keep `creator = olauto` once verified.
-
-2. Filter to recent periodic job failures, excluding in progress or closed.
-- Default to the last 14 days unless user specifies otherwise.
-- Always state concrete start/end calendar dates in the response.
-```bash
-gdev-cli jira search --json --max 100 \
-  -f key,summary,creator,created,status,labels,components,assignee \
-  -jql "project = GR AND component = Python AND creator = olauto AND labels = periodic-job-failures AND created >= -14d AND status != Closed AND status != 'In Progress' ORDER BY created DESC"
-```
-
-3. Fetch shortlisted issue details with `get-issue`:
-```bash
-gdev-cli jira get-issue --json -id GR-XXXX \
-  | jq '{key, summary:.fields.summary, status:.fields.status.name, created:.fields.created, labels:.fields.labels, assignee:(.fields.assignee.name // null), description:.fields.description, comments:(.fields.comment.comments | map({author:.author.name, created, body}))}'
-```
-
-7. Convert findings into an implementation-ready plan per issue:
-- Extract failing job name, error signature, and log clue.
-- Map probable source area in repo.
-- Propose first verification command.
-- Define exit criteria to close ticket.
-- Prepare temporary git worktree per issue with branch naming based on Jira key plus very short hyphenated description.
-
-## Output Contract For Periodic Triage
-Return exactly:
-1. Query scope used (component, creator, time window, status filter).
-2. Count summary (total recent automation issues vs periodic failures).
-3. Issue list with key, created date, summary, status.
-4. Per-issue plan with:
-- Hypothesis
-- First code locations to inspect
-- First reproducibility command
-- Exit criteria for closing ticket
-5. Recommended implementation order.
-
-## Guardrails
-- State concrete dates for recency windows.
-- Prefer `--json` and explicit `-f` fields in searches.
-- Use `get-issue` only for shortlisted issues to keep output small.
diff --git a/.agents/skills/rota-check-periodic-jobs/SKILL.md b/.agents/skills/rota-check-periodic-jobs/SKILL.md
new file mode 100644
index 0000000000..5a80398649
--- /dev/null
+++ b/.agents/skills/rota-check-periodic-jobs/SKILL.md
@@ -0,0 +1,53 @@
+---
+name: rota-check-periodic-jobs
+description: Analyze recent GraalPy periodic job failure Jira tickets for ROTA. Use when asked to triage, summarize, or plan work for recent periodic-job-failures issues, including date-bounded Jira searches with gdev-cli, issue detail inspection, hypotheses, reproduction commands, and implementation order.
+---
+
+# ROTA Periodic Job Check
+
+## Overview
+Triage recent GraalPy periodic job failure Jira tickets and produce implementation-ready plans.
+
+## Workflow
+1. Verify creator identity mapping:
+- Treat `ol-automation_ww` as Jira username `olauto`.
+- If a query returns zero results unexpectedly, test both identities, then keep `creator = olauto` once verified.
+
+2. Filter to recent periodic job failures, excluding in-progress or closed issues:
+- Default to the last 14 days unless the user specifies otherwise.
+- Always state concrete start and end calendar dates in the response.
+```bash
+gdev-cli jira search --json --max 100 \
+  -f key,summary,creator,created,status,labels,components,assignee \
+  -jql "project = GR AND component = Python AND creator = olauto AND labels = periodic-job-failures AND created >= -14d AND status != Closed AND status != 'In Progress' ORDER BY created DESC"
+```
+
+3. Fetch shortlisted issue details with `get-issue`:
+```bash
+gdev-cli jira get-issue --json -id GR-XXXX \
+  | jq '{key, summary:.fields.summary, status:.fields.status.name, created:.fields.created, labels:.fields.labels, assignee:(.fields.assignee.name // null), description:.fields.description, comments:(.fields.comment.comments | map({author:.author.name, created, body}))}'
+```
+
+4. Convert findings into an implementation-ready plan per issue:
+- Extract failing job name, error signature, and log clue.
+- Map probable source area in repo.
+- Propose the first verification command.
+- Define exit criteria to close the ticket.
+- Prepare a temporary git worktree per issue with branch naming based on Jira key plus a very short hyphenated description.
+
+## Output Contract
+Return exactly:
+1. Query scope used: component, creator, time window, status filter.
+2. Count summary: total recent automation issues vs periodic failures.
+3. Issue list with key, created date, summary, and status.
+4. Per-issue plan with:
+- Hypothesis
+- First code locations to inspect
+- First reproducibility command
+- Exit criteria for closing ticket
+5. Recommended implementation order.
+
+## Guardrails
+- State concrete dates for recency windows.
+- Prefer `--json` and explicit `-f` fields in searches.
+- Use `get-issue` only for shortlisted issues to keep output small.
diff --git a/.agents/skills/rota-update-import/SKILL.md b/.agents/skills/rota-update-import/SKILL.md
new file mode 100644
index 0000000000..ca4a46e601
--- /dev/null
+++ b/.agents/skills/rota-update-import/SKILL.md
@@ -0,0 +1,34 @@
+---
+name: rota-update-import
+description: Run the GraalPy ROTA import update workflow. Use when asked to refresh imports, create the standard Graal import update pull request, inspect generated commits, and hand off to the shared GraalPython Bitbucket PR flow for tasks, gates, and failure fixes.
+---
+
+# ROTA Import Update
+
+## Overview
+Execute the GraalPy ROTA import update workflow using the repo's automated command, then use the shared GraalPython Bitbucket PR workflow for post-creation tasks and gates.
+
+## Companion Skills
+- Use the `graalpython-bitbucket-pr` skill after `mx python-update-import --rota` creates the PR, to handle Graal Bot tasks, start or verify gates, watch gates, and fix or report failures.
+
+## Workflow
+1. Run the automated branch setup, import update, GitHub unittest-tag refresh, enterprise unittest-tag refresh, push, and standard PR creation:
+```bash
+mx python-update-import --rota
+```
+
+2. If the command reports that `../graal-enterprise/graalpython-enterprise` is missing, stop and ask the user to provide that checkout.
+
+3. Inspect the two generated commits and the created PR for plausibility:
+- Expect one import update commit.
+- Expect one combined unittest-tag update commit.
+- Expect mostly additions, not removals, in the combined unittest-tag commit.
+
+4. Use the `graalpython-bitbucket-pr` skill in existing-PR mode with the PR created by `mx python-update-import --rota`.
+
+5. Let that skill handle Graal Bot tasks, start or verify gates, watch gates, and fix or report failures.
+
+## Guardrails
+- Use `mx python-update-import --rota`; do not manually reconstruct the standard ROTA sequence unless the command fails and the failure requires targeted recovery.
+- Preserve the automated commit structure unless there is a concrete reason to amend it.
+- When reporting status, include the branch name, PR link or ID, gate status, and any follow-up failures.

From 8c7a147ad5881e409c96317198deca3f65eb2f2e Mon Sep 17 00:00:00 2001
From: Michael Simacek
Date: Wed, 29 Apr 2026 13:26:42 +0200
Subject: [PATCH 6/7] Add a script to fetch periodic failures and update the
 skill to use it

---
 .../skills/rota-check-periodic-jobs/SKILL.md |  71 +++---
 scripts/rota_ci_failures.py                  | 228 ++++++++++++++++++
 2 files changed, 268 insertions(+), 31 deletions(-)
 create mode 100755 scripts/rota_ci_failures.py

diff --git a/.agents/skills/rota-check-periodic-jobs/SKILL.md b/.agents/skills/rota-check-periodic-jobs/SKILL.md
index 5a80398649..c4d18c3975 100644
--- a/.agents/skills/rota-check-periodic-jobs/SKILL.md
+++ b/.agents/skills/rota-check-periodic-jobs/SKILL.md
@@ -1,53 +1,62 @@
 ---
 name: rota-check-periodic-jobs
-description: Analyze recent GraalPy periodic job failure Jira tickets for ROTA. Use when asked to triage, summarize, or plan work for recent periodic-job-failures issues, including date-bounded Jira searches with gdev-cli, issue detail inspection, hypotheses, reproduction commands, and implementation order.
+description: Analyze current GraalPy periodic job failures for ROTA. Use when asked to triage, summarize, or plan work for current periodic job failures, starting from scripts/rota_ci_failures.py output, validating linked Jira issues, inspecting logs, forming hypotheses and reproduction commands, and recommending an implementation order.
 ---
 
 # ROTA Periodic Job Check
 
 ## Overview
-Triage recent GraalPy periodic job failure Jira tickets and produce implementation-ready plans.
+Triage current GraalPy periodic job failures and produce implementation-ready plans.
 
 ## Workflow
-1. Verify creator identity mapping:
-- Treat `ol-automation_ww` as Jira username `olauto`.
-- If a query returns zero results unexpectedly, test both identities, then keep `creator = olauto` once verified.
-
-2. Filter to recent periodic job failures, excluding in-progress or closed issues:
-- Default to the last 14 days unless the user specifies otherwise.
-- Always state concrete start and end calendar dates in the response.
+1. Verify dashboard environment and run the periodic failure collector:
+- This workflow starts from `scripts/rota_ci_failures.py`, not from a Jira search.
+- The script requires `OTDASHBOARD_URL` and `OTDASHBOARD_TOKEN`.
+- If either variable is missing, stop and ask the user to set the missing variable(s). Do not fall back to querying Jira for the failure list.
+- Run from the repository root (a sample run with hypothetical output is sketched at the end of this patch series):
 ```bash
-gdev-cli jira search --json --max 100 \
-  -f key,summary,creator,created,status,labels,components,assignee \
-  -jql "project = GR AND component = Python AND creator = olauto AND labels = periodic-job-failures AND created >= -14d AND status != Closed AND status != 'In Progress' ORDER BY created DESC"
+scripts/rota_ci_failures.py
 ```
 
-3. Fetch shortlisted issue details with `get-issue`:
+2. Parse the script output:
+- If it reports no failed jobs, report that there are no current failed periodic jobs and stop.
+- For each failed row, capture target, job name, last successful run, Jira ID(s), and log URL.
+- If a failed row has no Jira ID, flag it in the report and continue with log analysis.
+
+3. Validate every reported Jira issue:
+- Fetch each Jira issue linked by the script output (a `jq` projection covering these checks is sketched after this skill's guardrails):
+```bash
+gdev-cli jira get-issue --json -id GR-XXXX
+```
+- Check that the Jira matches the current failure:
+  - The issue summary or description should identify the same error signature/root cause from the current log.
+- Check that the Jira is not too broad:
+  - A generic timeout/build-failure ticket is acceptable only if it names this job or an intentionally scoped equivalent set of jobs.
+  - A ticket covering unrelated jobs, unrelated targets, or unrelated error signatures is too broad.
+- Check that the Jira has component `Python`.
+- Notify the user about every Jira that fails any of these checks. Include the Jira key and the failed check(s).
+
+4. Inspect failed job logs:
+- Use the `log URL` from the script output. For Buildbot URLs, fetch the executor log with:
 ```bash
-gdev-cli jira get-issue --json -id GR-XXXX \
-  | jq '{key, summary:.fields.summary, status:.fields.status.name, created:.fields.created, labels:.fields.labels, assignee:(.fields.assignee.name // null), description:.fields.description, comments:(.fields.comment.comments | map({author:.author.name, created, body}))}'
+gdev-cli buildbot get-log BUILD_ID
 ```
+- Use `gdev-cli buildbot rca --build BUILD_ID --wait` when useful, but still inspect the relevant raw log lines.
+- Identify the exact failing command, error signature, first meaningful failure, and whether later errors are cleanup fallout.
 
-4. Convert findings into an implementation-ready plan per issue:
+5. Convert findings into an implementation-ready plan per failure:
 - Extract failing job name, error signature, and log clue.
 - Map probable source area in repo.
 - Propose the first verification command.
-- Define exit criteria to close the ticket.
-- Prepare a temporary git worktree per issue with branch naming based on Jira key plus a very short hyphenated description.
+- Define exit criteria to close or update the linked ticket.
 
 ## Output Contract
-Return exactly:
-1. Query scope used: component, creator, time window, status filter.
-2. Count summary: total recent automation issues vs periodic failures.
-3. Issue list with key, created date, summary, and status.
-4. Per-issue plan with:
-- Hypothesis
-- First code locations to inspect
-- First reproducibility command
-- Exit criteria for closing ticket
-5. Recommended implementation order.
+Group the output by Jira issue. For each, report:
+- The issue summary, status, and assignee.
+- The failed jobs in a table with job name, last successful run, and log URL.
+- The analysis of the failure and the proposed plan.
 
 ## Guardrails
-- State concrete dates for recency windows.
-- Prefer `--json` and explicit `-f` fields in searches.
-- Use `get-issue` only for shortlisted issues to keep output small.
+- If `OTDASHBOARD_URL` or `OTDASHBOARD_TOKEN` is missing, ask the user to set the missing variable(s). Do not try to set them yourself.
+- Do not echo the `OTDASHBOARD_TOKEN` variable and do not leak it anywhere.
+- Prefer `--json` for Jira issue fetches.
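For the step-3 validation above, it can help to project only the fields the checks need out of the `get-issue` output. The sketch below is illustrative and not part of the patch: it assumes the standard Jira REST field layout that the other `jq` filters in these skills already rely on, and the `.fields.components[].name` path in particular is an assumption about the `gdev-cli` JSON shape, not a verified part of its output.

```bash
# Illustrative only: pull the fields used by the step-3 checks (summary and
# description for the error signature, components for the `Python` check).
# Assumes the usual Jira REST layout; `.fields.components[].name` is a guess.
gdev-cli jira get-issue --json -id GR-XXXX \
  | jq '{key,
         summary: .fields.summary,
         status: .fields.status.name,
         assignee: (.fields.assignee.name // null),
         components: [.fields.components[].name],
         description: .fields.description}'
```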
diff --git a/scripts/rota_ci_failures.py b/scripts/rota_ci_failures.py
new file mode 100755
index 0000000000..f07bb448f5
--- /dev/null
+++ b/scripts/rota_ci_failures.py
@@ -0,0 +1,228 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# The Universal Permissive License (UPL), Version 1.0
+#
+# Subject to the condition set forth below, permission is hereby granted to any
+# person obtaining a copy of this software, associated documentation and/or
+# data (collectively the "Software"), free of charge and under any and all
+# copyright rights in the Software, and any and all patent rights owned or
+# freely licensable by each licensor hereunder covering either (i) the
+# unmodified Software as contributed to or provided by such licensor, or (ii)
+# the Larger Works (as defined below), to deal in both
+#
+# (a) the Software, and
+#
+# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
+# one is included with the Software each a "Larger Work" to which the Software
+# is contributed by such licensors),
+#
+# without restriction, including without limitation the rights to copy, create
+# derivative works of, display, perform, and distribute the Software and make,
+# use, sell, offer for sale, import, export, have made, and have sold the
+# Software and the Larger Work(s), and to sublicense the foregoing rights on
+# either these or other terms.
+#
+# This license is subject to the following condition:
+#
+# The above copyright notice and either this complete permission notice or at a
+# minimum a reference to the UPL must be included in all copies or substantial
+# portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+import urllib.error
+import urllib.parse
+import urllib.request
+from collections.abc import Iterable
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from typing import Any
+
+API_PATH = "/api/periodic-jobs/GraalVM"
+REPO = "graalpython"
+BRANCH = "master"
+TARGETS = ["post-merge", "daily", "weekly", "monthly"]
+
+
+@dataclass(frozen=True)
+class FailureRow:
+    target: str
+    job_name: str
+    last_successful_run: str
+    jira_ids: str
+    log_url: str
+
+
+class DashboardError(RuntimeError):
+    pass
+
+
+def get_dashboard_token() -> str:
+    token = os.environ.get("OTDASHBOARD_TOKEN")
+    if token:
+        return token
+    raise DashboardError("OTDASHBOARD_TOKEN environment variable is not set.")
+
+
+def get_api_base() -> str:
+    dashboard_url = os.environ.get("OTDASHBOARD_URL")
+    if dashboard_url:
+        return f"{dashboard_url.rstrip('/')}{API_PATH}"
+    raise DashboardError("OTDASHBOARD_URL environment variable is not set.")
+
+
+def fetch_json(method: str, url: str, token: str, payload: dict[str, Any] | None = None) -> Any:
+    data = None
+    headers = {
+        "Accept": "application/json",
+        "Authorization": f"Bearer {token}",
+    }
+    if payload is not None:
+        data = json.dumps(payload).encode("utf-8")
+        headers["Content-Type"] = "application/json"
+    request = urllib.request.Request(url, data=data, headers=headers, method=method)
+    try:
+        with urllib.request.urlopen(request, timeout=30) as response:
+            return json.load(response)
+    except urllib.error.HTTPError as exc:
+        body = exc.read().decode("utf-8", "replace").strip()
+        message = body or exc.reason
+        raise DashboardError(f"Dashboard API request failed with HTTP {exc.code}: {message}") from exc
+    except urllib.error.URLError as exc:
+        raise DashboardError(f"Could not reach dashboard API: {exc.reason}") from exc
+    except json.JSONDecodeError as exc:
+        raise DashboardError(f"Dashboard API returned invalid JSON for {url}") from exc
+
+
+def get_latest_runs(api_base: str, token: str) -> dict[str, dict[str, Any]]:
+    payload = {
+        "repo": REPO,
+        "branch": BRANCH,
+        "targets": TARGETS,
+    }
+    result = fetch_json("POST", f"{api_base}/latest", token, payload)
+    if not isinstance(result, dict):
+        raise DashboardError("Dashboard API returned an unexpected payload for the latest runs.")
+
+    latest_runs: dict[str, dict[str, Any]] = {}
+    for target in TARGETS:
+        runs = result.get(target)
+        if isinstance(runs, list) and runs and isinstance(runs[0], dict):
+            latest_runs[target] = runs[0]
+    return latest_runs
+
+
+def get_failed_jobs(api_base: str, token: str, run_id: str, target: str) -> list[dict[str, Any]]:
+    params = urllib.parse.urlencode({
+        "id": run_id,
+        "status": "failed",
+        "target": target,
+    })
+    url = f"{api_base}/jobs?{params}"
+    result = fetch_json("GET", url, token)
+    if not isinstance(result, list):
+        raise DashboardError(f"Dashboard API returned an unexpected payload for failed jobs in target {target}.")
+    return [job for job in result if isinstance(job, dict)]
+
+
+def format_timestamp_ms(timestamp_ms: Any) -> str:
+    if not isinstance(timestamp_ms, int | float):
+        return "-"
+    return datetime.fromtimestamp(timestamp_ms / 1000, tz=UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
+
+
+def format_jira_ids(tickets: Any) -> str:
+    if not isinstance(tickets, Iterable) or isinstance(tickets, str | bytes):
+        return "-"
+    ticket_ids: list[str] = []
+    for ticket in tickets:
+        if isinstance(ticket, dict):
+            ticket_id = ticket.get("ticketId")
+            if isinstance(ticket_id, str) and ticket_id not in ticket_ids:
+                ticket_ids.append(ticket_id)
+    return ", ".join(ticket_ids) if ticket_ids else "-"
+
+
+def get_nested_timestamp(job: dict[str, Any]) -> Any:
+    last_successful = job.get("lastSuccessful")
+    if isinstance(last_successful, dict):
+        return last_successful.get("run")
+    return None
+
+
+def get_string(value: Any) -> str:
+    return value if isinstance(value, str) and value else "-"
+
+
+def build_rows(api_base: str, token: str) -> list[FailureRow]:
+    rows: list[FailureRow] = []
+    latest_runs = get_latest_runs(api_base, token)
+    for target in TARGETS:
+        latest_run = latest_runs.get(target)
+        if not latest_run:
+            continue
+        failed_count = latest_run.get("failed", 0)
+        run_id = latest_run.get("id")
+        if not isinstance(failed_count, int) or failed_count <= 0 or not isinstance(run_id, str):
+            continue
+
+        for job in get_failed_jobs(api_base, token, run_id, target):
+            rows.append(FailureRow(
+                target=target,
+                job_name=get_string(job.get("jobName")),
+                last_successful_run=format_timestamp_ms(get_nested_timestamp(job)),
+                jira_ids=format_jira_ids(job.get("tickets")),
+                log_url=get_string(job.get("url")),
+            ))
+
+    return rows
+
+
+def print_table(rows: list[FailureRow]) -> None:
+    if not rows:
+        print("No failed jobs found in the latest periodic CI runs.")
+        return
+
+    headers = ("target", "job name", "last successful run", "jira ID(s)", "log URL")
+    values = [headers]
+    values.extend((row.target, row.job_name, row.last_successful_run, row.jira_ids, row.log_url) for row in rows)
+    widths = [max(len(str(row[column])) for row in values) for column in range(len(headers))]
+
+    def render(columns: tuple[str, ...]) -> str:
+        return " | ".join(value.ljust(widths[index]) for index, value in enumerate(columns))
+
+    separator = "-+-".join("-" * width for width in widths)
+    print(render(headers))
+    print(separator)
+    for row in rows:
+        print(render((row.target, row.job_name, row.last_successful_run, row.jira_ids, row.log_url)))
+
+
+def main() -> int:
+    try:
+        api_base = get_api_base()
+        token = get_dashboard_token()
+        rows = build_rows(api_base, token)
+        print_table(rows)
+    except DashboardError as exc:
+        print(f"error: {exc}", file=sys.stderr)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From ba1f919cb176795782bbbb91e1190f3a5d1b5032 Mon Sep 17 00:00:00 2001
From: Michael Simacek
Date: Wed, 29 Apr 2026 17:48:32 +0200
Subject: [PATCH 7/7] Untag transient

---
 .../src/tests/unittest_tags/test_time.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/graalpython/com.oracle.graal.python.test/src/tests/unittest_tags/test_time.txt b/graalpython/com.oracle.graal.python.test/src/tests/unittest_tags/test_time.txt
index 8ce43f8e67..fdb4946c75 100644
--- a/graalpython/com.oracle.graal.python.test/src/tests/unittest_tags/test_time.txt
+++ b/graalpython/com.oracle.graal.python.test/src/tests/unittest_tags/test_time.txt
@@ -2,7 +2,8 @@ test.test_time.TestAsctime4dyear.test_large_year @ darwin-arm64,linux-aarch64,li
 test.test_time.TestAsctime4dyear.test_negative @ darwin-arm64,linux-aarch64,linux-aarch64-github,linux-x86_64,linux-x86_64-github,win32-AMD64,win32-AMD64-github
 test.test_time.TestAsctime4dyear.test_year @ darwin-arm64,linux-aarch64,linux-aarch64-github,linux-x86_64,linux-x86_64-github,win32-AMD64,win32-AMD64-github
 test.test_time.TestLocale.test_bug_3061 @ darwin-arm64,linux-aarch64,linux-aarch64-github,linux-x86_64,linux-x86_64-github,win32-AMD64,win32-AMD64-github
-test.test_time.TestPytime.test_localtime_timezone @ darwin-arm64,linux-aarch64,linux-aarch64-github,linux-x86_64,linux-x86_64-github,win32-AMD64,win32-AMD64-github
+# Can fail in specific timezones in CI
+!test.test_time.TestPytime.test_localtime_timezone, at line 757 with AssertionError: 3600 != 0
 test.test_time.TestPytime.test_short_times @ darwin-arm64,linux-aarch64,linux-aarch64-github,linux-x86_64,linux-x86_64-github,win32-AMD64,win32-AMD64-github
 test.test_time.TestPytime.test_strptime_timezone @ darwin-arm64,linux-aarch64,linux-aarch64-github,linux-x86_64,linux-x86_64-github,win32-AMD64,win32-AMD64-github
 test.test_time.TestStrftime4dyear.test_large_year @ darwin-arm64,linux-aarch64,linux-aarch64-github,linux-x86_64,linux-x86_64-github,win32-AMD64,win32-AMD64-github
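As a closing illustration of how the pieces above fit together, this is the sample run referenced from the `rota-check-periodic-jobs` skill: the collector added in PATCH 6/7 prints an aligned text table with one row per failed job in the latest run of each target. The run below is a sketch only; the URL, the token handling, and the job, ticket, timestamp, and log values are invented placeholders, while the column headers and the timestamp format come straight from `print_table()` and `format_timestamp_ms()`.

```bash
# Hypothetical run of scripts/rota_ci_failures.py from PATCH 6/7. The exported
# values and the output row are made up; only the table shape reflects the code.
export OTDASHBOARD_URL="https://dashboard.example.com"  # placeholder URL
export OTDASHBOARD_TOKEN="<token>"                      # keep private; never echo it
scripts/rota_ci_failures.py
# target | job name               | last successful run     | jira ID(s) | log URL
# -------+------------------------+-------------------------+------------+-------------------------------
# daily  | example-unittest-linux | 2026-04-20 03:15:42 UTC | GR-12345   | https://ci.example.com/b/1234
```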