diff --git a/.github/workflows/analyze-bot-failures.yaml b/.github/workflows/analyze-bot-failures.yaml new file mode 100644 index 00000000..0ec9d715 --- /dev/null +++ b/.github/workflows/analyze-bot-failures.yaml @@ -0,0 +1,80 @@ +name: Analyze Bot Forecasting Failures + +on: + workflow_dispatch: + inputs: + since: + description: "Time window to analyze (e.g. 12h, 1d, 2d, 1w)" + required: false + default: "1d" + skip_agent: + description: "Skip the cursor-agent investigation and send the raw report only" + type: boolean + required: false + default: false + schedule: + - cron: "0 14 * * *" # daily at 14:00 UTC + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +permissions: + contents: read + actions: read + +jobs: + analyze_failures: + runs-on: ubuntu-latest + timeout-minutes: 40 + steps: + - name: Check out repository + uses: actions/checkout@v5 + + - id: setup-python + uses: actions/setup-python@v6 + with: + python-version: "3.11" + + - name: Install poetry + uses: snok/install-poetry@v1 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Load cached venv + uses: actions/cache@v5 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} + + - name: Install dependencies + run: poetry install --no-interaction --no-root + + - name: Install Cursor CLI + run: | + curl https://cursor.com/install -fsS | bash + echo "$HOME/.local/bin" >> $GITHUB_PATH + echo "$HOME/.cursor/bin" >> $GITHUB_PATH + + - name: Run scheduled failure analysis + env: + GITHUB_TOKEN: ${{ github.token }} + CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }} + CURSOR_AGENT_MODEL: ${{ vars.CURSOR_AGENT_MODEL || 'auto' }} + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + SLACK_CHANNEL_ID: ${{ secrets.SLACK_CHANNEL_ID }} + run: | + poetry run python scripts/skills/analyze_bot_failures/run_scheduled_analysis.py \ + --since "${{ inputs.since || '1d' }}" 
\ + ${{ (inputs.skip_agent == true || inputs.skip_agent == 'true') && '--skip-agent' || '' }} + + - name: Upload analysis artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: failure-analysis-${{ github.run_id }} + path: logs/workflow_failure_analysis/** + if-no-files-found: ignore + retention-days: 14 diff --git a/scripts/skills/analyze_bot_failures/SKILL.md b/scripts/skills/analyze_bot_failures/SKILL.md index caf964b7..bc8a85cf 100644 --- a/scripts/skills/analyze_bot_failures/SKILL.md +++ b/scripts/skills/analyze_bot_failures/SKILL.md @@ -7,18 +7,36 @@ description: Analyze failures from the Metaculus bot forecasting GitHub Actions Workflow for diagnosing failures in the `run-bot-aib-tournament.yaml` GitHub Actions workflow, which runs ~30 bot jobs every 30 minutes via `run_bots.py`. +## Running this automatically (cron + Slack report) + +This skill is wired up to run unattended on a daily schedule via the +`Analyze Bot Forecasting Failures` GitHub Actions workflow +(`.github/workflows/analyze-bot-failures.yaml`). That workflow calls +`scripts/skills/analyze_bot_failures/run_scheduled_analysis.py`, which runs the +parser below, hands the report to a headless Cursor agent that performs steps +2-5, and posts the agent's prioritized summary to Slack. + +Required repo secret: `CURSOR_API_KEY` (for the agent). Optional: `SLACK_BOT_TOKEN` +and `SLACK_CHANNEL_ID` for Slack delivery. Optional repo variable `CURSOR_AGENT_MODEL` +(defaults to `auto`). The default `GITHUB_TOKEN` already has the `actions: read` scope +the parser needs. + +The same orchestrator script can be run from a local crontab if preferred (set +`GITHUB_TOKEN`, `CURSOR_API_KEY`, `SLACK_BOT_TOKEN`, and `SLACK_CHANNEL_ID` in the environment). +The steps below describe the manual/interactive version of the workflow. + ## Step 1: Pull and aggregate the failure logs A GitHub token is required (the job-log API rejects unauthenticated requests). 
Check `GITHUB_TOKEN` env var or `gh auth token`; if neither works, ask the user to run `gh auth login`. ```bash -poetry run python scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py --since 1d +poetry run python scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py ``` Useful options: -- `--since 12h|2d|1w` or an ISO datetime (default `1d`) +- `--since 12h|2d|1w|4w` or an ISO datetime (default `3d`). Any reasonable period works; the whole window is analyzed by default (note the workflow runs every 30 min, so wide windows pull many runs and take longer). - `--run-id ` to analyze one specific run -- `--max-runs ` cap on runs fetched (default 50) +- `--max-runs ` optional cap on runs fetched (default: no cap, so the full `--since` window is covered). A warning is logged if the cap truncates the window. Output goes to `logs/workflow_failure_analysis//`: - `report.md` — counts by category/bot/question, plus failure groups (deduped by normalized signature) with an example message, traceback, and the deepest repo code frame diff --git a/scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py b/scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py index 65670038..b90ba0cf 100644 --- a/scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py +++ b/scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py @@ -83,7 +83,7 @@ ], } -QUESTION_URL_PATTERN = re.compile(r"https://www\.metaculus\.com/questions/\d+/") +QUESTION_URL_PATTERN = re.compile(r"https://www\.metaculus\.com/questions/\d+/?") GH_TIMESTAMP_PREFIX_PATTERN = re.compile( r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z\s?" 
) @@ -181,11 +181,11 @@ def parse_since_to_datetime(since: str) -> datetime: def list_workflow_runs( - repo: str, workflow: str, token: str, since: datetime, max_runs: int + repo: str, workflow: str, token: str, since: datetime, max_runs: int | None ) -> list[dict]: runs: list[dict] = [] page = 1 - while len(runs) < max_runs: + while max_runs is None or len(runs) < max_runs: data = github_get( f"/repos/{repo}/actions/workflows/{workflow}/runs", token, @@ -201,7 +201,14 @@ def list_workflow_runs( break runs.extend(page_runs) page += 1 - return runs[:max_runs] + if max_runs is not None and len(runs) > max_runs: + logger.warning( + f"--max-runs={max_runs} truncated the {len(runs)} runs found in the " + f"--since window; some of the time period is not covered. Raise or drop " + f"--max-runs to analyze the full window." + ) + return runs[:max_runs] + return runs def list_failed_jobs(repo: str, run_id: int, token: str) -> list[dict]: @@ -466,13 +473,64 @@ def build_failure_group_section(group: FailureGroup) -> list[str]: return section_lines -def build_report(job_analyses: list[JobAnalysis], output_dir: Path) -> str: +def build_question_failure_section(all_events: list[FailureEvent]) -> list[str]: + events_with_question = [event for event in all_events if event.question_url] + if not events_with_question: + return [] + + stats_by_question: dict[str, dict] = {} + for event in events_with_question: + question_stats = stats_by_question.setdefault( + event.question_url, + {"total": 0, "run_ids": set(), "bot_names": set(), "categories": {}}, + ) + question_stats["total"] += 1 + question_stats["run_ids"].add(event.run_id) + question_stats["bot_names"].add(event.bot_name) + for category in event.categories: + question_stats["categories"][category] = ( + question_stats["categories"].get(category, 0) + 1 + ) + + questions_ranked_by_consistency = sorted( + stats_by_question.items(), + key=lambda item: (-len(item[1]["run_ids"]), -item[1]["total"]), + ) + + section_lines = [ + "\n## 
Failures by question (most distinct runs first)\n", + "A question failing across many *distinct runs* is far more likely to be " + "genuinely broken than one that failed many times within a single run. " + "Questions recurring across several runs are candidates for " + "`POST_IDS_TO_SKIP` or `POST_IDS_TO_NOT_RAISE_ERRORS_FOR` in `run_bots.py`.\n", + "| Question | Failures | Distinct runs | Distinct bots | Top categories |", + "| --- | --- | --- | --- | --- |", + ] + for question_url, question_stats in questions_ranked_by_consistency: + top_categories = ", ".join( + f"{category} ({count})" + for category, count in sorted( + question_stats["categories"].items(), key=lambda pair: -pair[1] + )[:3] + ) + section_lines.append( + f"| {question_url} | {question_stats['total']} | " + f"{len(question_stats['run_ids'])} | {len(question_stats['bot_names'])} | " + f"{top_categories} |" + ) + return section_lines + + +def build_report( + job_analyses: list[JobAnalysis], output_dir: Path, time_window_str: str +) -> str: all_events = [event for analysis in job_analyses for event in analysis.events] groups = group_failures(all_events) report_lines = [ "# Bot Workflow Failure Report", f"\nGenerated: {datetime.now(timezone.utc).isoformat()}", + f"Time window: {time_window_str}", f"\nFailed jobs analyzed: {len(job_analyses)}", f"Individual failures parsed: {len(all_events)}", f"Unique failure signatures: {len(groups)}", @@ -488,14 +546,7 @@ def build_report(job_analyses: list[JobAnalysis], output_dir: Path) -> str: for bot_name, count in count_by([event.bot_name for event in all_events]): report_lines.append(f"- {bot_name}: {count}") - question_urls = [event.question_url for event in all_events if event.question_url] - if question_urls: - report_lines.append( - "\n## Questions appearing in failures " - "(recurring ones are POST_IDS_TO_SKIP candidates)\n" - ) - for question_url, count in count_by(question_urls): - report_lines.append(f"- {question_url} : {count}") + 
report_lines.extend(build_question_failure_section(all_events)) report_lines.append("\n## Failure groups (most frequent first)\n") for group in groups: @@ -587,13 +638,14 @@ def analyze_runs( repo: str, workflow: str, since: str, - max_runs: int, + max_runs: int | None, output_dir: Path, run_id: int | None = None, ) -> str: token = resolve_github_token() if run_id is not None: runs = [github_get(f"/repos/{repo}/actions/runs/{run_id}", token)] + time_window_str = f"Run {run_id}" else: since_datetime = parse_since_to_datetime(since) all_runs = list_workflow_runs(repo, workflow, token, since_datetime, max_runs) @@ -603,6 +655,7 @@ def analyze_runs( if run.get("conclusion") not in ("success", None) or run.get("status") != "completed" ] + time_window_str = f"Since {since_datetime.isoformat()}" logger.info( f"Found {len(all_runs)} runs since {since_datetime.isoformat()}, " f"{len(runs)} with failures" @@ -621,7 +674,7 @@ def analyze_runs( if not job_analyses: logger.info("No failed jobs found in the selected window.") - report_path = build_report(job_analyses, analysis_dir) + report_path = build_report(job_analyses, analysis_dir, time_window_str) logger.info(f"Report written to {report_path}") logger.info(f"Raw logs saved under {raw_logs_dir}") return report_path @@ -634,15 +687,28 @@ def main() -> None: ) parser.add_argument( "--since", - default="1d", - help="Time window like 12h, 2d, 1w, or an ISO datetime (default: 1d)", + default="3d", + help=( + "Time window to analyze. Accepts h / d / w (e.g. 12h, 2d, 1w, " + "4w) or an ISO datetime. 
The whole window is analyzed by default " + "(default: 3d)" + ), ) parser.add_argument( "--run-id", type=int, default=None, help="Analyze a single specific run id" ) parser.add_argument("--repo", default=DEFAULT_REPO) parser.add_argument("--workflow", default=DEFAULT_WORKFLOW) - parser.add_argument("--max-runs", type=int, default=50) + parser.add_argument( + "--max-runs", + type=int, + default=None, + help=( + "Optional cap on the number of workflow runs fetched. Default is no cap, " + "so the full --since window is analyzed. Only set this to limit work on " + "very large windows (a warning is logged if it truncates the window)." + ), + ) parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR) args = parser.parse_args() diff --git a/scripts/skills/analyze_bot_failures/run_scheduled_analysis.py b/scripts/skills/analyze_bot_failures/run_scheduled_analysis.py new file mode 100644 index 00000000..e862f0a3 --- /dev/null +++ b/scripts/skills/analyze_bot_failures/run_scheduled_analysis.py @@ -0,0 +1,277 @@ +""" +Scheduled, agent-driven version of the analyze-bot-failures skill. + +This orchestrates the whole skill end to end so it can run unattended on a +cron (GitHub Actions or a local crontab): + +1. Runs the deterministic failure-log parser (``analyze_bot_run_failures.py``) + to produce ``report.md`` / ``failures.json`` / raw logs. +2. Hands the report to a headless Cursor agent (``cursor-agent``) which follows + the investigation/triage steps of the skill and writes a prioritized + markdown summary. +3. Delivers that summary to Slack using a bot token, and + always prints it to stdout so it shows up in the job logs. + +Environment variables: + GITHUB_TOKEN / GH_TOKEN token for downloading job logs (required) + CURSOR_API_KEY auth for the headless cursor-agent (required for the agent step) + CURSOR_AGENT_MODEL model slug for cursor-agent (default: "auto") + SLACK_BOT_TOKEN Slack bot token (xoxb-...) 
# Environment-variable reference, continued from the module docstring:
#     SLACK_BOT_TOKEN       Slack bot token (xoxb-...) to post the report
#     SLACK_CHANNEL_ID      Slack channel ID to post the report to
#     GITHUB_SERVER_URL/GITHUB_REPOSITORY/GITHUB_RUN_ID
#                           used to link back to the triggering Actions run
#
# Example:
#     poetry run python scripts/skills/analyze_bot_failures/run_scheduled_analysis.py --since 1d

import argparse
import json
import logging
import os
import signal
import subprocess
import sys
import urllib.request
from pathlib import Path

logger = logging.getLogger(__name__)

SCRIPT_DIR = Path(__file__).resolve().parent
ANALYSIS_SCRIPT = SCRIPT_DIR / "analyze_bot_run_failures.py"
SKILL_FILE = SCRIPT_DIR / "SKILL.md"
DEFAULT_OUTPUT_DIR = Path("logs/workflow_failure_analysis")
AGENT_REPORT_FILENAME = "agent_report.md"
SLACK_MESSAGE_CHAR_LIMIT = 3500


def run_failure_log_analysis(since: str, output_dir: Path) -> None:
    """Run the deterministic failure-log parser as a child process.

    Args:
        since: Time-window string forwarded verbatim as ``--since``.
        output_dir: Directory forwarded verbatim as ``--output-dir``.

    Raises:
        subprocess.CalledProcessError: If the parser exits with a non-zero
            status (``check=True``).
    """
    logger.info("Running failure-log parser (since=%s)", since)
    subprocess.run(
        [
            sys.executable,
            str(ANALYSIS_SCRIPT),
            "--since",
            since,
            "--output-dir",
            str(output_dir),
        ],
        check=True,
    )


def find_latest_analysis_dir(output_dir: Path) -> Path:
    """Return the sub-directory of *output_dir* whose name sorts last.

    NOTE(review): this assumes the parser names each run directory with a
    lexicographically sortable timestamp -- confirm against the parser.

    Raises:
        RuntimeError: If *output_dir* is missing or has no sub-directories.
    """
    # ``iterdir()`` on a missing path would raise a bare FileNotFoundError;
    # report both "missing" and "empty" through the same explicit error.
    if not output_dir.is_dir():
        raise RuntimeError(f"No analysis output found under {output_dir}")
    timestamped_dirs = [path for path in output_dir.iterdir() if path.is_dir()]
    if not timestamped_dirs:
        raise RuntimeError(f"No analysis output found under {output_dir}")
    return max(timestamped_dirs, key=lambda path: path.name)


def count_parsed_failures(analysis_dir: Path) -> int:
    """Return ``len()`` of the JSON document in ``failures.json``.

    Returns 0 when the file does not exist.
    """
    failures_json = analysis_dir / "failures.json"
    if not failures_json.exists():
        return 0
    return len(json.loads(failures_json.read_text()))


def build_agent_prompt(analysis_dir: Path, agent_report_path: Path) -> str:
    """Build the instruction prompt handed to the headless agent.

    The prompt points the agent at the parser output under *analysis_dir*, at
    the skill instructions in ``SKILL_FILE``, and tells it to write its
    findings to *agent_report_path*.
    """
    return f"""You are running the "analyze-bot-failures" skill unattended on a schedule.

The deterministic log parser has already run. Its output is here:
- Report: `{analysis_dir / "report.md"}`
- Machine-readable failures: `{analysis_dir / "failures.json"}`
- Raw per-job logs: `{analysis_dir / "raw_logs"}/`

The full skill instructions are at `{SKILL_FILE}`. Read it, then carry out
steps 2-5 (read the report, spot-check 2-3 raw logs for the most frequent
signatures, triage transient noise vs real bugs, map likely-real bugs to code
by reading the relevant files, and identify recurring question-id skip
candidates).

Hard constraints for this unattended run:
- DO NOT modify any code, skip lists, or workflows. This is read-only
  investigation. The only file you may write is the report described below.
- Be efficient: a handful of targeted file reads and at most a few raw-log
  spot-checks. Do not attempt to fix anything.

When done, WRITE your findings as markdown to exactly this path:
`{agent_report_path}`

Structure the report so it is useful as a Slack digest:
1. One-line health summary (how many failed jobs / failures, overall severity).
2. Failure counts by category and by bot (from report.md).
3. Real bugs found: each with a short evidence excerpt + code location
   (`path:line`) + a concrete proposed fix. If none, say so explicitly.
4. Transient noise: what to ignore and why.
5. Question skip candidates: recurring question IDs with counts, if any.
Keep it tight and skimmable. Lead with what a human needs to act on.

After writing the file, reply with just the word DONE."""


def run_cursor_agent(
    prompt: str, model: str, workspace: Path, timeout_seconds: int
) -> None:
    """Run ``cursor-agent`` headlessly, best-effort, with a hard timeout.

    This function never raises for agent-side problems (binary missing,
    timeout, non-zero exit). It logs them and returns, so the caller can fall
    back to whatever report exists on disk.

    Args:
        prompt: Full instruction text, passed as the final CLI argument.
        model: Value passed to ``--model``.
        workspace: Working directory for the agent process.
        timeout_seconds: Seconds to wait before the agent's process group is
            killed.
    """
    command = [
        "cursor-agent",
        "--print",
        "--force",
        "--output-format",
        "text",
        "--model",
        model,
        prompt,
    ]
    logger.info("Invoking cursor-agent (model=%s, timeout=%ss)", model, timeout_seconds)
    try:
        process = subprocess.Popen(
            command,
            cwd=str(workspace),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            # Own session => own process group, so a timeout can kill the
            # agent together with anything it spawned.
            start_new_session=True,
        )
    except OSError as error:
        # e.g. FileNotFoundError when the CLI is not installed. The agent step
        # is optional, so do not take the whole digest down with it.
        logger.error(
            "Could not start cursor-agent (%s); continuing without an agent report.",
            error,
        )
        return

    try:
        stdout, _ = process.communicate(timeout=timeout_seconds)
    except subprocess.TimeoutExpired:
        logger.warning(
            "cursor-agent did not exit within %ss (known headless hang); "
            "terminating and using whatever report it already wrote.",
            timeout_seconds,
        )
        try:
            os.killpg(os.getpgid(process.pid), signal.SIGKILL)
        except ProcessLookupError:
            # The agent exited on its own between the timeout and the kill.
            pass
        # Finish communication after the kill: this reaps the child, closes
        # the pipe, and recovers the output captured so far.
        try:
            stdout, _ = process.communicate(timeout=30)
        except subprocess.TimeoutExpired:
            # A descendant that left the process group may still hold the
            # pipe open; give up on the output rather than block the digest.
            stdout = "<output unavailable: pipe still open after kill>"
        logger.info("cursor-agent output before termination:\n%s", stdout)
        return

    logger.info("cursor-agent output:\n%s", stdout)
    if process.returncode != 0:
        logger.warning("cursor-agent exited with status %s", process.returncode)


def read_report_or_fallback(agent_report_path: Path, analysis_dir: Path) -> str:
    """Return the agent's report, or the best available substitute.

    Order of preference:
    1. the stripped contents of *agent_report_path*, when non-empty;
    2. ``report.md`` from *analysis_dir*, prefixed with a NOTE explaining the
       agent produced nothing;
    3. a fixed "No report could be generated." message.
    """
    # Read once; a missing or whitespace-only file both count as "no report".
    agent_report = (
        agent_report_path.read_text().strip() if agent_report_path.is_file() else ""
    )
    if agent_report:
        return agent_report
    logger.warning(
        "Agent did not produce %s; falling back to the deterministic report.",
        agent_report_path,
    )
    deterministic_report = analysis_dir / "report.md"
    if deterministic_report.exists():
        return (
            "NOTE: the agent investigation step produced no report; "
            "showing the raw deterministic report instead.\n\n"
            + deterministic_report.read_text().strip()
        )
    return "No report could be generated."
+ + +def build_run_link() -> str | None: + server = os.getenv("GITHUB_SERVER_URL") + repository = os.getenv("GITHUB_REPOSITORY") + run_id = os.getenv("GITHUB_RUN_ID") + if server and repository and run_id: + return f"{server}/{repository}/actions/runs/{run_id}" + return None + + +def post_to_slack(report: str, token: str, channel_id: str) -> None: + run_link = build_run_link() + header = "*Bot forecasting failure report*" + if run_link: + header += f" (<{run_link}|full logs & artifacts>)" + body = report + if len(body) > SLACK_MESSAGE_CHAR_LIMIT: + body = body[:SLACK_MESSAGE_CHAR_LIMIT] + "\n…(truncated — see full artifacts)" + + payload = json.dumps({"channel": channel_id, "text": f"{header}\n\n{body}"}).encode( + "utf-8" + ) + request = urllib.request.Request( + "https://slack.com/api/chat.postMessage", + data=payload, + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {token}", + }, + ) + with urllib.request.urlopen(request, timeout=30) as response: + logger.info("Posted report to Slack (status %s)", response.status) + + +def deliver_report(report: str) -> None: + print("\n===== REPORT =====\n") + print(report) + print("\n==================\n") + slack_token = os.getenv("SLACK_BOT_TOKEN") + channel_id = os.getenv("SLACK_CHANNEL_ID") + if slack_token and channel_id: + post_to_slack(report, slack_token, channel_id) + else: + logger.info( + "Skipping Slack post; SLACK_BOT_TOKEN and/or SLACK_CHANNEL_ID not set." + ) + + +def run_scheduled_analysis( + since: str, + output_dir: Path, + model: str, + agent_timeout_seconds: int, + skip_agent: bool, +) -> None: + run_failure_log_analysis(since, output_dir) + analysis_dir = find_latest_analysis_dir(output_dir) + failure_count = count_parsed_failures(analysis_dir) + logger.info("Parsed %s failures in %s", failure_count, analysis_dir) + + if failure_count == 0: + deliver_report( + f"No bot job failures found in the last `{since}`. All green. 
āœ…" + ) + return + + if skip_agent or not os.getenv("CURSOR_API_KEY"): + logger.warning( + "Skipping agent step (skip_agent=%s, CURSOR_API_KEY set=%s); " + "delivering the deterministic report.", + skip_agent, + bool(os.getenv("CURSOR_API_KEY")), + ) + deliver_report((analysis_dir / "report.md").read_text().strip()) + return + + agent_report_path = analysis_dir / AGENT_REPORT_FILENAME + prompt = build_agent_prompt(analysis_dir, agent_report_path) + run_cursor_agent( + prompt=prompt, + model=model, + workspace=Path.cwd(), + timeout_seconds=agent_timeout_seconds, + ) + report = read_report_or_fallback(agent_report_path, analysis_dir) + deliver_report(report) + + +def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") + parser = argparse.ArgumentParser( + description="Run the analyze-bot-failures skill end to end on a schedule" + ) + parser.add_argument("--since", default="1d") + parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR) + parser.add_argument("--model", default=os.getenv("CURSOR_AGENT_MODEL", "auto")) + parser.add_argument("--agent-timeout-seconds", type=int, default=1500) + parser.add_argument( + "--skip-agent", + action="store_true", + help="Skip the cursor-agent step and deliver the deterministic report only", + ) + args = parser.parse_args() + + run_scheduled_analysis( + since=args.since, + output_dir=args.output_dir, + model=args.model, + agent_timeout_seconds=args.agent_timeout_seconds, + skip_agent=args.skip_agent, + ) + + +if __name__ == "__main__": + main()