Metaculus · CodexVeritas · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/.github/workflows/analyze-bot-failures.yaml b/.github/workflows/analyze-bot-failures.yaml
@@ -0,0 +1,93 @@
+name: Analyze Bot Forecasting Failures
+
+on:
+  workflow_dispatch:
+    inputs:
+      since:
+        description: "Time window to analyze (e.g. 12h, 1d, 2d, 1w)"
+        type: string
+        required: false
+        default: "1d"
+      skip_agent:
+        description: "Skip the cursor-agent investigation and send the raw report only"
+        type: boolean
+        required: false
+        default: false
+  schedule:
+    - cron: "0 14 * * *" # daily at 14:00 UTC
+
+# One analysis at a time: a newly triggered run waits for the in-progress one instead of cancelling it.
+concurrency:
+  group: ${{ github.workflow }}
+  cancel-in-progress: false
+
+# Least-privilege token: read-only access to repository contents and Actions data.
+permissions:
+  contents: read
+  actions: read
+
+jobs:
+  analyze_failures:
+    runs-on: ubuntu-latest
+    timeout-minutes: 40
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v5
+
+      - id: setup-python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.11"
+
+      - name: Install poetry
+        uses: snok/install-poetry@v1
+        with:
+          virtualenvs-create: true
+          virtualenvs-in-project: true
+          installer-parallel: true
+
+      - name: Load cached venv
+        uses: actions/cache@v5
+        with:
+          path: .venv
+          key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
+
+      - name: Install dependencies
+        run: poetry install --no-interaction --no-root
+
+      - name: Install Cursor CLI
+        run: |
+          # The default `run` shell is `bash -e` without pipefail, so enable it
+          # explicitly: a failed download must fail the step instead of piping
+          # an empty script into bash.
+          set -o pipefail
+          curl https://cursor.com/install -fsS | bash
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+          echo "$HOME/.cursor/bin" >> "$GITHUB_PATH"
+
+      - name: Run scheduled failure analysis
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+          CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}
+          CURSOR_AGENT_MODEL: ${{ vars.CURSOR_AGENT_MODEL || 'auto' }}
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+          SLACK_CHANNEL_ID: ${{ secrets.SLACK_CHANNEL_ID }}
+          # Workflow inputs are passed through the environment instead of being
+          # interpolated into the script text, so input text is never parsed as shell code.
+          SINCE: ${{ inputs.since || '1d' }}
+          SKIP_AGENT: ${{ inputs.skip_agent == true || inputs.skip_agent == 'true' }}
+        run: |
+          args=(--since "$SINCE")
+          if [ "$SKIP_AGENT" = "true" ]; then
+            args+=(--skip-agent)
+          fi
+          poetry run python scripts/skills/analyze_bot_failures/run_scheduled_analysis.py "${args[@]}"
+
+      - name: Upload analysis artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: failure-analysis-${{ github.run_id }}
+          path: logs/workflow_failure_analysis/**
+          if-no-files-found: ignore
+          retention-days: 14
diff --git a/scripts/skills/analyze_bot_failures/SKILL.md b/scripts/skills/analyze_bot_failures/SKILL.md
@@ -7,18 +7,36 @@ description: Analyze failures from the Metaculus bot forecasting GitHub Actions
 
 Workflow for diagnosing failures in the `run-bot-aib-tournament.yaml` GitHub Actions workflow, which runs ~30 bot jobs every 30 minutes via `run_bots.py`.
 
+## Running this automatically (cron + Slack report)
+
+This skill is wired up to run unattended on a daily schedule via the
+`Analyze Bot Forecasting Failures` GitHub Actions workflow
+(`.github/workflows/analyze-bot-failures.yaml`). That workflow calls
+`scripts/skills/analyze_bot_failures/run_scheduled_analysis.py`, which runs the
+parser below, hands the report to a headless Cursor agent that performs steps
+2-5, and posts the agent's prioritized summary to Slack.
+
+Required repo secret: `CURSOR_API_KEY` (for the agent). Optional: `SLACK_BOT_TOKEN`
+and `SLACK_CHANNEL_ID` for Slack delivery. Optional repo variable `CURSOR_AGENT_MODEL`
+(defaults to `auto`). The workflow's `permissions:` block grants its `GITHUB_TOKEN` the
+`actions: read` scope the parser needs.
+
+The same orchestrator script can be run from a local crontab if preferred (set
+`GITHUB_TOKEN`, `CURSOR_API_KEY`, `SLACK_BOT_TOKEN`, and `SLACK_CHANNEL_ID` in the environment).
+The steps below describe the manual/interactive version of the workflow.
+
 ## Step 1: Pull and aggregate the failure logs
 
 A GitHub token is required (the job-log API rejects unauthenticated requests). Check `GITHUB_TOKEN` env var or `gh auth token`; if neither works, ask the user to run `gh auth login`.
 
 ```bash
-poetry run python scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py --since 1d
+poetry run python scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py
 ```
 
 Useful options:
-- `--since 12h|2d|1w` or an ISO datetime (default `1d`)
+- `--since 12h|2d|1w|4w` or an ISO datetime (default `3d`). Any reasonable period works; the whole window is analyzed by default (note the workflow runs every 30 min, so wide windows pull many runs and take longer).
 - `--run-id <id>` to analyze one specific run
-- `--max-runs <n>` cap on runs fetched (default 50)
+- `--max-runs <n>` optional cap on runs fetched (default: no cap, so the full `--since` window is covered). A warning is logged if the cap truncates the window.
 
 Output goes to `logs/workflow_failure_analysis/<timestamp>/`:
 - `report.md` — counts by category/bot/question, plus failure groups (deduped by normalized signature) with an example message, traceback, and the deepest repo code frame

diff --git a/scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py b/scripts/skills/analyze_bot_failures/analyze_bot_run_failures.py
@@ -83,7 +83,7 @@
     ],
 }
 
-QUESTION_URL_PATTERN = re.compile(r"https://www\.metaculus\.com/questions/\d+/")
+QUESTION_URL_PATTERN = re.compile(r"https://www\.metaculus\.com/questions/\d+/?")
 GH_TIMESTAMP_PREFIX_PATTERN = re.compile(
     r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z\s?"
 )
@@ -181,11 +181,11 @@ def parse_since_to_datetime(since: str) -> datetime:
 
 
 def list_workflow_runs(
-    repo: str, workflow: str, token: str, since: datetime, max_runs: int
+    repo: str, workflow: str, token: str, since: datetime, max_runs: int | None
 ) -> list[dict]:
     runs: list[dict] = []
     page = 1
-    while len(runs) < max_runs:
+    while max_runs is None or len(runs) < max_runs:
         data = github_get(
             f"/repos/{repo}/actions/workflows/{workflow}/runs",
             token,
@@ -201,7 +201,14 @@ def list_workflow_runs(
             break
         runs.extend(page_runs)
         page += 1
-    return runs[:max_runs]
+    if max_runs is not None and len(runs) > max_runs:
+        logger.warning(
+            f"--max-runs={max_runs} truncated the {len(runs)} runs found in the "
+            f"--since window; some of the time period is not covered. Raise or drop "
+            f"--max-runs to analyze the full window."
+        )
+        return runs[:max_runs]
+    return runs
 
 
 def list_failed_jobs(repo: str, run_id: int, token: str) -> list[dict]:
@@ -466,13 +473,81 @@ def build_failure_group_section(group: FailureGroup) -> list[str]:
     return section_lines
 
 
-def build_report(job_analyses: list[JobAnalysis], output_dir: Path) -> str:
+def build_question_failure_section(all_events: list[FailureEvent]) -> list[str]:
+    """Build the per-question failure table for the markdown report.
+
+    Events without a question URL are skipped. Questions are ordered by the
+    number of distinct runs they failed in, then by total failure count (both
+    descending).
+
+    Args:
+        all_events: Parsed failure events to aggregate.
+
+    Returns:
+        Markdown lines (heading, explanation and table), or an empty list when
+        no event carries a question URL.
+    """
+    stats_by_question: dict[str, dict] = {}
+    for event in all_events:
+        if not event.question_url:
+            continue
+        # URLs may arrive with or without a trailing slash (QUESTION_URL_PATTERN
+        # accepts both; presumably it populates this field), so canonicalize
+        # before grouping. Otherwise one question is split across two rows and
+        # its distinct-run count is understated.
+        question_url = event.question_url.rstrip("/") + "/"
+        question_stats = stats_by_question.setdefault(
+            question_url,
+            {"total": 0, "run_ids": set(), "bot_names": set(), "categories": {}},
+        )
+        question_stats["total"] += 1
+        question_stats["run_ids"].add(event.run_id)
+        question_stats["bot_names"].add(event.bot_name)
+        for category in event.categories:
+            question_stats["categories"][category] = (
+                question_stats["categories"].get(category, 0) + 1
+            )
+    if not stats_by_question:
+        return []
+
+    # Rank by breadth across runs first; total failure count only breaks ties.
+    questions_ranked_by_consistency = sorted(
+        stats_by_question.items(),
+        key=lambda item: (-len(item[1]["run_ids"]), -item[1]["total"]),
+    )
+
+    section_lines = [
+        "\n## Failures by question (most distinct runs first)\n",
+        "A question failing across many *distinct runs* is far more likely to be "
+        "genuinely broken than one that failed many times within a single run. "
+        "Questions recurring across several runs are candidates for "
+        "`POST_IDS_TO_SKIP` or `POST_IDS_TO_NOT_RAISE_ERRORS_FOR` in `run_bots.py`.\n",
+        "| Question | Failures | Distinct runs | Distinct bots | Top categories |",
+        "| --- | --- | --- | --- | --- |",
+    ]
+    for question_url, question_stats in questions_ranked_by_consistency:
+        # Show at most the three most frequent categories per question.
+        top_categories = ", ".join(
+            f"{category} ({count})"
+            for category, count in sorted(
+                question_stats["categories"].items(), key=lambda pair: -pair[1]
+            )[:3]
+        )
+        section_lines.append(
+            f"| {question_url} | {question_stats['total']} | "
+            f"{len(question_stats['run_ids'])} | {len(question_stats['bot_names'])} | "
+            f"{top_categories} |"
+        )
+    return section_lines
+
+
+def build_report(
+    job_analyses: list[JobAnalysis], output_dir: Path, time_window_str: str
+) -> str:
     all_events = [event for analysis in job_analyses for event in analysis.events]
     groups = group_failures(all_events)
 
     report_lines = [
         "# Bot Workflow Failure Report",
         f"\nGenerated: {datetime.now(timezone.utc).isoformat()}",
+        f"Time window: {time_window_str}",
         f"\nFailed jobs analyzed: {len(job_analyses)}",
         f"Individual failures parsed: {len(all_events)}",
         f"Unique failure signatures: {len(groups)}",
@@ -488,14 +546,7 @@ def build_report(job_analyses: list[JobAnalysis], output_dir: Path) -> str:
     for bot_name, count in count_by([event.bot_name for event in all_events]):
         report_lines.append(f"- {bot_name}: {count}")
 
-    question_urls = [event.question_url for event in all_events if event.question_url]
-    if question_urls:
-        report_lines.append(
-            "\n## Questions appearing in failures "
-            "(recurring ones are POST_IDS_TO_SKIP candidates)\n"
-        )
-        for question_url, count in count_by(question_urls):
-            report_lines.append(f"- {question_url} : {count}")
+    report_lines.extend(build_question_failure_section(all_events))
 
     report_lines.append("\n## Failure groups (most frequent first)\n")
     for group in groups:
@@ -587,13 +638,14 @@ def analyze_runs(
     repo: str,
     workflow: str,
     since: str,
-    max_runs: int,
+    max_runs: int | None,
     output_dir: Path,
     run_id: int | None = None,
 ) -> str:
     token = resolve_github_token()
     if run_id is not None:
         runs = [github_get(f"/repos/{repo}/actions/runs/{run_id}", token)]
+        time_window_str = f"Run {run_id}"
     else:
         since_datetime = parse_since_to_datetime(since)
         all_runs = list_workflow_runs(repo, workflow, token, since_datetime, max_runs)
@@ -603,6 +655,7 @@ def analyze_runs(
             if run.get("conclusion") not in ("success", None)
             or run.get("status") != "completed"
         ]
+        time_window_str = f"Since {since_datetime.isoformat()}"
         logger.info(
             f"Found {len(all_runs)} runs since {since_datetime.isoformat()}, "
             f"{len(runs)} with failures"
@@ -621,7 +674,7 @@ def analyze_runs(
     if not job_analyses:
         logger.info("No failed jobs found in the selected window.")
 
-    report_path = build_report(job_analyses, analysis_dir)
+    report_path = build_report(job_analyses, analysis_dir, time_window_str)
     logger.info(f"Report written to {report_path}")
     logger.info(f"Raw logs saved under {raw_logs_dir}")
     return report_path
@@ -634,15 +687,28 @@ def main() -> None:
     )
     parser.add_argument(
         "--since",
-        default="1d",
-        help="Time window like 12h, 2d, 1w, or an ISO datetime (default: 1d)",
+        default="3d",
+        help=(
+            "Time window to analyze. Accepts <N>h / <N>d / <N>w (e.g. 12h, 2d, 1w, "
+            "4w) or an ISO datetime. The whole window is analyzed by default "
+            "(default: 3d)"
+        ),
     )
     parser.add_argument(
         "--run-id", type=int, default=None, help="Analyze a single specific run id"
     )
     parser.add_argument("--repo", default=DEFAULT_REPO)
     parser.add_argument("--workflow", default=DEFAULT_WORKFLOW)
-    parser.add_argument("--max-runs", type=int, default=50)
+    parser.add_argument(
+        "--max-runs",
+        type=int,
+        default=None,
+        help=(
+            "Optional cap on the number of workflow runs fetched. Default is no cap, "
+            "so the full --since window is analyzed. Only set this to limit work on "
+            "very large windows (a warning is logged if it truncates the window)."
+        ),
+    )
     parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR)
     args = parser.parse_args()