From 75f68de9b20e28176202b2882eb061a1939b2275 Mon Sep 17 00:00:00 2001
From: Jaden Earl <jadendwightearl@gmail.com>
Date: Thu, 25 Jun 2026 16:11:28 -0600
Subject: [PATCH] Add source archive

Capture HTML + screenshot + markdown for the URLs a forecasting bot cited and
store them (S3 or local) with provenance, deduplicated by url + content-hash.
Heavy backends are an optional `source-archive` extra.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .env.template                                 |  34 +-
 .gitignore                                    |   4 +
 _typos.toml                                   |   2 +
 .../test_source_archive/test_backends.py      | 266 +++++++++
 .../test_source_archive/test_canonicalize.py  |  72 +++
 .../test_source_archive/test_catalog.py       | 122 ++++
 .../test_comment_harvester.py                 |  91 ---
 .../test_source_archive/test_content_store.py |  90 ++-
 .../test_source_archive/test_coverage.py      | 110 ++++
 .../test_source_archive/test_metaculus_db.py  | 124 ++++
 .../test_pipeline_and_manifest.py             | 113 +++-
 .../test_source_archive/test_reindex.py       |  87 +++
 .../test_source_archive/test_reports.py       |  51 ++
 .../test_screenshot_encoding.py               |  67 +++
 .../test_trace_extraction.py                  | 215 +++++++
 .../test_url_extraction.py                    |  19 +
 .../agents_and_tools/source_archive/README.md | 195 +++++-
 .../source_archive/__init__.py                |   6 +-
 .../source_archive/benchmark.py               | 459 ++++++++++++++
 .../source_archive/canonicalize.py            | 115 ++++
 .../source_archive/catalog.py                 | 562 ++++++++++++++++++
 .../agents_and_tools/source_archive/cli.py    | 274 ++++++++-
 .../agents_and_tools/source_archive/config.py |  42 +-
 .../source_archive/content_store.py           | 203 ++++++-
 .../source_archive/coverage.py                | 237 ++++++++
 .../source_archive/fetchers/__init__.py       |  96 ++-
 .../fetchers/cloakbrowser_fetcher.py          |  62 ++
 .../fetchers/firecrawl_fetcher.py             |  46 +-
 .../fetchers/hyperbrowser_fetcher.py          | 149 +++++
 .../source_archive/fetchers/pdf_fetcher.py    | 146 +++++
 .../fetchers/playwright_fetcher.py            | 140 ++++-
 .../source_archive/ingest/__init__.py         |  21 +-
 .../ingest/metaculus_comments.py              | 180 ------
 .../source_archive/ingest/metaculus_db.py     | 215 +++++++
 .../source_archive/ingest/trace_extraction.py | 380 ++++++++++++
 .../source_archive/ingest/url_extraction.py   |  49 +-
 .../source_archive/manifest.py                |  17 +-
 .../agents_and_tools/source_archive/models.py |  21 +-
 .../source_archive/pipeline.py                | 170 ++++++
 .../source_archive/reindex.py                 | 278 +++++++++
 .../source_archive/reports.py                 |  72 +++
 .../source_archive/storage/blob_store.py      |   5 +
 .../source_archive/storage/local_store.py     |  10 +
 .../source_archive/storage/s3_store.py        |   6 +
 .../agents_and_tools/source_archive/viewer.py | 409 +++++++++++++
 poetry.lock                                   |  83 ++-
 pyproject.toml                                |   9 +-
 47 files changed, 5690 insertions(+), 434 deletions(-)
 create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_backends.py
 create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_canonicalize.py
 create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_catalog.py
 delete mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py
 create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_coverage.py
 create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_metaculus_db.py
 create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reindex.py
 create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reports.py
 create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_screenshot_encoding.py
 create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_trace_extraction.py
 create mode 100644 forecasting_tools/agents_and_tools/source_archive/benchmark.py
 create mode 100644 forecasting_tools/agents_and_tools/source_archive/canonicalize.py
 create mode 100644 forecasting_tools/agents_and_tools/source_archive/catalog.py
 create mode 100644 forecasting_tools/agents_and_tools/source_archive/coverage.py
 create mode 100644 forecasting_tools/agents_and_tools/source_archive/fetchers/cloakbrowser_fetcher.py
 create mode 100644 forecasting_tools/agents_and_tools/source_archive/fetchers/hyperbrowser_fetcher.py
 create mode 100644 forecasting_tools/agents_and_tools/source_archive/fetchers/pdf_fetcher.py
 delete mode 100644 forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py
 create mode 100644 forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_db.py
 create mode 100644 forecasting_tools/agents_and_tools/source_archive/ingest/trace_extraction.py
 create mode 100644 forecasting_tools/agents_and_tools/source_archive/reindex.py
 create mode 100644 forecasting_tools/agents_and_tools/source_archive/reports.py
 create mode 100644 forecasting_tools/agents_and_tools/source_archive/viewer.py

diff --git a/.env.template b/.env.template
index 7f167fd0..e96d4a48 100644
--- a/.env.template
+++ b/.env.template
@@ -17,8 +17,40 @@ METACULUS_API_BASE_URL=https://www.metaculus.com/api
 # As of Jan 23rd 2025, only used for free semantic similarity calculation in Deduplicator, but defaults to OpenAI if not filled in
 HUGGINGFACE_API_KEY=
 
-# As of Jun 10 2025, used for browser use agents
+# As of Jun 10 2025, used for browser use agents.
+# Also a fallback capture backend for the source archive (see below).
 HYPERBROWSER_API_KEY=
 
+# --- Source archive (agents_and_tools/source_archive) -----------------------
+# Capture HTML + screenshot + markdown for every URL a bot cites. Every setting
+# here is optional; a blank WEB_ARCHIVE_S3_BUCKET stores captures locally, not in S3.
+WEB_ARCHIVE_S3_BUCKET=
+WEB_ARCHIVE_S3_PREFIX=source-archive
+WEB_ARCHIVE_AWS_PROFILE=
+# Set to a local capture directory to run/view the archive with no S3 (the
+# viewer reads from here when set). E.g. `capture --local ./archive`.
+WEB_ARCHIVE_LOCAL_DIR=
+WEB_ARCHIVE_TTL_DAYS=14
+# Managed fallback backends for the anti-bot / PDF tail behind self-hosted
+# Playwright. FIRECRAWL also parses PDFs natively (OCR fallback for PdfFetcher).
+FIRECRAWL_API_KEY=
+# Firecrawl proxy mode for hardened anti-bot sites: basic (1 credit) | auto |
+# stealth/enhanced (5 credits). Leave "basic" unless you need Cloudflare bypass.
+WEB_ARCHIVE_FIRECRAWL_PROXY=basic
+# Hyperbrowser session knobs (proxy turns a 1-credit scrape into 10 credits).
+WEB_ARCHIVE_HYPERBROWSER_PROXY=true
+WEB_ARCHIVE_HYPERBROWSER_STEALTH=true
+WEB_ARCHIVE_HYPERBROWSER_CAPTCHA=true
+# CloakBrowser (self-hosted anti-bot Playwright fork) module, if installed
+# (`pip install cloakbrowser`). Exposes cloakbrowser.launch() -> Browser.
+WEB_ARCHIVE_CLOAKBROWSER_IMPORT=cloakbrowser
+WEB_ARCHIVE_PDF_MAX_PAGES=50
+# Operator-only: database DSN for `harvest-db` (reads a bot's cited URLs straight
+# from Postgres). libpq DSN or postgresql:// URL — e.g. a Neon connection string.
+# This DSN is a real secret. PREFER the macOS Keychain (item `metaculus-db-dsn`)
+# over this file — see the source_archive README "DSN resolution". Leave blank to
+# use the Keychain / local default.
+METACULUS_DB_DSN=
+
 # Disable if in Streamlit Cloud
 FILE_WRITING_ALLOWED=TRUE
diff --git a/.gitignore b/.gitignore
index 96b48188..4a05a450 100644
--- a/.gitignore
+++ b/.gitignore
@@ -179,3 +179,7 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 # .idea/
+
+# Private bot trace samples must never land in this public repo (kept locally).
+butler-traces/
+**/butler-traces/
diff --git a/_typos.toml b/_typos.toml
index 0359061b..83ba4592 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -1,6 +1,8 @@
 [default]
 extend-ignore-identifiers-re = [
     "AttributeID.*Supress.*",
+    # Real tracking-query params stripped during URL canonicalization (not typos).
+    "oly_.*",
 ]
 
 [default.extend-identifiers]
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_backends.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_backends.py
new file mode 100644
index 00000000..3adcf9be
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_backends.py
@@ -0,0 +1,266 @@
+"""Unit tests for the backup capture backends and the bake-off pricing model.
+
+These mock the vendor SDKs so they run without API keys, network, browsers, or
+the optional pymupdf/playwright/cloakbrowser packages installed.
+"""
+
+from __future__ import annotations
+
+import base64
+
+import pytest
+
+from forecasting_tools.agents_and_tools.source_archive import benchmark as B
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.fetchers import (
+    build_default_fetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError
+from forecasting_tools.agents_and_tools.source_archive.fetchers.cloakbrowser_fetcher import (
+    CloakBrowserFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.firecrawl_fetcher import (
+    FirecrawlFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.hyperbrowser_fetcher import (
+    HyperbrowserFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.pdf_fetcher import (
+    PdfFetcher,
+    looks_like_pdf,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+
+
+# --- Firecrawl proxy/stealth wiring ------------------------------------------
+def test_firecrawl_basic_sends_no_proxy_key():
+    f = FirecrawlFetcher(ArchiveConfig(firecrawl_proxy="basic"))
+    assert "proxy" not in f._scrape_kwargs(["markdown"])
+
+
+@pytest.mark.parametrize("mode", ["auto", "stealth", "enhanced"])
+def test_firecrawl_stealth_sends_proxy_key(mode):
+    f = FirecrawlFetcher(ArchiveConfig(firecrawl_proxy=mode))
+    assert f._scrape_kwargs(["markdown"])["proxy"] == mode
+
+
+def test_firecrawl_fetch_pdf_markdown():
+    class FakeClient:
+        def scrape(self, url, **kwargs):
+            assert kwargs["formats"] == ["markdown"]
+            return {"markdown": "# PDF body " + "x " * 200}
+
+    f = FirecrawlFetcher(ArchiveConfig(firecrawl_api_key="k"), client=FakeClient())
+    assert f.fetch_pdf_markdown("https://x/y.pdf").startswith("# PDF body")
+
+
+# --- Hyperbrowser screenshot coercion + result mapping -----------------------
+def test_hyperbrowser_coerce_screenshot_data_uri():
+    raw = b"\x89PNG fake"
+    uri = "data:image/png;base64," + base64.b64encode(raw).decode()
+    shot, ctype = HyperbrowserFetcher._coerce_screenshot(uri)
+    assert shot == raw and ctype == "image/png"
+
+
+def test_hyperbrowser_coerce_screenshot_bare_base64():
+    raw = b"\x89PNG fake"
+    shot, ctype = HyperbrowserFetcher._coerce_screenshot(base64.b64encode(raw).decode())
+    assert shot == raw and ctype == "image/png"
+
+
+def test_hyperbrowser_coerce_screenshot_none():
+    assert HyperbrowserFetcher._coerce_screenshot(None) == (None, None)
+
+
+def test_hyperbrowser_fetch_maps_result(monkeypatch):
+    class Data:
+        metadata = {"statusCode": 200, "title": "T", "sourceURL": "https://final"}
+        html = "<html>ok</html>"
+        markdown = "ok " * 100
+        screenshot = None
+
+    class Resp:
+        status = "completed"
+        error = None
+        data = Data()
+
+    class FakeClient:
+        class scrape:
+            @staticmethod
+            def start_and_wait(params):
+                return Resp()
+
+    f = HyperbrowserFetcher(
+        ArchiveConfig(hyperbrowser_api_key="k"), client=FakeClient()
+    )
+    # Avoid constructing real SDK request models in the unit test.
+    monkeypatch.setattr(f, "_params", lambda url: None)
+    result = f.fetch("https://x")
+    assert result.fetcher == "hyperbrowser"
+    assert result.final_url == "https://final"
+    assert result.status_code == 200
+    assert result.metadata["used_proxy"] is True
+
+
+def test_hyperbrowser_failed_job_raises(monkeypatch):
+    class Resp:
+        status = "failed"
+        error = "blocked"
+        data = None
+
+    class FakeClient:
+        class scrape:
+            @staticmethod
+            def start_and_wait(params):
+                return Resp()
+
+    f = HyperbrowserFetcher(
+        ArchiveConfig(hyperbrowser_api_key="k"), client=FakeClient()
+    )
+    monkeypatch.setattr(f, "_params", lambda url: None)
+    with pytest.raises(FetchError):
+        f.fetch("https://x")
+
+
+def test_hyperbrowser_requires_key():
+    with pytest.raises(FetchError):
+        HyperbrowserFetcher(ArchiveConfig(hyperbrowser_api_key=None)).fetch("https://x")
+
+
+# --- PDF fetcher -------------------------------------------------------------
+def test_looks_like_pdf():
+    assert looks_like_pdf("https://x/report.pdf")
+    assert looks_like_pdf("https://x/report.PDF?v=2")
+    assert not looks_like_pdf("https://x/report.html")
+
+
+def test_pdf_rejects_non_pdf_bytes():
+    f = PdfFetcher(
+        ArchiveConfig(),
+        downloader=lambda url, t: (b"<html>not a pdf</html>", url, 200),
+    )
+    with pytest.raises(FetchError):
+        f.fetch("https://x/fake.pdf")
+
+
+def test_pdf_falls_back_to_firecrawl_when_local_thin(monkeypatch):
+    class FakeFirecrawl:
+        def fetch_pdf_markdown(self, url):
+            return "# Scanned doc recovered by OCR " + "y " * 200
+
+    f = PdfFetcher(
+        ArchiveConfig(),
+        firecrawl=FakeFirecrawl(),
+        downloader=lambda url, t: (b"%PDF- minimal", url, 200),
+    )
+    # Force the local parser to look thin regardless of whether pymupdf is present.
+    monkeypatch.setattr(f, "_parse_local", lambda data: (None, None, None, 3, "none"))
+    result = f.fetch("https://x/scan.pdf")
+    assert result.metadata["pdf_engine"] == "firecrawl"
+    assert "OCR" in result.markdown
+
+
+def test_pdf_uses_local_when_text_is_rich(monkeypatch):
+    f = PdfFetcher(
+        ArchiveConfig(),
+        downloader=lambda url, t: (b"%PDF- minimal", url, 200),
+    )
+    rich = "# Title\n" + "real body text " * 100
+    monkeypatch.setattr(
+        f, "_parse_local", lambda data: (rich, b"png", "image/png", 5, "pymupdf4llm")
+    )
+    result = f.fetch("https://x/clean.pdf")
+    assert result.metadata["pdf_engine"] == "pymupdf4llm"
+    assert result.metadata["pdf_pages"] == 5
+    assert result.screenshot == b"png"
+
+
+# --- CloakBrowser ------------------------------------------------------------
+def test_cloakbrowser_missing_package_gives_clear_error(monkeypatch):
+    # Force every import to fail so this passes whether or not cloakbrowser is
+    # actually installed in the test environment.
+    import forecasting_tools.agents_and_tools.source_archive.fetchers.cloakbrowser_fetcher as cb
+
+    def _boom(name):
+        raise ImportError(name)
+
+    monkeypatch.setattr(cb.importlib, "import_module", _boom)
+    f = CloakBrowserFetcher(ArchiveConfig())
+    with pytest.raises(FetchError) as exc:
+        f._launch_browser()
+    assert "cloakbrowser" in str(exc.value).lower()
+
+
+# --- Pricing model -----------------------------------------------------------
+def test_pricing_self_host_is_floor():
+    r = CaptureResult(url="u", final_url="u")
+    assert B.estimate_cost("playwright", r, 1_000_000, B.Pricing()) == 0.00001
+    assert B.estimate_cost("cloakbrowser", r, 1_000_000, B.Pricing()) == 0.00001
+
+
+def test_pricing_firecrawl_basic_vs_stealth():
+    basic = CaptureResult(url="u", final_url="u", metadata={"firecrawl_proxy": "basic"})
+    stealth = CaptureResult(
+        url="u", final_url="u", metadata={"firecrawl_proxy": "auto"}
+    )
+    assert B.estimate_cost("firecrawl", basic, 0, B.Pricing()) == pytest.approx(0.00083)
+    assert B.estimate_cost(
+        "firecrawl-stealth", stealth, 0, B.Pricing()
+    ) == pytest.approx(0.00415)
+
+
+def test_pricing_hyperbrowser_proxy_includes_bandwidth():
+    r = CaptureResult(url="u", final_url="u", metadata={"used_proxy": True})
+    # 10 credits ($0.01) + 1MB * $10/GB ($0.01) = $0.02
+    assert B.estimate_cost("hyperbrowser", r, 1_000_000, B.Pricing()) == pytest.approx(
+        0.02
+    )
+
+
+def test_pricing_pdf_local_is_free_firecrawl_is_per_page():
+    local = CaptureResult(
+        url="u", final_url="u", metadata={"pdf_engine": "pymupdf4llm"}
+    )
+    ocr = CaptureResult(
+        url="u", final_url="u", metadata={"pdf_engine": "firecrawl", "pdf_pages": 10}
+    )
+    assert B.estimate_cost("pdf", local, 0, B.Pricing()) == 0.0
+    assert B.estimate_cost("pdf", ocr, 0, B.Pricing()) == pytest.approx(0.0083)
+
+
+# --- Default tiered chain composition ----------------------------------------
+def _fake_browser():
+    from unittest.mock import MagicMock
+
+    return None, MagicMock()  # (playwright_handle, browser) — browser.close() ok
+
+
+def test_default_chain_cloakbrowser_is_primary(monkeypatch):
+    # CloakBrowser available -> it is the single self-hosted browser tier.
+    monkeypatch.setattr(
+        CloakBrowserFetcher, "_launch_browser", lambda self: _fake_browser()
+    )
+    config = ArchiveConfig(hyperbrowser_api_key="h", firecrawl_api_key="f")
+    with build_default_fetcher(config) as fetcher:
+        names = [b.name for b in fetcher._tiered.backends]
+    # Note: exactly one browser tier (cloakbrowser), not vanilla + cloak.
+    assert names == ["cloakbrowser", "pdf", "hyperbrowser", "firecrawl"]
+
+
+def test_default_chain_falls_back_to_playwright_and_skips_unkeyed(monkeypatch):
+    from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import (
+        PlaywrightFetcher,
+    )
+
+    # CloakBrowser not installed -> vanilla Playwright is the browser tier.
+    def raise_unavailable(self):
+        raise FetchError("cloakbrowser not installed")
+
+    monkeypatch.setattr(CloakBrowserFetcher, "_launch_browser", raise_unavailable)
+    monkeypatch.setattr(
+        PlaywrightFetcher, "_launch_browser", lambda self: _fake_browser()
+    )
+    config = ArchiveConfig(hyperbrowser_api_key=None, firecrawl_api_key=None)
+    with build_default_fetcher(config) as fetcher:
+        names = [b.name for b in fetcher._tiered.backends]
+    assert names == ["playwright", "pdf"]
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_canonicalize.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_canonicalize.py
new file mode 100644
index 00000000..e9476409
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_canonicalize.py
@@ -0,0 +1,72 @@
+from __future__ import annotations
+
+import pytest
+
+from forecasting_tools.agents_and_tools.source_archive.canonicalize import (
+    canonicalize_url,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import url_hash
+
+# (raw, expected canonical) — each pair documents one normalization rule.
+CASES = [
+    # fragment dropped
+    ("https://a.test/x#section", "https://a.test/x"),
+    # trailing slash dropped (non-root)
+    ("https://a.test/x/", "https://a.test/x"),
+    # root path collapses (with or without slash) to scheme + host only
+    ("https://a.test/", "https://a.test"),
+    ("https://a.test", "https://a.test"),
+    # scheme + host lowercased, path case preserved
+    ("HTTPS://A.TEST/Path", "https://a.test/Path"),
+    # default ports stripped, non-default kept
+    ("http://a.test:80/x", "http://a.test/x"),
+    ("https://a.test:443/x", "https://a.test/x"),
+    ("https://a.test:8443/x", "https://a.test:8443/x"),
+    # tracking params removed, meaningful params kept
+    ("https://a.test/x?utm_source=z&utm_medium=email", "https://a.test/x"),
+    ("https://a.test/x?id=7&fbclid=abc", "https://a.test/x?id=7"),
+    ("https://a.test/x?gclid=abc&igshid=q", "https://a.test/x"),
+    # remaining params sorted (order-independent)
+    ("https://a.test/x?b=2&a=1", "https://a.test/x?a=1&b=2"),
+    # bare "ref"/"source" are intentionally preserved
+    ("https://a.test/x?ref=home", "https://a.test/x?ref=home"),
+    # combination
+    (
+        "HTTPS://A.TEST:443/Path/?b=2&utm_campaign=spring&a=1#frag",
+        "https://a.test/Path?a=1&b=2",
+    ),
+    # non-http(s) left alone
+    ("mailto:someone@a.test", "mailto:someone@a.test"),
+]
+
+
+@pytest.mark.parametrize("raw,expected", CASES)
+def test_canonicalize_cases(raw: str, expected: str):
+    assert canonicalize_url(raw) == expected
+
+
+@pytest.mark.parametrize("raw,_expected", CASES)
+def test_canonicalize_is_idempotent(raw: str, _expected: str):
+    once = canonicalize_url(raw)
+    assert canonicalize_url(once) == once
+
+
+def test_near_duplicates_share_a_url_hash():
+    variants = [
+        "https://a.test/article",
+        "https://a.test/article/",
+        "https://a.test/article#intro",
+        "https://a.test/article?utm_source=newsletter",
+        "HTTPS://A.test/article",
+    ]
+    hashes = {url_hash(v) for v in variants}
+    assert len(hashes) == 1
+
+
+def test_distinct_pages_keep_distinct_hashes():
+    assert url_hash("https://a.test/x?id=1") != url_hash("https://a.test/x?id=2")
+    assert url_hash("https://a.test/x") != url_hash("https://a.test/y")
+
+
+def test_empty_and_none_safe():
+    assert canonicalize_url("") == ""
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_catalog.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_catalog.py
new file mode 100644
index 00000000..e50775ee
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_catalog.py
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive import manifest as manifest_io
+from forecasting_tools.agents_and_tools.source_archive.catalog import (
+    build_catalog,
+    write_catalog,
+)
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
+from forecasting_tools.agents_and_tools.source_archive.models import (
+    CaptureResult,
+    CitationRecord,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
+
+
+def _capture(url: str, html: str) -> CaptureResult:
+    return CaptureResult(
+        url=url,
+        final_url=url,
+        status_code=200,
+        html=html,
+        markdown="md " * 30,
+        screenshot=b"img",
+        screenshot_content_type="image/png",
+        fetcher="fake",
+    )
+
+
+def _seed(tmp_path):
+    store = LocalBlobStore(tmp_path)
+    config = ArchiveConfig(s3_prefix="t")
+    cstore = ContentStore(store, config)
+    cstore.store(_capture("https://a.test/p", "<p>a</p>"))
+    cstore.store(_capture("https://b.test/q", "<p>b</p>"))
+    # uncaptured.test/x is cited but never captured.
+    records = [
+        CitationRecord(
+            url="https://a.test/p?utm_source=news",  # canonicalizes to /p
+            run_id="r1",
+            bot="alpha",
+            question_id="100",
+            question_url="https://www.metaculus.com/questions/100/",
+            tool_name="web_search",
+        ),
+        CitationRecord(
+            url="https://b.test/q",
+            run_id="r1",
+            bot="beta",
+            question_id="100",
+            question_url="https://www.metaculus.com/questions/100/",
+            tool_name="page_fetch",
+        ),
+        CitationRecord(
+            url="https://uncaptured.test/x",
+            run_id="r1",
+            bot="alpha",
+            question_id="100",
+        ),
+        # A data/API call made only via run_code -> excluded from the catalog.
+        CitationRecord(
+            url="https://data.test/api?fmt=csv",
+            run_id="r1",
+            bot="beta",
+            question_id="100",
+            tool_name="run_code",
+        ),
+    ]
+    manifest_io.write_blob(store, "r1", records, config)
+    return store, config
+
+
+def test_build_catalog_joins_and_canonicalizes(tmp_path):
+    store, config = _seed(tmp_path)
+    data = build_catalog(store, config)
+
+    # The cited a.test/p?utm_source=news and the captured a.test/p collapse to
+    # one source; the run_code API call is excluded (tool/API call, not a page).
+    urls = {s.canonical_url for s in data.sources}
+    assert urls == {
+        "https://a.test/p",
+        "https://b.test/q",
+        "https://uncaptured.test/x",
+    }
+    assert data.excluded.get("tool_call") == 1
+    assert "https://data.test/api?fmt=csv" not in urls
+    captured = {s.canonical_url for s in data.sources if s.captured}
+    assert captured == {"https://a.test/p", "https://b.test/q"}
+
+    by_q = data.by_question()
+    assert set(by_q) == {"100"}
+    assert len(by_q["100"]) == 3
+    by_bot = data.by_bot()
+    assert set(by_bot) == {"alpha", "beta"}
+
+
+def test_write_catalog_emits_views(tmp_path):
+    store, config = _seed(tmp_path)
+    summary = write_catalog(store, config)
+
+    assert summary.sources == 3
+    assert summary.captured == 2
+    assert summary.questions == 1
+    assert summary.excluded.get("tool_call") == 1
+
+    keys = set(store.list_keys("t/catalog/"))
+    assert "t/catalog/index.html" in keys
+    assert "t/catalog/READ_ME_FIRST.html" in keys
+    assert "t/catalog/by-question/100.html" in keys
+    assert "t/catalog/by-question/100.csv" in keys
+    assert "t/catalog/by-bot/alpha.html" in keys
+    assert "t/catalog/by-domain/a.test.html" in keys
+
+    q_html = store.get("t/catalog/by-question/100.html").decode("utf-8")
+    assert "https://a.test/p" in q_html
+    assert "alpha" in q_html  # bot tag present
+    # Local links are relative into the content store.
+    assert "../../content/" in q_html
+
+    q_csv = store.get("t/catalog/by-question/100.csv").decode("utf-8")
+    assert "https://uncaptured.test/x" in q_csv
+    assert "no" in q_csv  # uncaptured row marked
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py
deleted file mode 100644
index 81874d80..00000000
--- a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py
+++ /dev/null
@@ -1,91 +0,0 @@
-from __future__ import annotations
-
-from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_comments import (
-    MetaculusCommentHarvester,
-)
-
-
-def _leaderboard():
-    return {
-        "leaderboard_entries": [
-            {"user": {"id": 1, "username": "botA", "is_bot": True}},
-            {"user": {"id": 2, "username": "human", "is_bot": False}},
-            {"user": {"id": 3, "username": "botB", "is_bot": True}},
-        ]
-    }
-
-
-def test_enumerate_bots_filters_non_bots():
-    def fetch(path, params):
-        assert path == "/leaderboards/project/123/"
-        assert params["with_entries"] == "true"
-        return _leaderboard()
-
-    h = MetaculusCommentHarvester(fetch_json=fetch)
-    bots = h.enumerate_bots(123)
-    assert [b["id"] for b in bots] == [1, 3]
-
-
-def test_harvest_author_builds_records_with_provenance():
-    def fetch(path, params):
-        assert path == "/comments/"
-        if params["offset"] == 0:
-            return {
-                "results": [{"id": 10, "on_post": 555, "text": "src https://a.test/x"}]
-            }
-        return {"results": []}
-
-    h = MetaculusCommentHarvester(fetch_json=fetch)
-    records = h.harvest_author(1, run_id="r1", bot="botA")
-    assert len(records) == 1
-    rec = records[0]
-    assert rec.url == "https://a.test/x"
-    assert rec.bot == "botA"
-    assert rec.run_id == "r1"
-    assert rec.question_id == "555"
-    assert rec.question_url == "https://www.metaculus.com/questions/555/"
-    assert rec.trace == "comment:10"
-    assert rec.origin == "metaculus_comment"
-
-
-def test_iter_comments_paginates_until_short_page():
-    calls = []
-
-    def fetch(path, params):
-        calls.append(params["offset"])
-        if params["offset"] == 0:
-            return {"results": [{"id": i, "text": ""} for i in range(100)]}
-        return {"results": [{"id": 999, "text": ""}]}  # short page -> stop
-
-    h = MetaculusCommentHarvester(fetch_json=fetch)
-    comments = list(h.iter_comments(1))
-    assert len(comments) == 101
-    assert calls == [0, 100]
-
-
-def test_harvest_project_aggregates_bots():
-    def fetch(path, params):
-        if path.startswith("/leaderboards/project/"):
-            return _leaderboard()
-        # one URL per bot, single page each
-        if params["offset"] == 0:
-            author = params["author"]
-            return {
-                "results": [
-                    {"id": author, "on_post": 1, "text": f"https://bot{author}.test"}
-                ]
-            }
-        return {"results": []}
-
-    h = MetaculusCommentHarvester(fetch_json=fetch)
-    records = h.harvest_project(123)
-    assert {r.url for r in records} == {"https://bot1.test", "https://bot3.test"}
-    assert {r.bot for r in records} == {"botA", "botB"}
-    assert all(r.run_id == "metaculus-comments-123" for r in records)
-
-
-def test_custom_base_url_drives_web_base():
-    h = MetaculusCommentHarvester(
-        base_url="https://example.org/api", fetch_json=lambda p, q: {"results": []}
-    )
-    assert h.web_base == "https://example.org"
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py
index c6f83ef3..a1c1d6c0 100644
--- a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py
@@ -15,10 +15,10 @@ def _store(tmp_path, **cfg) -> ContentStore:
     return ContentStore(LocalBlobStore(tmp_path), ArchiveConfig(s3_prefix="t", **cfg))
 
 
-def _result(url: str, html: str) -> CaptureResult:
+def _result(url: str, html: str, final_url: str | None = None) -> CaptureResult:
     return CaptureResult(
         url=url,
-        final_url=url,
+        final_url=final_url if final_url is not None else url,
         status_code=200,
         html=html,
         markdown="md " * 50,
@@ -73,3 +73,89 @@ def test_changed_content_creates_new_capture(tmp_path):
     second = store.store(_result("https://a.test", "<p>v2 changed</p>"))
     assert second.created is True
     assert first.capture.content_hash != second.capture.content_hash
+
+
+# --- Phase B: redirect aliasing -------------------------------------------
+def test_redirect_keys_capture_by_final_url(tmp_path):
+    store = _store(tmp_path)
+    res = store.store(
+        _result("https://bit.ly/x", "<p>dest</p>", final_url="https://dest.test/page")
+    )
+    # Capture is stored under the FINAL url's hash, not the shortener's.
+    assert res.capture.url == "https://dest.test/page"
+    assert res.capture.url_hash == url_hash("https://dest.test/page")
+    # The canonical index records the cited shortener as an alias.
+    canonical = store._read_index(url_hash("https://dest.test/page"))
+    assert "https://bit.ly/x" in canonical["aliases"]
+
+
+def test_lookup_via_shortener_and_final_both_hit(tmp_path):
+    store = _store(tmp_path)
+    store.store(
+        _result("https://bit.ly/x", "<p>dest</p>", final_url="https://dest.test/page")
+    )
+    via_alias = store.lookup("https://bit.ly/x")
+    via_final = store.lookup("https://dest.test/page")
+    assert via_alias is not None and via_final is not None
+    assert via_alias.content_hash == via_final.content_hash
+    assert via_alias.url == "https://dest.test/page"
+
+
+def test_two_shorteners_to_same_page_store_once(tmp_path):
+    store = _store(tmp_path)
+    first = store.store(
+        _result("https://bit.ly/x", "<p>same</p>", final_url="https://dest.test/page")
+    )
+    second = store.store(
+        _result("https://t.co/y", "<p>same</p>", final_url="https://dest.test/page")
+    )
+    assert first.created is True
+    assert second.created is False  # identical content deduped, not re-stored
+    canonical = store._read_index(url_hash("https://dest.test/page"))
+    assert set(canonical["aliases"]) == {"https://bit.ly/x", "https://t.co/y"}
+    assert len(canonical["captures"]) == 1
+
+
+# --- Phase C: cross-URL content dedup -------------------------------------
+def test_identical_content_across_distinct_urls_reuses_blobs(tmp_path):
+    store = _store(tmp_path)
+    a = store.store(_result("https://a.test/x", "<p>same</p>"))
+    b = store.store(_result("https://b.test/y", "<p>same</p>"))
+
+    # Both are real captures (each URL has its own index entry)...
+    assert a.created is True and b.created is True
+    # ...but B reuses A's blobs instead of writing its own.
+    assert a.capture.content_alias_of is None
+    assert b.capture.content_alias_of == url_hash("https://a.test/x")
+    assert b.capture.html_key == a.capture.html_key
+
+    # No duplicate blob was written under B's url hash.
+    b_own_key = (
+        f"t/content/{url_hash('https://b.test/y')}/{b.capture.content_hash}.html"
+    )
+    assert not store.blobs.exists(b_own_key)
+    assert store.blobs.exists(a.capture.html_key)
+
+
+def test_content_reverse_index_tracks_members(tmp_path):
+    store = _store(tmp_path)
+    store.store(_result("https://a.test/x", "<p>same</p>"))
+    store.store(_result("https://b.test/y", "<p>same</p>"))
+
+    ch = store.store(_result("https://c.test/z", "<p>same</p>")).capture.content_hash
+    reverse = store._read_content_index(ch)
+    assert reverse["canonical_url_hash"] == url_hash("https://a.test/x")
+    member_hashes = {m["url_hash"] for m in reverse["members"]}
+    assert member_hashes == {
+        url_hash("https://a.test/x"),
+        url_hash("https://b.test/y"),
+        url_hash("https://c.test/z"),
+    }
+
+
+def test_different_content_not_aliased(tmp_path):
+    store = _store(tmp_path)
+    a = store.store(_result("https://a.test/x", "<p>one</p>"))
+    b = store.store(_result("https://b.test/y", "<p>two different</p>"))
+    assert b.capture.content_alias_of is None
+    assert b.capture.html_key != a.capture.html_key
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_coverage.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_coverage.py
new file mode 100644
index 00000000..155d3772
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_coverage.py
@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.catalog import Citation, Source
+from forecasting_tools.agents_and_tools.source_archive.coverage import (
+    coverage_from_sources,
+)
+
+
+def _src(url, domain, captured, cits):
+    return Source(canonical_url=url, domain=domain, captured=captured, citations=cits)
+
+
+def _trace(bot, q, tool):
+    return Citation(bot=bot, question_id=q, tool_name=tool, origin="tool_result")
+
+
+def _comment(bot, q):
+    return Citation(bot=bot, question_id=q, origin="metaculus_comment")
+
+
+SOURCES = [
+    _src(
+        "https://a.test/1",
+        "a.test",
+        True,
+        [_trace("template", "100", "scrape_webpage")],
+    ),
+    _src(
+        "https://b.test/2",
+        "b.test",
+        False,
+        [_trace("template", "100", "scrape_webpage")],
+    ),
+    _src("https://c.test/3", "c.test", True, [_comment("otherbot", "200")]),
+    # run_code-only -> excluded as a tool/API call
+    _src(
+        "https://data.test/x",
+        "data.test",
+        False,
+        [_trace("template", "100", "run_code")],
+    ),
+    # search-engine result page -> excluded as a non-source
+    _src(
+        "https://www.google.com/search?q=x",
+        "google.com",
+        False,
+        [_trace("template", "100", "scrape_webpage")],
+    ),
+    # malformed (extractor junk) -> excluded
+    _src(
+        "https://a.test/y%5B1%5D",
+        "a.test",
+        False,
+        [_trace("template", "100", "scrape_webpage")],
+    ),
+]
+
+
+def test_trace_report_excludes_non_sources_and_counts_pages():
+    r = coverage_from_sources(SOURCES, "trace")
+    assert r.cited == 2  # a.test/1 + b.test/2 (data/search/malformed excluded)
+    assert r.captured == 1
+    assert r.pct == 50.0
+    assert r.excluded == {"tool_call": 1, "search": 1, "malformed": 1}
+    assert r.missing == 1
+    assert r.missing_urls == ["https://b.test/2"]
+
+    by_q = {row.label: (row.cited, row.captured) for row in r.by_question}
+    assert by_q == {"100": (2, 1)}
+    by_tool = {row.label: (row.cited, row.captured) for row in r.by_tool}
+    assert by_tool == {"scrape_webpage": (2, 1)}
+    missed = {row.label for row in r.missed_by_domain}
+    assert missed == {"b.test"}
+
+
+def test_comment_report_is_separate():
+    r = coverage_from_sources(SOURCES, "comments")
+    assert r.cited == 1  # only the metaculus_comment source
+    assert r.captured == 1
+    assert r.pct == 100.0
+    assert {row.label for row in r.by_bot} == {"otherbot"}
+
+
+def test_modes_do_not_bleed():
+    trace = coverage_from_sources(SOURCES, "trace")
+    comments = coverage_from_sources(SOURCES, "comments")
+    assert "https://c.test/3" not in trace.missing_urls  # comment source not in trace
+    # the trace bot never appears in the comment report
+    assert "template" not in {row.label for row in comments.by_bot}
+
+
+def test_csv_export_has_overall_row():
+    csv_text = coverage_from_sources(SOURCES, "trace").to_csv()
+    assert "group,label,cited,captured,pct" in csv_text
+    assert "overall,trace,2,1,50.0" in csv_text
+
+
+def test_outcomes_split_never_fetched_vs_failed():
+    # b.test/2 is the only missing page source. With no outcome for it, it's a
+    # pure collection gap (never fetched).
+    r = coverage_from_sources(SOURCES, "trace", {"https://a.test/1": "stored"})
+    assert r.has_outcomes is True
+    assert r.missing_never_fetched == 1
+    assert r.missing_fetch_failed == 0
+
+    # If a run report shows b.test/2 was fetched and failed, it's a capture
+    # problem, not a collection gap.
+    r2 = coverage_from_sources(SOURCES, "trace", {"https://b.test/2": "error"})
+    assert r2.missing_never_fetched == 0
+    assert r2.missing_fetch_failed == 1
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_metaculus_db.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_metaculus_db.py
new file mode 100644
index 00000000..54cab175
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_metaculus_db.py
@@ -0,0 +1,124 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_db import (
+    LOCAL_DEFAULT_DSN,
+    MetaculusDbHarvester,
+    resolve_dsn,
+)
+
+
+def test_harvest_post_builds_records_with_provenance():
+    rows = [
+        {
+            "comment_id": 1,
+            "on_post_id": 42,
+            "text": "see https://a.test/x and https://b.test/y",
+            "username": "alpha",
+            "author_id": 7,
+        },
+        {
+            "comment_id": 2,
+            "on_post_id": 42,
+            "text": "https://a.test/x again",
+            "username": "beta",
+            "author_id": 8,
+        },
+    ]
+    seen = {}
+
+    def query(sql, params):
+        seen["sql"], seen["params"] = sql, params
+        return rows
+
+    records = MetaculusDbHarvester(query).harvest_post(42)
+
+    assert seen["params"] == (42,)
+    assert {r.url for r in records} == {"https://a.test/x", "https://b.test/y"}
+    r0 = next(r for r in records if r.url == "https://a.test/x")
+    assert r0.origin == "metaculus_comment"
+    assert r0.question_id == "42"
+    assert r0.question_url == "https://www.metaculus.com/questions/42/"
+    assert r0.bot in ("alpha", "beta")
+    # one record per (URL, comment): a.test/x is cited in both comments
+    assert sum(r.url == "https://a.test/x" for r in records) == 2
+
+
+def test_harvest_recent_passes_days_and_limit():
+    seen = {}
+
+    def query(sql, params):
+        seen["sql"], seen["params"] = sql, params
+        return []
+
+    MetaculusDbHarvester(query).harvest_recent(days=3, limit=50)
+    assert seen["params"] == (3, 50)
+    assert "limit %s" in seen["sql"]
+
+
+def test_harvest_recent_uncapped_by_default():
+    seen = {}
+
+    def query(sql, params):
+        seen["sql"], seen["params"] = sql, params
+        return []
+
+    # A daily sweep wants every row from the latest day, not a 1000-row cap.
+    MetaculusDbHarvester(query).harvest_recent(days=1)
+    assert seen["params"] == (1,)
+    assert "limit" not in seen["sql"].lower()
+
+
+def test_includes_private_bot_comments_by_default():
+    seen = {}
+
+    def query(sql, params):
+        seen["sql"] = sql
+        return []
+
+    # The day-behind replica's value is the now-private bot reasoning, so the
+    # default read must NOT filter private rows out.
+    MetaculusDbHarvester(query).harvest_recent(days=1)
+    assert "is_private" not in seen["sql"]
+    assert "u.is_bot" in seen["sql"]
+
+
+def test_public_only_filters_private_comments():
+    seen = {}
+
+    def query(sql, params):
+        seen["sql"] = sql
+        return []
+
+    MetaculusDbHarvester(query).harvest_post(42, include_private=False)
+    assert "not c.is_private" in seen["sql"]
+
+
+def test_resolve_dsn_prefers_explicit_then_env_then_keychain():
+    # explicit flag wins over everything
+    assert (
+        resolve_dsn(
+            "postgresql://flag",
+            env={"METACULUS_DB_DSN": "postgresql://env"},
+            keychain_reader=lambda: "postgresql://kc",
+        )
+        == "postgresql://flag"
+    )
+    # then the env var
+    assert (
+        resolve_dsn(
+            None,
+            env={"METACULUS_DB_DSN": "postgresql://env"},
+            keychain_reader=lambda: "postgresql://kc",
+        )
+        == "postgresql://env"
+    )
+    # then the keychain (the private path)
+    assert (
+        resolve_dsn(None, env={}, keychain_reader=lambda: "postgresql://kc")
+        == "postgresql://kc"
+    )
+
+
+def test_resolve_dsn_falls_back_to_local_default():
+    # nothing configured and no keychain item -> local dev DB, not a crash
+    assert resolve_dsn(None, env={}, keychain_reader=lambda: None) == LOCAL_DEFAULT_DSN
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py
index 033d1689..fa87838d 100644
--- a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py
@@ -4,7 +4,10 @@
 from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
 from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
 from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
-from forecasting_tools.agents_and_tools.source_archive.pipeline import CapturePipeline
+from forecasting_tools.agents_and_tools.source_archive.pipeline import (
+    CapturePipeline,
+    capture_urls_concurrent,
+)
 from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
 
 
@@ -15,6 +18,114 @@ def _pipeline(tmp_path, fetcher) -> CapturePipeline:
     return CapturePipeline(fetcher, store)
 
 
+def test_capture_urls_concurrent_captures_all(tmp_path, make_fetcher):
+    from contextlib import contextmanager
+
+    config = ArchiveConfig(s3_prefix="t", concurrency=4)
+    store = ContentStore(LocalBlobStore(tmp_path), config)
+    urls = [f"https://s{i}.test/p" for i in range(12)]
+
+    @contextmanager
+    def factory(_cfg):
+        f = make_fetcher()
+        for u in urls:
+            f.add(u)
+        yield f
+
+    summary = capture_urls_concurrent(urls, store, config, factory)
+
+    assert len(summary.outcomes) == 12
+    assert summary.count("stored") == 12
+    # every URL is resolvable afterwards (proves the shared store got all writes)
+    assert all(store.lookup(u) is not None for u in urls)
+
+
+def test_concurrent_supervisor_recovers_a_stuck_worker(tmp_path, make_fetcher):
+    import threading
+    from contextlib import contextmanager
+
+    config = ArchiveConfig(s3_prefix="t", concurrency=1)
+    store = ContentStore(LocalBlobStore(tmp_path), config)
+    urls = ["https://stuck.test/x"]
+    reaped = threading.Event()
+    builds = {"n": 0}
+
+    class _Wedges:
+        name = "wedge"
+
+        def fetch(self, url):
+            # Block until the supervisor's reaper "kills the browser", then surface
+            # the dead-browser error a killed Chromium would raise.
+            reaped.wait(5)
+            raise RuntimeError("Target page, context or browser has been closed")
+
+    @contextmanager
+    def factory(_cfg):
+        builds["n"] += 1
+        if builds["n"] == 1:
+            yield _Wedges()  # first browser wedges
+        else:
+            fetcher = make_fetcher()
+            fetcher.add(urls[0])
+            yield fetcher  # rebuilt browser works
+
+    # Inject a fake reaper so the test drives the supervisor without real Chromium.
+    summary = capture_urls_concurrent(
+        urls, store, config, factory, per_url_timeout=0.3, reaper=reaped.set
+    )
+
+    assert builds["n"] == 2  # stalled -> reaped -> death -> rebuild -> retry
+    assert summary.count("stored") == 1  # recovered and captured on a fresh browser
+
+
+def test_concurrent_restarts_browser_after_death(tmp_path, make_fetcher):
+    from contextlib import contextmanager
+
+    config = ArchiveConfig(s3_prefix="t", concurrency=1)
+    store = ContentStore(LocalBlobStore(tmp_path), config)
+    urls = ["https://a.test/x"]
+    builds = {"n": 0}
+
+    class _DeadBrowser:
+        name = "dead"
+
+        def fetch(self, url):
+            raise RuntimeError("Target page, context or browser has been closed")
+
+    @contextmanager
+    def factory(_cfg):
+        builds["n"] += 1
+        if builds["n"] == 1:
+            yield _DeadBrowser()  # first browser is dead
+        else:
+            fetcher = make_fetcher()
+            fetcher.add(urls[0])
+            yield fetcher  # rebuilt browser works
+
+    summary = capture_urls_concurrent(urls, store, config, factory)
+
+    assert builds["n"] == 2  # detected death, rebuilt once
+    assert summary.count("stored") == 1  # retry on the fresh browser succeeded
+
+
+class _BoomFetcher:
+    """Raises an unexpected (non-FetchError) exception, like a bad screenshot."""
+
+    name = "boom"
+
+    def fetch(self, url):
+        raise ValueError("kaboom")
+
+
+def test_pipeline_isolates_unexpected_fetcher_errors(tmp_path):
+    # One pathological URL must not abort the whole run.
+    pipe = _pipeline(tmp_path, _BoomFetcher())
+    summary = pipe.run(["https://a.test", "https://b.test"])
+    assert summary.count("error") == 2
+    assert len(summary.outcomes) == 2
+    assert all(o.reason.startswith("unexpected:") for o in summary.outcomes)
+
+
 def test_manifest_roundtrip_and_unique_urls():
     records = [
         CitationRecord(url="https://a.test", run_id="r1", bot="b", tool_name="search"),
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reindex.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reindex.py
new file mode 100644
index 00000000..82e5f5b7
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reindex.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import json
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.reindex import (
+    analyze,
+    rebuild_content_index,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
+
+
+def _put_index(store, key: str, body: dict) -> None:
+    store.put(f"t/index/{key}.json", json.dumps(body).encode("utf-8"))
+
+
+def _canonical(url: str, content_hash: str) -> dict:
+    return {
+        "url": url,
+        "url_hash": f"hash_of_{url}",
+        "latest_content_hash": content_hash,
+        "captures": {
+            content_hash: {
+                "url": url,
+                "url_hash": f"hash_of_{url}",
+                "content_hash": content_hash,
+                "html_key": f"t/content/hash_of_{url}/{content_hash}.html",
+            }
+        },
+    }
+
+
+def _seed(tmp_path) -> tuple[LocalBlobStore, ArchiveConfig]:
+    store = LocalBlobStore(tmp_path)
+    config = ArchiveConfig(s3_prefix="t")
+    # Legacy rows stored under raw hashing: two URLs that now canonicalize equal.
+    _put_index(store, "h1", _canonical("https://x.test/p?utm_source=news", "c1"))
+    _put_index(store, "h2", _canonical("https://x.test/p", "c2"))
+    # Two distinct URLs with byte-identical content (same latest hash).
+    _put_index(store, "h3", _canonical("https://a.test/1", "cX"))
+    _put_index(store, "h4", _canonical("https://b.test/2", "cX"))
+    # Same host+path, meaningful query differs -> Phase D candidate.
+    _put_index(store, "h5", _canonical("https://q.test/item?id=1", "n1"))
+    _put_index(store, "h6", _canonical("https://q.test/item?id=2", "n2"))
+    # An alias (redirect) index -> counted but not a capture.
+    _put_index(store, "h7", {"url": "https://bit.ly/z", "alias_of": "hash_of_x"})
+    return store, config
+
+
+def test_analyze_reports_all_three_lenses(tmp_path):
+    store, config = _seed(tmp_path)
+    report = analyze(store, config)
+
+    assert report.total_url_indexes == 7
+    assert report.alias_indexes == 1
+    assert report.canonical_captures == 6
+
+    canon_keys = {c.key for c in report.canonicalization_clusters}
+    assert "https://x.test/p" in canon_keys
+
+    content_urls = {tuple(sorted(c.urls)) for c in report.content_clusters}
+    assert ("https://a.test/1", "https://b.test/2") in content_urls
+
+    near_keys = {c.key for c in report.near_dup_clusters}
+    assert "https://q.test/item" in near_keys
+
+
+def test_analyze_ignores_reverse_content_index(tmp_path):
+    store, config = _seed(tmp_path)
+    # A by-content reverse index must not be mistaken for a URL index.
+    store.put(
+        "t/index/by-content/cX.json",
+        json.dumps({"content_hash": "cX", "canonical_url_hash": "x"}).encode("utf-8"),
+    )
+    report = analyze(store, config)
+    assert report.total_url_indexes == 7  # unchanged
+
+
+def test_rebuild_content_index_is_dry_by_default(tmp_path):
+    store, config = _seed(tmp_path)
+    groups = rebuild_content_index(store, config, apply=False)
+    assert groups >= 1
+    # Dry run wrote nothing under by-content/.
+    assert not list(store.list_keys("t/index/by-content/"))
+
+    rebuild_content_index(store, config, apply=True)
+    assert list(store.list_keys("t/index/by-content/"))
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reports.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reports.py
new file mode 100644
index 00000000..a3248fa8
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reports.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.pipeline import (
+    CaptureOutcome,
+    PipelineSummary,
+)
+from forecasting_tools.agents_and_tools.source_archive.reports import (
+    read_outcomes,
+    write_run_report,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
+
+
+def test_run_report_roundtrip_canonicalizes(tmp_path):
+    store = LocalBlobStore(tmp_path)
+    config = ArchiveConfig(s3_prefix="t")
+    summary = PipelineSummary(
+        outcomes=[
+            CaptureOutcome(url="https://a.test/p?utm_source=x", status="stored"),
+            CaptureOutcome(url="https://b.test/q", status="error", reason="cloudflare"),
+        ]
+    )
+    write_run_report(store, "r1", summary, config)
+
+    out = read_outcomes(store, config)
+    # keys are canonicalized (tracking param stripped)
+    assert out["https://a.test/p"] == "stored"
+    assert out["https://b.test/q"] == "error"
+
+
+def test_captured_status_wins_over_failure(tmp_path):
+    store = LocalBlobStore(tmp_path)
+    config = ArchiveConfig(s3_prefix="t")
+    write_run_report(
+        store,
+        "early",
+        PipelineSummary(
+            outcomes=[CaptureOutcome(url="https://a.test", status="error")]
+        ),
+        config,
+    )
+    write_run_report(
+        store,
+        "later",
+        PipelineSummary(
+            outcomes=[CaptureOutcome(url="https://a.test", status="stored")]
+        ),
+        config,
+    )
+    assert read_outcomes(store, config)["https://a.test"] == "stored"
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_screenshot_encoding.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_screenshot_encoding.py
new file mode 100644
index 00000000..d357982b
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_screenshot_encoding.py
@@ -0,0 +1,67 @@
+"""Tests for screenshot encoding + the height cap.
+
+Regression guard for a silent truncation bug: the height cap used to be applied
+via Playwright's ``clip`` *without* ``full_page``, which is bounded by the
+viewport and chopped tall pages down to a single screen. The cap is now enforced
+by cropping the full-page render in Pillow — these tests pin that behavior.
+"""
+
+from __future__ import annotations
+
+import io
+
+import pytest
+
+from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import (
+    _encode_screenshot,
+)
+
+Image = pytest.importorskip("PIL.Image")
+
+
+def _png(width: int, height: int) -> bytes:
+    out = io.BytesIO()
+    Image.new("RGB", (width, height), (255, 0, 0)).save(out, format="PNG")
+    return out.getvalue()
+
+
+def test_tall_page_cropped_to_max_height():
+    data, ct = _encode_screenshot(_png(1280, 12000), "webp", max_height=4000)
+    assert ct == "image/webp"
+    img = Image.open(io.BytesIO(data))
+    assert img.size == (1280, 4000)  # cropped to the cap, full width preserved
+
+
+def test_short_page_not_cropped():
+    data, _ = _encode_screenshot(_png(1280, 3000), "webp", max_height=20000)
+    assert Image.open(io.BytesIO(data)).size == (1280, 3000)  # untouched
+
+
+def test_webp_clamped_to_format_limit_even_without_cap():
+    # WebP cannot exceed 16383px; an over-tall page must crop, not crash.
+    data, _ = _encode_screenshot(_png(1280, 25000), "webp", max_height=0)
+    assert Image.open(io.BytesIO(data)).size == (1280, 16383)
+
+
+def test_webp_cap_above_format_limit_is_clamped():
+    # A configured cap above WebP's limit still degrades safely to 16383.
+    data, _ = _encode_screenshot(_png(1280, 18000), "webp", max_height=17000)
+    assert Image.open(io.BytesIO(data)).height == 16383
+
+
+def test_png_keeps_full_height_uncapped():
+    # PNG has no such limit, so max_height=0 preserves the whole render.
+    data, _ = _encode_screenshot(_png(1280, 20000), "png", max_height=0)
+    assert Image.open(io.BytesIO(data)).size == (1280, 20000)
+
+
+def test_webp_is_real_webp():
+    data, ct = _encode_screenshot(_png(800, 600), "webp")
+    assert ct == "image/webp"
+    assert data[:4] == b"RIFF" and data[8:12] == b"WEBP"
+
+
+def test_jpeg_format():
+    data, ct = _encode_screenshot(_png(800, 600), "jpeg")
+    assert ct == "image/jpeg"
+    assert Image.open(io.BytesIO(data)).format == "JPEG"
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_trace_extraction.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_trace_extraction.py
new file mode 100644
index 00000000..f3555523
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_trace_extraction.py
@@ -0,0 +1,215 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.trace_extraction import (
+    extract_records_from_events,
+    extract_records_from_question_dir,
+    extract_records_from_trace_file,
+    harvest_run,
+    trace_label,
+)
+
+
+def test_trace_label_strips_prefix_and_suffix():
+    assert trace_label("/x/traces_forecast_1_attempt_1.jsonl") == "forecast_1_attempt_1"
+    assert trace_label("traces_summarize.jsonl") == "summarize"
+
+
+def test_tool_call_carries_query_and_tool_args():
+    events = [
+        {
+            "type": "tool_call",
+            "call_id": "c1",
+            "name": "search_online",
+            "args": {"query": "uk election polls", "max_results": 5},
+        }
+    ]
+    records = extract_records_from_events(events, trace="forecast_1")
+    # No URL in the args -> no record from the call; query/args ride on its results.
+    assert records == []
+
+
+def test_tool_result_attributed_to_originating_call():
+    events = [
+        {
+            "type": "tool_call",
+            "call_id": "c1",
+            "name": "search_online",
+            "args": {"query": "uk election polls"},
+        },
+        {
+            "type": "tool_result",
+            "call_id": "c1",
+            "content": "Top hit: [poll](https://a.test/poll) and https://b.test/x",
+            "timestamp": "2026-05-12T12:00:00+00:00",
+        },
+    ]
+    records = extract_records_from_events(events, trace="forecast_1", bot="template")
+    assert [r.url for r in records] == ["https://a.test/poll", "https://b.test/x"]
+    rec = records[0]
+    assert rec.origin == "tool_result"
+    assert rec.tool_name == "search_online"
+    assert rec.query == "uk election polls"
+    assert rec.tool_args == {"query": "uk election polls"}
+    assert rec.trace == "forecast_1"
+    assert rec.bot == "template"
+    assert rec.first_seen == "2026-05-12T12:00:00+00:00"
+
+
+def test_query_from_list_args():
+    events = [
+        {
+            "type": "tool_call",
+            "call_id": "c1",
+            "name": "s",
+            "args": {"queries": ["a", "b"]},
+        },
+        {"type": "tool_result", "call_id": "c1", "content": "https://a.test/x"},
+    ]
+    records = extract_records_from_events(events, trace="t")
+    assert records[0].query == "a b"
+
+
+def test_url_directly_in_tool_call_args():
+    events = [
+        {
+            "type": "tool_call",
+            "call_id": "c1",
+            "name": "fetch_page",
+            "args": {"url": "https://a.test/page"},
+        }
+    ]
+    records = extract_records_from_events(events, trace="t")
+    assert len(records) == 1
+    assert records[0].url == "https://a.test/page"
+    assert records[0].origin == "tool_call"
+    assert records[0].tool_name == "fetch_page"
+    assert records[0].tool_args == {"url": "https://a.test/page"}
+
+
+def test_initial_prompt_only_scanned_when_enabled():
+    events = [
+        {"type": "initial_prompt", "prompt": "background: https://a.test/bg"},
+    ]
+    assert extract_records_from_events(events, trace="forecast_1") == []
+    records = extract_records_from_events(
+        events, trace="summarize", include_initial_prompt=True
+    )
+    assert [r.url for r in records] == ["https://a.test/bg"]
+    assert records[0].origin == "initial_prompt"
+    assert records[0].tool_name == ""
+
+
+def test_non_dict_events_skipped():
+    events = ["garbage", None, {"type": "tool_result", "content": "https://a.test/x"}]
+    records = extract_records_from_events(events, trace="t")
+    assert [r.url for r in records] == ["https://a.test/x"]
+
+
+def _write_jsonl(path: Path, events: list[dict]) -> None:
+    path.write_text("\n".join(json.dumps(e) for e in events), encoding="utf-8")
+
+
+def test_trace_file_uses_summarize_rule(tmp_path: Path):
+    f = tmp_path / "traces_summarize.jsonl"
+    _write_jsonl(f, [{"type": "initial_prompt", "prompt": "see https://a.test/r"}])
+    records = extract_records_from_trace_file(str(f), run_id="run1", bot="template")
+    assert [r.url for r in records] == ["https://a.test/r"]
+    assert records[0].trace == "summarize"
+    assert records[0].run_id == "run1"
+
+
+def test_trace_file_skips_blank_and_bad_lines(tmp_path: Path):
+    f = tmp_path / "traces_forecast_1.jsonl"
+    f.write_text(
+        '\n{"type": "tool_result", "content": "https://a.test/x"}\nnot json\n',
+        encoding="utf-8",
+    )
+    records = extract_records_from_trace_file(str(f))
+    assert [r.url for r in records] == ["https://a.test/x"]
+
+
+def test_question_dir_reads_metadata_and_builds_url(tmp_path: Path):
+    qdir = tmp_path / "q_123"
+    qdir.mkdir()
+    (qdir / "question.json").write_text(
+        json.dumps({"question_id": "metac_123", "metaculus_id": 123}),
+        encoding="utf-8",
+    )
+    _write_jsonl(
+        qdir / "traces_forecast_1.jsonl",
+        [{"type": "tool_result", "content": "https://a.test/x"}],
+    )
+    records = extract_records_from_question_dir(
+        str(qdir), run_id="run1", bot="template"
+    )
+    assert len(records) == 1
+    rec = records[0]
+    assert rec.question_id == "metac_123"
+    assert rec.metaculus_id == "123"
+    assert rec.question_url == "https://www.metaculus.com/questions/123/"
+
+
+def test_question_dir_without_metadata_still_emits(tmp_path: Path):
+    qdir = tmp_path / "q_x"
+    qdir.mkdir()
+    _write_jsonl(
+        qdir / "traces_forecast_1.jsonl",
+        [{"type": "tool_result", "content": "https://a.test/x"}],
+    )
+    records = extract_records_from_question_dir(str(qdir))
+    assert [r.url for r in records] == ["https://a.test/x"]
+    assert records[0].question_id is None
+    assert records[0].question_url is None
+
+
+def test_harvest_run_walks_bot_and_question_dirs(tmp_path: Path):
+    run = tmp_path / "run_demo"
+    qdir = run / "bot_template" / "q_1"
+    qdir.mkdir(parents=True)
+    (qdir / "question.json").write_text(
+        json.dumps({"metaculus_id": 1}), encoding="utf-8"
+    )
+    _write_jsonl(
+        qdir / "traces_forecast_1.jsonl",
+        [{"type": "tool_result", "content": "https://a.test/x"}],
+    )
+    records = harvest_run(str(run))
+    assert len(records) == 1
+    rec = records[0]
+    assert rec.run_id == "run_demo"
+    assert rec.bot == "template"
+    assert rec.metaculus_id == "1"
+
+
+def test_harvest_run_flat_layout_without_bot_dirs(tmp_path: Path):
+    # Butler-style: <run>/<question>/traces_*.jsonl with no bot_* grouping.
+    run = tmp_path / "s3_backfill"
+    qdir = run / "2026-05-20_metac_43538"
+    qdir.mkdir(parents=True)
+    (qdir / "question.json").write_text(
+        json.dumps({"metaculus_id": 43538}), encoding="utf-8"
+    )
+    _write_jsonl(
+        qdir / "traces_forecast_1.jsonl",
+        [{"type": "tool_result", "content": "https://a.test/x"}],
+    )
+    records = harvest_run(str(run), bot="butler")
+    assert len(records) == 1
+    rec = records[0]
+    assert rec.bot == "butler"  # the flat-layout bot override
+    assert rec.metaculus_id == "43538"  # still read from question.json
+
+
+def test_harvest_run_flat_layout_defaults_bot_to_run_name(tmp_path: Path):
+    run = tmp_path / "myrun"
+    qdir = run / "q_only"
+    qdir.mkdir(parents=True)
+    _write_jsonl(
+        qdir / "traces_x.jsonl",
+        [{"type": "tool_result", "content": "https://a.test/y"}],
+    )
+    records = harvest_run(str(run))  # no bot= -> defaults to run dir name
+    assert [r.bot for r in records] == ["myrun"]
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py
index e018af77..443578bb 100644
--- a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py
@@ -38,6 +38,25 @@ def test_dedupes_preserving_order():
     assert extract_urls(text) == ["https://a.test", "https://b.test"]
 
 
+def test_strips_trailing_backslash_escape_residue():
+    # Markdown often leaves a trailing backslash, e.g. "Zaporizhzhia\"
+    assert extract_urls("see https://a.test/search?q=Zaporizhzhia\\ ok") == [
+        "https://a.test/search?q=Zaporizhzhia"
+    ]
+
+
+def test_cuts_markdown_reference_tail_and_keeps_both_urls():
+    # The bare scan can glue ")[10](other)" onto a real URL; the tail is cut so
+    # the first URL is clean, and the genuinely-separate second URL (itself a
+    # valid markdown link) is still extracted. Order follows pattern precedence
+    # (markdown links before bare URLs), so compare as a set.
+    text = "https://a.test/story?id=123)[10](https://b.test/other)"
+    assert set(extract_urls(text)) == {
+        "https://a.test/story?id=123",
+        "https://b.test/other",
+    }
+
+
 def test_ignores_non_http_and_empty():
     assert extract_urls("ftp://a.test mailto:x@y.test nope") == []
     assert extract_urls(None) == []
diff --git a/forecasting_tools/agents_and_tools/source_archive/README.md b/forecasting_tools/agents_and_tools/source_archive/README.md
index 4eb2d9ef..fbcb34cf 100644
--- a/forecasting_tools/agents_and_tools/source_archive/README.md
+++ b/forecasting_tools/agents_and_tools/source_archive/README.md
@@ -43,6 +43,10 @@ Configuration is read from the environment (see the project `.env.template`):
 | `WEB_ARCHIVE_AWS_PROFILE` | Named AWS profile (e.g. an SSO profile). | default chain |
 | `WEB_ARCHIVE_TTL_DAYS` | Days before a cached capture is refetched. | `14` |
 | `FIRECRAWL_API_KEY` | Enables the Firecrawl fallback. | — (fallback off) |
+| `WEB_ARCHIVE_FIRECRAWL_PROXY` | Firecrawl proxy mode for hardened sites: `basic` (1 credit) / `auto` / `stealth` (5 credits). | `basic` |
+| `HYPERBROWSER_API_KEY` | Enables the Hyperbrowser managed fallback. | — (off) |
+| `WEB_ARCHIVE_CLOAKBROWSER_IMPORT` | Module exposing CloakBrowser's `launch()`. | `cloakbrowser` |
+| `WEB_ARCHIVE_PDF_MAX_PAGES` | Cap on PDF pages parsed per document. | `50` |
 
 AWS credentials use the standard AWS resolution chain — environment variables, a
 shared config file, or an SSO profile. Nothing secret is committed or baked into
@@ -87,13 +91,131 @@ source-archive capture run.jsonl --local ./archive
 # Capture and upload to S3 (uses WEB_ARCHIVE_S3_BUCKET), plus the manifest itself
 source-archive capture run.jsonl --upload-manifest --run-id 2026-06-01_demo
 
+# Skip the Hyperbrowser fallback this run; failures are written to a retry
+# manifest so you can come back to just those sites later (e.g. with it on).
+source-archive capture run.jsonl --no-hyperbrowser --run-id demo
+source-archive capture demo_needs_retry.jsonl --run-id demo   # later, hyperbrowser on
+
 # Build a manifest by harvesting the URLs bots cited on a Metaculus tournament
 source-archive harvest 32506 --out run.jsonl
 ```
 
+Because a failed fetch leaves no cache entry while a success does, re-running the
+same manifest only re-attempts the failures — the retry manifest just makes that
+explicit and fast (it skips the already-captured majority).
+
 `source-archive` is installed by the extra; the equivalent module form is
 `python -m forecasting_tools.agents_and_tools.source_archive.cli`.
 
+## Backup backends & the bake-off
+
+A self-hosted browser is the primary backend and gets ~70% of URLs for ~free,
+but two kinds of URL fall through it: **anti-bot/Cloudflare** pages (it detects
+the block but can't get past it) and **PDFs** (Chromium downloads them instead of
+rendering, so nothing is captured). The package ships these backups, ordered by
+marginal cost so the cheap tiers absorb most of the tail:
+
+| Backend | Cost (2026) | Closes | Notes |
+| --- | --- | --- | --- |
+| `CloakBrowserFetcher` | ~$0/page (self-host) | Cloudflare | **The primary browser tier when installed** (`pip install cloakbrowser`): patched Chromium that beat vanilla Playwright on Cloudflare in 2026 benchmarks. Only one browser runs — cloak *replaces* vanilla Playwright (two `sync_playwright` instances conflict in one process), falling back to vanilla when cloak isn't installed. |
+| `PdfFetcher` | $0 local; ~$0.0008/pg OCR | PDFs | PyMuPDF4LLM locally, falls back to Firecrawl OCR on scanned PDFs. |
+| `FirecrawlFetcher` | $0.0008 basic / $0.0042 stealth | Cloudflare + PDFs | Native PDF parser; `WEB_ARCHIVE_FIRECRAWL_PROXY=stealth` for hardened sites. |
+| `HyperbrowserFetcher` | $0.001 basic / $0.01 proxy | Cloudflare | Consolidates spend onto a vendor already used elsewhere. No PDF support. |
+
+Selenium was evaluated and **rejected**: it drives the same Chromium as
+Playwright, so it bypasses nothing Playwright can't, and its stealth ecosystem
+(`undetected-chromedriver`) is now legacy. CloakBrowser/Patchright/nodriver are
+the credible self-hosted upgrades.
+
+To decide which backup(s) to wire in, run the bake-off — it runs each selected
+backend independently over the same URLs (not tiered) and reports reliability,
+latency, and estimated cost per backend, broken down by category:
+
+```bash
+python -m forecasting_tools.agents_and_tools.source_archive.benchmark \
+    --manifest forecasting_tools/agents_and_tools/source_archive/benchmarks/sample_urls.jsonl \
+    --backends playwright,cloakbrowser,firecrawl,firecrawl-stealth,hyperbrowser,pdf \
+    --out bench.csv
+```
+
+Backends whose API key or dependency is missing are skipped cleanly. Cost
+figures are model estimates (see `Pricing` in `benchmark.py`); tune the credit
+rates with `--firecrawl-credit-usd` / `--hyperbrowser-credit-usd` to match your
+plan. Swap the sample manifest for a JSONL of your own cited URLs (one
+`{"url", "category"}` per line; categories `normal`/`cloudflare`/`pdf`) for a
+representative run.
+
+## Browse what you captured
+
+A Streamlit viewer reads the manifests + index back out of the store and shows
+each captured URL's **screenshot, markdown, and HTML** side by side, filterable
+by bot and question:
+
+```bash
+AWS_PROFILE=default WEB_ARCHIVE_S3_BUCKET=metaculus-web-archive \
+  streamlit run forecasting_tools/agents_and_tools/source_archive/viewer.py
+```
+
+It uses the same `ArchiveConfig.from_env()` settings as capture, so it points at
+whatever bucket/prefix you captured to (no extra configuration).
+
+To browse a **local** capture (no S3/AWS), set `WEB_ARCHIVE_LOCAL_DIR` to the
+directory you captured into with `--local`:
+
+```bash
+WEB_ARCHIVE_LOCAL_DIR=./archive \
+  streamlit run forecasting_tools/agents_and_tools/source_archive/viewer.py
+```
+
+## The catalog: a browsable, coworker-legible view
+
+The viewer is interactive (good for us); the **catalog** is a set of static
+HTML/CSV pages written into the bucket so a non-technical coworker can browse the
+sources without any tooling. It is **question-primary** — the encyclopedia of
+every web source used for a question — plus `by-bot/` and `by-domain/`
+cross-views, built by joining the manifests with the index:
+
+```bash
+# write catalog/ into the bucket (uses WEB_ARCHIVE_S3_BUCKET)
+source-archive catalog
+# or against a local capture dir
+source-archive catalog --local ./archive
+```
+
+Start at `catalog/index.html` (or `catalog/READ_ME_FIRST.html` for the plain
+explainer). Each source shows its screenshot, who used it (bot + tool), and
+whether it was captured; each question also has a CSV. Data/API calls (a bot's
+`run_code` pulling a CSV, etc.) are **excluded** from the catalog — it lists web
+pages a bot read, not data endpoints — though they remain in the raw manifests.
+
+## Coverage: what fraction did we archive?
+
+The catalog shows what we *have*; the **coverage report** shows what we're
+*missing*. It's two separate reports, by ingestion path — different denominators,
+different ground truth:
+
+```bash
+source-archive coverage                 # both reports
+source-archive coverage --mode trace    # just the complex/template bot
+source-archive coverage --csv ./cov     # also write cov_<mode>.csv (+ _missing.txt)
+```
+
+- **trace** — the complex/template bot's instrumented runs (metac-ai-sdk). Traces
+  hold *every* URL the bot touched, so this is a true archival success-rate.
+- **comments** — every bot (Metaculus's own + outsiders) harvested from public
+  comments. Comments are truncated, so this denominator under-counts — coverage
+  here means "of the links visible in comments, how many we archived."
+
+The report is oriented to one question: **are there sources bots are using that
+we are not yet archiving?** It leads with that gap, then breaks it down by
+question, bot, tool, and the biggest-gap sites, plus the list of sources to
+collect. Non-source URLs — search-engine results, `run_code`-style tool/API
+calls, and malformed extractor junk — are excluded (same as the catalog).
+
+If capture runs have persisted their outcomes (`reports/<run_id>.json`, written
+automatically by `capture`), the gap is split into **never fetched** (the real
+collection gap) vs **fetched but failed** (a capture problem).
+
 ## The manifest: what to feed it
 
 A run produces a **citation manifest** — a JSONL file with one record per cited
@@ -107,33 +229,49 @@ The pipeline dedupes URLs within the manifest before fetching.
 
 ## Where the manifest comes from
 
-You can write a manifest yourself, or generate one from a bot's published
-reasoning. Both first-party and third-party bots post their reasoning — with the
-source links they used — as comments on Metaculus, so the public, no-auth
-Metaculus API is the one ingestion path that works across *every* bot:
+You can write a manifest yourself, or generate one from a forecasting bot's
+reasoning — the source links a bot used are recorded in the comment it posts and,
+more completely, in its run traces.
 
-```python
-from forecasting_tools.agents_and_tools.source_archive.ingest import (
-    MetaculusCommentHarvester,
-)
-from forecasting_tools.agents_and_tools.source_archive import manifest
+**From the database (operator path).** `harvest-db` reads the URLs a bot cited
+straight from the platform's Postgres database and emits a manifest. Point it at
+a database (a `postgresql://…` URL works — e.g. a Neon connection string):
 
-harvester = MetaculusCommentHarvester()        # uses METACULUS_API_BASE_URL
-records = harvester.harvest_project(32506)     # a tournament / project id
-manifest.write_file("run.jsonl", records)      # -> feed to `capture`
+```bash
+# one post, or the latest day of activity
+source-archive harvest-db --post 29495 --dedupe --out run.jsonl
+source-archive harvest-db --days 1 --dedupe --upload --run-id "$(date -u +%F)"
 ```
 
-Or in one line from the CLI: `source-archive harvest 32506 --out run.jsonl`.
+It reads `comments_comment ⋈ users_user (is_bot)` and emits the same manifest.
+`--days` is uncapped by default; `--limit N` caps the row count for spot checks.
+`--public-only` restricts to public comments (all comments are read by default).
 
-The lower-level `extract_urls(text)` / `extract_citation_records(...)` helpers in
-`ingest.url_extraction` pull URLs out of any markdown/text (markdown links,
-autolinks, and bare URLs), if you are ingesting from somewhere other than
-comments.
+**DSN resolution (keep the credential off disk).** The DSN is resolved in this
+order: `--dsn` flag → `$METACULUS_DB_DSN` → macOS Keychain item
+`metaculus-db-dsn` → local default `dbname=metaculus`. The DSN is a real secret
+(it grants database read access), so prefer the **Keychain** over `.env` / a
+shell export — those land in files and shell history that any editor or coding
+agent can read. Store it once (you'll be prompted to paste it, so it never
+appears in your shell history):
 
-Caveat: comments are length-truncated when posted, so a comment-harvested URL
-list can be incomplete versus a bot's full research. For bots you control, an
-instrumented trace gives a fuller list; comment harvesting is the universal
-baseline.
+```bash
+security add-generic-password -U -a "$USER" -s metaculus-db-dsn -w
+# paste the full postgresql://USER:PASS@HOST/dbname?sslmode=require string, return
+```
+
+For the strongest guard, open **Keychain Access.app → login → `metaculus-db-dsn`
+→ Access Control → "Confirm before allowing access"** and clear the always-allow
+list. Every read then raises a GUI confirm: a human running the harvest clicks
+*Allow* (not *Always Allow*), but an automated agent driving a shell can't. With
+that set, the harvester works with no DSN in any file — `source-archive
+harvest-db --days 1` just prompts you once per run.
+
+**From text or traces.** The lower-level `extract_urls(text)` /
+`extract_citation_records(...)` helpers in `ingest.url_extraction` pull URLs out
+of any markdown/text (markdown links, autolinks, and bare URLs). For bots you
+control, an instrumented trace (`ingest-traces`) gives the fullest URL list; a
+comment gives a shallower one, since it is length-truncated when posted.
 
 ## How it's organized
 
@@ -142,7 +280,8 @@ baseline.
 | `config.py` | Environment-driven `ArchiveConfig` |
 | `models.py` | `CaptureResult`, `StoredCapture`, `CitationRecord` |
 | `ingest/` | Build a manifest: URL extraction + Metaculus comment harvester |
-| `fetchers/` | Playwright (primary), Firecrawl (fallback), tiered orchestrator |
+| `fetchers/` | Playwright (primary) + CloakBrowser / Hyperbrowser / Firecrawl / PDF backups, tiered orchestrator |
+| `benchmark.py` | Backend bake-off: reliability + cost per backend over a manifest |
 | `quality.py` | Reject 404s, block pages, and thin content before archiving |
 | `storage/` | `BlobStore` interface with S3 and local backends |
 | `content_store.py` | `url + content-hash` store with the TTL cache and dedup |
@@ -150,12 +289,22 @@ baseline.
 | `pipeline.py` | `lookup → fetch → quality gate → store` |
 | `cli.py` | `source-archive` command |
 
+## Roadmap
+
+Planned and shipped improvements — smarter dedup (URL canonicalization +
+redirect/content aliasing), the coworker-legible catalog, and coverage reports —
+are written up in [ROADMAP.md](ROADMAP.md).
+
 ## What lands in storage
 
 ```
-<prefix>/index/<url_hash>.json                     per-URL capture history
+<prefix>/index/<url_hash>.json                     per-URL capture history (+ aliases)
+<prefix>/index/by-content/<content_hash>.json      reverse index for content dedup
 <prefix>/content/<url_hash>/<content_hash>.html
 <prefix>/content/<url_hash>/<content_hash>.webp     (screenshot)
 <prefix>/content/<url_hash>/<content_hash>.md
 <prefix>/manifests/<run_id>.jsonl                  the run's citation manifest
+<prefix>/reports/<run_id>.json                     per-URL capture outcomes (for coverage)
+<prefix>/catalog/index.html                        browsable catalog (by question/bot/site)
+<prefix>/catalog/by-question/<id>.{html,csv}
 ```
diff --git a/forecasting_tools/agents_and_tools/source_archive/__init__.py b/forecasting_tools/agents_and_tools/source_archive/__init__.py
index 795f4b66..5ede914d 100644
--- a/forecasting_tools/agents_and_tools/source_archive/__init__.py
+++ b/forecasting_tools/agents_and_tools/source_archive/__init__.py
@@ -29,10 +29,7 @@
 from forecasting_tools.agents_and_tools.source_archive.fetchers import (
     build_default_fetcher,
 )
-from forecasting_tools.agents_and_tools.source_archive.ingest import (
-    MetaculusCommentHarvester,
-    extract_urls,
-)
+from forecasting_tools.agents_and_tools.source_archive.ingest import extract_urls
 from forecasting_tools.agents_and_tools.source_archive.models import (
     CaptureResult,
     CitationRecord,
@@ -51,7 +48,6 @@
     "CapturePipeline",
     "CitationRecord",
     "ContentStore",
-    "MetaculusCommentHarvester",
     "PipelineSummary",
     "StoreResult",
     "StoredCapture",
diff --git a/forecasting_tools/agents_and_tools/source_archive/benchmark.py b/forecasting_tools/agents_and_tools/source_archive/benchmark.py
new file mode 100644
index 00000000..76d79083
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/benchmark.py
@@ -0,0 +1,459 @@
+"""Backend bake-off: run each capture backend independently over the same URLs.
+
+This is the harness for deciding *which* backup to put behind Playwright. Unlike
+the production :class:`TieredFetcher` (which stops at the first backend that
+passes the quality gate), the benchmark runs **every** selected backend over
+**every** URL, so you get an apples-to-apples table of reliability, latency, and
+estimated cost per backend — broken down by URL category (normal / cloudflare /
+pdf).
+
+Run it::
+
+    python -m forecasting_tools.agents_and_tools.source_archive.benchmark \\
+        --manifest sample_urls.jsonl \\
+        --backends playwright,cloakbrowser,firecrawl,firecrawl-stealth,hyperbrowser,pdf \\
+        --out bench.csv
+
+A backend whose dependency or API key is missing is skipped with a note rather
+than failing the whole run, so you can benchmark whatever you have configured.
+
+Cost figures are ESTIMATES from a documented pricing model (see ``Pricing``,
+sourced 2026-06); they are not billed amounts. Override the credit rates via
+CLI flags to match your plan.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import io
+import json
+import logging
+import statistics
+import sys
+import time
+from contextlib import nullcontext
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Callable
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import (
+    Fetcher,
+    FetchError,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.cloakbrowser_fetcher import (
+    CloakBrowserFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.firecrawl_fetcher import (
+    FirecrawlFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.hyperbrowser_fetcher import (
+    HyperbrowserFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.pdf_fetcher import (
+    PdfFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import (
+    PlaywrightFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+from forecasting_tools.agents_and_tools.source_archive.quality import evaluate
+
+logger = logging.getLogger(__name__)
+
+GB = 1_000_000_000
+
+# --- Pricing model -----------------------------------------------------------
+# $/unit as of 2026-06, from each vendor's public pricing + this repo's prior
+# cost experiment. These are the knobs to adjust for your plan.
+#
+#   - Self-hosted compute (Playwright / CloakBrowser): ~$0.00001/page rendered
+#     (measured in bot-sources probe). Marginal service fee is effectively $0.
+#   - Firecrawl: 1 credit basic, 5 credits stealth/"enhanced" proxy. Standard
+#     plan ≈ $0.00083/credit.
+#   - Hyperbrowser: 1 credit ($0.001) basic, 10 credits ($0.01) with proxy,
+#     plus $10/GB proxy bandwidth. 1 credit = $0.001.
+#   - PDF: PyMuPDF4LLM local = $0; Firecrawl OCR fallback = ~1 credit/PDF page.
+
+
+@dataclass
+class Pricing:
+    self_host_per_page: float = 0.00001
+    firecrawl_credit_usd: float = 0.00083
+    firecrawl_basic_credits: int = 1
+    firecrawl_stealth_credits: int = 5
+    hyperbrowser_credit_usd: float = 0.001
+    hyperbrowser_basic_credits: int = 1
+    hyperbrowser_proxy_credits: int = 10
+    hyperbrowser_bandwidth_usd_per_gb: float = 10.0
+
+
+def estimate_cost(
+    backend: str, result: CaptureResult, response_bytes: int, pricing: Pricing
+) -> float:
+    """Estimated $ for one successful capture by ``backend``.
+
+    Hyperbrowser's per-GB bandwidth fee is a proxy charge (see the pricing
+    notes above), so it is added only when ``metadata["used_proxy"]`` is set.
+    """
+    meta = result.metadata or {}
+    if backend in ("playwright", "cloakbrowser"):
+        return pricing.self_host_per_page
+    if backend.startswith("firecrawl"):
+        proxy = str(meta.get("firecrawl_proxy", "basic")).lower()
+        if proxy in ("", "basic"):
+            return pricing.firecrawl_basic_credits * pricing.firecrawl_credit_usd
+        return pricing.firecrawl_stealth_credits * pricing.firecrawl_credit_usd
+    if backend == "hyperbrowser":
+        if not meta.get("used_proxy"):
+            return pricing.hyperbrowser_basic_credits * pricing.hyperbrowser_credit_usd
+        bandwidth = (response_bytes / GB) * pricing.hyperbrowser_bandwidth_usd_per_gb
+        fee = pricing.hyperbrowser_proxy_credits * pricing.hyperbrowser_credit_usd
+        return fee + bandwidth
+    if backend == "pdf":
+        if meta.get("pdf_engine") == "firecrawl":
+            # Firecrawl OCR fallback: roughly one credit per PDF page.
+            pages = int(meta.get("pdf_pages") or 1)
+            return pages * pricing.firecrawl_credit_usd
+        return 0.0  # local PyMuPDF4LLM
+    return 0.0
+
+
+# --- Backend registry --------------------------------------------------------
+# Factories so a missing dependency / API key only skips that backend. The
+# ``context`` flag marks browser backends that must be entered as a context
+# manager (the browser launches once and is reused across URLs).
+
+
+@dataclass
+class BackendSpec:
+    name: str
+    factory: Callable[[ArchiveConfig], Fetcher]
+    context: bool = False
+    # Optional pre-flight: return a reason string if the backend can't run
+    # (missing key/dep) so the bake-off reports a clean SKIP instead of N/N
+    # fetch_errors. ``None`` means "looks runnable".
+    precheck: Callable[[ArchiveConfig], str | None] | None = None
+
+
+def _need_firecrawl_key(config: ArchiveConfig) -> str | None:
+    if not config.firecrawl_api_key:
+        return "FIRECRAWL_API_KEY not set"
+    return None
+
+
+def _need_hyperbrowser_key(config: ArchiveConfig) -> str | None:
+    if not config.hyperbrowser_api_key:
+        return "HYPERBROWSER_API_KEY not set"
+    return None
+
+
+def _firecrawl_stealth(config: ArchiveConfig) -> FirecrawlFetcher:
+    # Force the proxy/stealth path so this row measures the Cloudflare-grade
+    # (5-credit) cost, even if the operator left the default at "basic".
+    proxy = config.firecrawl_proxy
+    if proxy in ("", "basic"):
+        proxy = "auto"
+    f = FirecrawlFetcher(config.model_copy(update={"firecrawl_proxy": proxy}))
+    f.name = "firecrawl-stealth"
+    return f
+
+
+BACKENDS: dict[str, BackendSpec] = {
+    "playwright": BackendSpec("playwright", PlaywrightFetcher, context=True),
+    "cloakbrowser": BackendSpec("cloakbrowser", CloakBrowserFetcher, context=True),
+    "firecrawl": BackendSpec(
+        "firecrawl", FirecrawlFetcher, precheck=_need_firecrawl_key
+    ),
+    "firecrawl-stealth": BackendSpec(
+        "firecrawl-stealth", _firecrawl_stealth, precheck=_need_firecrawl_key
+    ),
+    "hyperbrowser": BackendSpec(
+        "hyperbrowser", HyperbrowserFetcher, precheck=_need_hyperbrowser_key
+    ),
+    "pdf": BackendSpec("pdf", PdfFetcher),
+}
+
+
+# --- Sample manifest ---------------------------------------------------------
+# A curated starter set spanning the three categories the backup must handle.
+# Replace/extend with your own real cited URLs for a representative run.
+SAMPLE_MANIFEST: list[dict] = [
+    {"url": "https://example.com", "category": "normal"},
+    {"url": "https://en.wikipedia.org/wiki/Forecasting", "category": "normal"},
+    {"url": "https://www.federalregister.gov/", "category": "normal"},
+    # Sites commonly fronted by Cloudflare / anti-bot:
+    {"url": "https://www.g2.com/", "category": "cloudflare"},
+    {"url": "https://www.indeed.com/", "category": "cloudflare"},
+    {"url": "https://www.zillow.com/", "category": "cloudflare"},
+    # PDFs (the gap Playwright can't render):
+    {"url": "https://arxiv.org/pdf/1706.03762", "category": "pdf"},
+    {"url": "https://bitcoin.org/bitcoin.pdf", "category": "pdf"},
+]
+
+
+@dataclass
+class Row:
+    backend: str
+    url: str
+    category: str
+    passed: bool
+    reason: str
+    seconds: float
+    html_bytes: int
+    md_bytes: int
+    screenshot_bytes: int
+    cost_usd: float
+    error: str = ""
+
+
+@dataclass
+class BackendRun:
+    name: str
+    rows: list[Row] = field(default_factory=list)
+    skipped: str = ""
+
+
+def _sizes(result: CaptureResult) -> tuple[int, int, int]:
+    html = len(result.html.encode()) if result.html else 0
+    md = len(result.markdown.encode()) if result.markdown else 0
+    shot = len(result.screenshot) if result.screenshot else 0
+    return html, md, shot
+
+
+def run_backend(
+    spec: BackendSpec,
+    manifest: list[dict],
+    config: ArchiveConfig,
+    pricing: Pricing,
+) -> BackendRun:
+    run = BackendRun(name=spec.name)
+    if spec.precheck is not None:
+        reason = spec.precheck(config)
+        if reason:
+            run.skipped = reason
+            logger.warning("%s skipped: %s", spec.name, reason)
+            return run
+    try:
+        fetcher = spec.factory(config)
+    except Exception as e:  # construction (e.g. missing key) — skip cleanly
+        run.skipped = f"could not construct {spec.name}: {e}"
+        logger.warning(run.skipped)
+        return run
+
+    cm = fetcher if spec.context else nullcontext(fetcher)
+    try:
+        with cm as live:
+            for record in manifest:
+                run.rows.append(_capture_one(spec.name, live, record, pricing))
+    except FetchError as e:
+        # A browser backend can fail to even start (e.g. cloakbrowser not
+        # installed). Record it as a skip rather than crashing the bake-off.
+        if not run.rows:
+            run.skipped = f"{spec.name} unavailable: {e}"
+            logger.warning(run.skipped)
+        else:
+            raise
+    return run
+
+
+def _capture_one(backend: str, fetcher: Fetcher, record: dict, pricing: Pricing) -> Row:
+    url = record["url"]
+    category = record.get("category", "normal")
+    start = time.monotonic()
+    try:
+        result = fetcher.fetch(url)
+    except FetchError as e:
+        return Row(
+            backend,
+            url,
+            category,
+            False,
+            "fetch_error",
+            round(time.monotonic() - start, 2),
+            0,
+            0,
+            0,
+            0.0,
+            error=str(e)[:300],
+        )
+    except Exception as e:  # backend bug / unexpected SDK error
+        return Row(
+            backend,
+            url,
+            category,
+            False,
+            "exception",
+            round(time.monotonic() - start, 2),
+            0,
+            0,
+            0,
+            0.0,
+            error=str(e)[:300],
+        )
+
+    seconds = round(time.monotonic() - start, 2)
+    verdict = evaluate(result)
+    html_b, md_b, shot_b = _sizes(result)
+    response_bytes = html_b + shot_b
+    cost = (
+        estimate_cost(backend, result, response_bytes, pricing)
+        if verdict.passed
+        else 0.0
+    )
+    return Row(
+        backend,
+        url,
+        category,
+        verdict.passed,
+        verdict.reason or "ok",
+        seconds,
+        html_b,
+        md_b,
+        shot_b,
+        round(cost, 6),
+    )
+
+
+# --- Reporting ---------------------------------------------------------------
+def write_csv(path: str, runs: list[BackendRun]) -> None:
+    """Write the per-URL detail of every backend run to ``path`` as CSV.
+
+    One header row, then one row per capture attempt, in ``runs`` order. A
+    backend with no rows (for example a skipped one) adds nothing, so the file
+    holds only the header when nothing ran.
+
+    Args:
+        path: Destination file; created or overwritten.
+        runs: Backend runs whose ``rows`` are flattened into the CSV.
+    """
+    # Column names match the ``Row`` attribute names, so one tuple drives both
+    # the header and the per-row values.
+    columns = (
+        "backend",
+        "url",
+        "category",
+        "passed",
+        "reason",
+        "seconds",
+        "html_bytes",
+        "md_bytes",
+        "screenshot_bytes",
+        "cost_usd",
+        "error",
+    )
+    buf = io.StringIO()
+    w = csv.writer(buf)
+    w.writerow(columns)
+    for run in runs:
+        for r in run.rows:
+            w.writerow([getattr(r, name) for name in columns])
+    # ``csv`` already terminates each row with CRLF. Write the bytes as-is: a
+    # text-mode write translates the LF again on Windows, which shows up as a
+    # blank line after every row.
+    Path(path).write_bytes(buf.getvalue().encode("utf-8"))
+
+
+def summarize(runs: list[BackendRun], urls_per_question: int, tail_share: float) -> str:
+    cats = ["normal", "cloudflare", "pdf"]
+    lines = []
+    header = (
+        f"{'backend':<18}{'overall':>9}"
+        + "".join(f"{c:>11}" for c in cats)
+        + f"{'med s':>8}{'$/page':>10}{'proj $/q':>10}"
+    )
+    lines.append(header)
+    lines.append("-" * len(header))
+    for run in runs:
+        if run.skipped:
+            lines.append(f"{run.name:<18}  SKIPPED: {run.skipped[:80]}")
+            continue
+        total = len(run.rows)
+        passed = [r for r in run.rows if r.passed]
+        overall = f"{len(passed)}/{total}"
+
+        def cat_rate(cat: str) -> str:
+            rows = [r for r in run.rows if r.category == cat]
+            if not rows:
+                return "-"
+            ok = sum(1 for r in rows if r.passed)
+            return f"{ok}/{len(rows)}"
+
+        med = statistics.median([r.seconds for r in run.rows]) if run.rows else 0
+        cost_per = statistics.mean([r.cost_usd for r in passed]) if passed else 0.0
+        # Illustrative: if THIS backend alone handled the whole post-Playwright
+        # tail of a question. (tail_share × urls × $/successful page.)
+        proj = tail_share * urls_per_question * cost_per
+        lines.append(
+            f"{run.name:<18}{overall:>9}"
+            + "".join(f"{cat_rate(c):>11}" for c in cats)
+            + f"{med:>8.1f}{cost_per:>10.5f}{proj:>10.3f}"
+        )
+    note = (
+        f"\nproj $/q assumes one backend covers a {tail_share:.0%} tail of "
+        f"{urls_per_question} URLs/question, BEFORE the TTL cache (which makes "
+        f"re-runs nearly free). Costs are model estimates, not billed amounts."
+    )
+    return "\n".join(lines) + "\n" + note
+
+
+def load_manifest(path: str | None) -> list[dict]:
+    if not path:
+        return SAMPLE_MANIFEST
+    records = []
+    for line in Path(path).read_text(encoding="utf-8").splitlines():
+        line = line.strip()
+        if line:
+            records.append(json.loads(line))
+    return records
+
+
+def main(argv: list[str] | None = None) -> int:
+    logging.basicConfig(level=logging.WARNING, format="%(levelname)s %(message)s")
+    p = argparse.ArgumentParser(description="Capture-backend bake-off.")
+    p.add_argument(
+        "--manifest", help="JSONL of {url, category}. Omit for the built-in sample."
+    )
+    p.add_argument(
+        "--backends",
+        default="playwright,cloakbrowser,firecrawl,firecrawl-stealth,hyperbrowser,pdf",
+        help="Comma-separated subset of: " + ", ".join(BACKENDS),
+    )
+    p.add_argument("--out", default="benchmark.csv", help="CSV output path.")
+    p.add_argument("--urls-per-question", type=int, default=450)
+    p.add_argument(
+        "--tail-share",
+        type=float,
+        default=0.30,
+        help="Fraction of URLs that fall through Playwright.",
+    )
+    p.add_argument("--firecrawl-credit-usd", type=float, default=0.00083)
+    p.add_argument("--hyperbrowser-credit-usd", type=float, default=0.001)
+    args = p.parse_args(argv)
+
+    config = ArchiveConfig.from_env()
+    pricing = Pricing(
+        firecrawl_credit_usd=args.firecrawl_credit_usd,
+        hyperbrowser_credit_usd=args.hyperbrowser_credit_usd,
+    )
+    manifest = load_manifest(args.manifest)
+
+    selected = [b.strip() for b in args.backends.split(",") if b.strip()]
+    unknown = [b for b in selected if b not in BACKENDS]
+    if unknown:
+        p.error(f"unknown backends: {unknown}. Choose from {list(BACKENDS)}")
+
+    runs: list[BackendRun] = []
+    for name in selected:
+        print(f"running {name} over {len(manifest)} URLs...", file=sys.stderr)
+        runs.append(run_backend(BACKENDS[name], manifest, config, pricing))
+
+    write_csv(args.out, runs)
+    print("\n" + summarize(runs, args.urls_per_question, args.tail_share))
+    print(f"\nper-URL detail written to {args.out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/forecasting_tools/agents_and_tools/source_archive/canonicalize.py b/forecasting_tools/agents_and_tools/source_archive/canonicalize.py
new file mode 100644
index 00000000..b791a47e
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/canonicalize.py
@@ -0,0 +1,115 @@
+"""Canonicalize URLs so trivially-different links collapse to one dedup key.
+
+Every capture of a page is grouped under ``url_hash`` (see :mod:`models`).
+Historically that hashed the *raw* URL string, so ``…/x``, ``…/x/``,
+``…/x?utm_source=…`` and ``…/x#frag`` were four different "sources" — inflating
+both storage and any "how many sources have we covered" count.
+
+This module normalizes away differences that do **not** change *which page* you
+get, so the dedup key is stable across those variants:
+
+  - lowercase scheme + host, strip a default port (``:80`` / ``:443``)
+  - drop the fragment (``#…``)
+  - drop known analytics / click-tracking query params, then sort the rest
+  - normalize a trailing slash (``…/x/`` -> ``…/x``; root collapses to no path)
+
+It is deliberately conservative. It does **not** upgrade ``http`` -> ``https`` or
+strip ``www.``: those can resolve to genuinely different pages on some hosts, so
+collapsing them belongs to a later, opt-in phase (see ``ROADMAP.md``).
+"""
+
+from __future__ import annotations
+
+from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
+
+# Query params that are analytics/click tracking and never select the page.
+# Matched case-insensitively; any key starting with a prefix below is also
+# dropped. Bare ``ref`` / ``source`` are intentionally left alone — they are too
+# often load-bearing (API refs, content selectors) to drop blindly.
+_TRACKING_PARAMS = frozenset(
+    {
+        "gclid",
+        "gclsrc",
+        "dclid",
+        "gbraid",
+        "wbraid",
+        "fbclid",
+        "msclkid",
+        "yclid",
+        "twclid",
+        "mc_eid",
+        "mc_cid",
+        "_hsenc",
+        "_hsmi",
+        "igshid",
+        "igsh",
+        "vero_id",
+        "vero_conv",
+        "oly_anon_id",
+        "oly_enc_id",
+        "spm",
+        "scm",
+        "ref_src",
+        "ref_url",
+    }
+)
+_TRACKING_PREFIXES = ("utm_",)  # any query key starting with one of these is dropped
+
+_DEFAULT_PORTS = {"http": "80", "https": "443"}  # scheme -> port omitted from the key
+
+
+def _is_tracking(key: str) -> bool:
+    name = key.lower()  # tracking keys are matched case-insensitively
+    return name.startswith(_TRACKING_PREFIXES) or name in _TRACKING_PARAMS
+
+
+def canonicalize_url(url: str) -> str:
+    """Return a normalized form of ``url`` to use as a dedup key.
+
+    Idempotent — ``canonicalize_url(canonicalize_url(u)) == canonicalize_url(u)``.
+    Non-http(s) or unparsable input — including a URL whose port is not an
+    integer in 0-65535 — is returned stripped but otherwise as-is (e.g.
+    ``mailto:``, relative paths), so callers can pass anything safely.
+    """
+    if not url:
+        return url
+    raw = url.strip()
+    try:
+        parts = urlsplit(raw)
+        # ``.port`` is parsed lazily and raises ValueError for a non-numeric or
+        # out-of-range port, so read it here to keep the never-raises contract.
+        port = parts.port
+    except ValueError:
+        return raw
+    if parts.scheme not in ("http", "https") or not parts.netloc:
+        return raw
+
+    scheme = parts.scheme.lower()
+
+    # netloc: lowercase host (bracket IPv6), keep userinfo, strip default port.
+    host = (parts.hostname or "").lower()
+    if ":" in host:  # IPv6 literal
+        host = f"[{host}]"
+    netloc = host
+    if parts.username is not None:
+        auth = parts.username
+        if parts.password is not None:
+            auth += f":{parts.password}"
+        netloc = f"{auth}@{netloc}"
+    if port is not None and str(port) != _DEFAULT_PORTS.get(scheme):
+        netloc += f":{port}"
+
+    # path: drop trailing slashes; a bare root ("" or "/") collapses to empty.
+    path = parts.path.rstrip("/")
+
+    # query: drop tracking params, then sort so order doesn't matter.
+    kept = [
+        (k, v)
+        for k, v in parse_qsl(parts.query, keep_blank_values=True)
+        if not _is_tracking(k)
+    ]
+    kept.sort()
+    query = urlencode(kept)
+
+    # fragment: always dropped.
+    return urlunsplit((scheme, netloc, path, query, ""))
diff --git a/forecasting_tools/agents_and_tools/source_archive/catalog.py b/forecasting_tools/agents_and_tools/source_archive/catalog.py
new file mode 100644
index 00000000..a9ec97f4
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/catalog.py
@@ -0,0 +1,562 @@
+"""Generate a coworker-legible catalog over the hash-addressed store.
+
+The content store is keyed by URL/content hash — great for dedup, opaque to a
+human browsing the bucket. This builds a browsable ``catalog/`` layer on top by
+joining the citation manifests (who cited what, on which question, with which
+tool) with the per-URL index (what actually got captured). Blobs are never moved
+or duplicated; the catalog only writes small HTML/CSV pointer pages.
+
+Views (question-primary, with two cross-views):
+
+    catalog/READ_ME_FIRST.html        plain-language explainer for coworkers
+    catalog/index.html                landing page + headline counts
+    catalog/by-question/<id>.html     ★ the encyclopedia for one question:
+    catalog/by-question/<id>.csv        every source, deduped, tagged with the
+                                        bots/tools/queries that used it
+    catalog/by-bot/<bot>.html         one bot's sources across questions
+    catalog/by-domain/<domain>.html   sources grouped by site
+
+The question view is the default because that's how post-mortems and
+non-technical coworkers think ("what did we know about question X?"); ``by-bot``
+covers profiling/"what is the top bot-maker doing", always next to how other
+bots handled the same question.
+"""
+
+from __future__ import annotations
+
+import csv
+import html
+import io
+from collections import defaultdict
+from urllib.parse import urlsplit
+
+from pydantic import BaseModel
+
+from forecasting_tools.agents_and_tools.source_archive import manifest as manifest_io
+from forecasting_tools.agents_and_tools.source_archive.canonicalize import (
+    canonicalize_url,
+)
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
+from forecasting_tools.agents_and_tools.source_archive.models import (
+    CitationRecord,
+    url_hash,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
+    BlobStore,
+)
+
+_UNKNOWN_Q = "unknown-question"
+
+# Tools that fetch data/API endpoints, not human-readable web pages. A URL only
+# ever touched by one of these is a data call (e.g. a bot's run_code pulling a
+# CSV), so it is kept out of the page-oriented catalog (it stays in the raw
+# manifests). A URL also seen via search/page-fetch is treated as a real page.
+_NON_PAGE_TOOLS = {
+    "run_code",
+    "code",
+    "python",
+    "run_python",
+    "code_interpreter",
+    "execute_code",
+    "bash",
+    "shell",
+}
+
+
+def tool_call_only(citations: list) -> bool:
+    """Whether every named tool that touched a URL is a code-execution tool,
+    i.e. the URL is a data/API call rather than a page a bot read. Citations
+    without a tool name are ignored; no named tool at all means ``False``."""
+    names = {(c.tool_name or "").lower() for c in citations} - {""}
+    # Non-empty and entirely inside the code-tool set.
+    return bool(names) and names <= _NON_PAGE_TOOLS
+
+
+def _is_tool_call_only(source: "Source") -> bool:  # Source-level form of tool_call_only
+    return tool_call_only(source.citations)
+
+
+# Search-engine result pages are navigation, not sources — a bot citing a
+# google/duckduckgo search URL hasn't handed us a page worth archiving.
+_SEARCH_HOSTS = {
+    "duckduckgo.com",
+    "bing.com",
+    "search.brave.com",
+    "search.yahoo.com",
+    "ecosia.org",
+    "startpage.com",
+    "baidu.com",
+    "ask.com",
+    "qwant.com",
+    "search.marginalia.nu",
+    "kagi.com",
+}
+# Percent-encoded junk that means the extractor swallowed markdown / a second URL
+# / control chars into the URL (legacy captures from before extraction hardening).
+_MALFORMED_MARKERS = ("%5b", "%5d", "%5c", "%0a", "%0d", "%28http", "%29%5b")
+
+
+def is_search_url(url: str) -> bool:
+    netloc = urlsplit(url).netloc.lower()
+    site = netloc[4:] if netloc.startswith("www.") else netloc  # ignore a leading www.
+    return site.startswith("google.") or site in _SEARCH_HOSTS
+
+
+def is_malformed_url(url: str) -> bool:
+    lowered = url.lower()  # markers are lowercase percent-escapes, so fold case first
+    return any(m in lowered for m in _MALFORMED_MARKERS) or url.count("://") > 1
+
+
+def exclusion_reason(url: str, citations: list) -> str | None:
+    """Classify why a cited URL stays out of the page catalog / coverage.
+
+    Returns the first matching reason, checked in this order: ``malformed``
+    (extractor junk), ``search`` (search-engine results), ``tool_call`` (touched
+    only by code tools) — or ``None`` when the URL should be kept."""
+    if is_malformed_url(url):
+        return "malformed"
+    if is_search_url(url):
+        return "search"
+    return "tool_call" if tool_call_only(citations) else None
+
+
+class Citation(BaseModel):  # provenance of one manifest record that cited a URL
+    bot: str | None = None
+    question_id: str | None = None  # build_sources falls back to the record's metaculus_id
+    question_url: str | None = None
+    run_id: str | None = None
+    tool_name: str | None = None  # compared case-insensitively by tool_call_only
+    origin: str | None = None
+    query: str | None = None
+    cited_url: str = ""  # the original URL as cited (pre-canonicalization)
+
+
+class Source(BaseModel):  # one canonical URL plus every citation that points at it
+    canonical_url: str
+    domain: str
+    captured: bool = False  # build_sources sets this when the index has a capture
+    content_hash: str | None = None
+    html_key: str | None = None  # store-relative (no prefix)
+    screenshot_key: str | None = None
+    markdown_key: str | None = None
+    citations: list[Citation] = []
+
+    @property
+    def bots(self) -> list[str]:
+        return sorted({c.bot for c in self.citations if c.bot})  # unique, sorted
+
+    @property
+    def question_ids(self) -> list[str]:
+        return sorted({c.question_id for c in self.citations if c.question_id})
+
+
+class CatalogData(BaseModel):
+    sources: list[Source] = []
+    excluded: dict[str, int] = {}  # exclusion reason -> count of URLs dropped
+
+    @property
+    def hidden_total(self) -> int:
+        return sum(self.excluded.values())  # all excluded URLs, across reasons
+
+    def by_question(self) -> dict[str, list[Source]]:
+        grouped: dict[str, list[Source]] = defaultdict(list)
+        for src in self.sources:
+            # A source with no question id is filed under the placeholder key.
+            for qid in src.question_ids or [_UNKNOWN_Q]:
+                grouped[qid].append(src)
+        return grouped
+
+    def by_bot(self) -> dict[str, list[Source]]:
+        grouped: dict[str, list[Source]] = defaultdict(list)
+        for src in self.sources:
+            for name in src.bots or ["(no bot)"]:
+                grouped[name].append(src)
+        return grouped
+
+    def by_domain(self) -> dict[str, list[Source]]:
+        grouped: dict[str, list[Source]] = defaultdict(list)
+        for src in self.sources:
+            grouped[src.domain].append(src)
+        return grouped
+
+    def question_url(self, qid: str) -> str | None:
+        # First non-empty question_url among the citations of ``qid``, scanning
+        # sources (and each source's citations) in stored order.
+        cites = (c for src in self.sources for c in src.citations)
+        urls = (c.question_url for c in cites if c.question_id == qid)
+        return next((u for u in urls if u), None)
+
+
+# --------------------------------------------------------------------------- #
+# Build (join manifests + index)
+# --------------------------------------------------------------------------- #
+def _domain(url: str) -> str:
+    netloc = urlsplit(url).netloc.lower()
+    return netloc.removeprefix("www.")  # site name without a leading "www."
+
+
+def _strip_prefix(key: str | None, prefix: str) -> str | None:
+    if not key:
+        return None
+    lead = f"{prefix.rstrip('/')}/"  # prefix normalized to exactly one trailing slash
+    return key.removeprefix(lead)
+
+
+def _latest_capture(store: ContentStore, canonical_url: str) -> dict | None:
+    """Look up the newest capture recorded for ``canonical_url``.
+
+    TTL is not consulted here; one ``alias_of`` hop is followed when the index
+    entry has it. Returns ``None`` when no index (or aliased index) exists."""
+    entry = store._read_index(url_hash(canonical_url))
+    alias = entry.get("alias_of") if entry else None
+    if alias:
+        entry = store._read_index(alias)
+    if not entry:
+        return None
+    return (entry.get("captures") or {}).get(entry.get("latest_content_hash"))
+
+
+def _load_all_records(store: BlobStore, prefix: str) -> list[CitationRecord]:
+    loaded: list[CitationRecord] = []
+    manifest_dir = f"{prefix.rstrip('/')}/manifests/"
+    for key in (k for k in store.list_keys(manifest_dir) if k.endswith(".jsonl")):
+        try:
+            text = store.get(key).decode("utf-8")
+            loaded.extend(manifest_io.loads(text))
+        except (UnicodeDecodeError, ValueError):
+            continue  # best-effort: skip a manifest that cannot be decoded or parsed
+    return loaded
+
+
+def build_sources(store: BlobStore, config: ArchiveConfig) -> list[Source]:
+    """Build one ``Source`` per canonical URL from all manifests plus the index.
+
+    Nothing is filtered here — tool/API-call URLs are included — so that other
+    consumers (e.g. the coverage report) can classify them; the catalog filters
+    them out itself. Sources are returned sorted by canonical URL."""
+    prefix = config.s3_prefix.rstrip("/")
+    content_store = ContentStore(store, config)
+    by_url: dict[str, list[CitationRecord]] = defaultdict(list)
+    for record in _load_all_records(store, prefix):
+        if record.url:  # a record without a URL has nothing to group under
+            by_url[canonicalize_url(record.url)].append(record)
+    result: list[Source] = []
+    for canonical in sorted(by_url):
+        capture = _latest_capture(content_store, canonical)
+        stored = capture or {}  # empty when the URL was never captured
+        cited_by = [
+            Citation(
+                bot=r.bot,
+                question_id=r.question_id or r.metaculus_id,
+                question_url=r.question_url,
+                run_id=r.run_id,
+                tool_name=r.tool_name,
+                origin=r.origin,
+                query=r.query,
+                cited_url=r.url,
+            )
+            for r in by_url[canonical]
+        ]
+        result.append(
+            Source(
+                canonical_url=canonical,
+                domain=_domain(canonical) or "(unknown)",
+                captured=capture is not None,
+                content_hash=stored.get("content_hash"),
+                html_key=_strip_prefix(stored.get("html_key"), prefix),
+                screenshot_key=_strip_prefix(stored.get("screenshot_key"), prefix),
+                markdown_key=_strip_prefix(stored.get("markdown_key"), prefix),
+                citations=cited_by,
+            )
+        )
+    return result
+
+
+def build_catalog(store: BlobStore, config: ArchiveConfig) -> CatalogData:
+    """Split all sources into catalog pages and a per-reason exclusion tally."""
+    kept: list[Source] = []
+    dropped: dict[str, int] = {}
+    for src in build_sources(store, config):
+        why = exclusion_reason(src.canonical_url, src.citations)
+        if not why:
+            kept.append(src)
+            continue
+        dropped[why] = dropped.get(why, 0) + 1
+    return CatalogData(sources=kept, excluded=dropped)
+
+
+# --------------------------------------------------------------------------- #
+# Render
+# --------------------------------------------------------------------------- #
+_CSS = """
+body{font:14px/1.5 -apple-system,Segoe UI,Roboto,sans-serif;margin:0;color:#1a1a1a;background:#fafafa}
+header{background:#1f2937;color:#fff;padding:16px 24px}
+header a{color:#cbd5e1}
+h1{font-size:20px;margin:0 0 4px}
+.wrap{padding:24px;max-width:1100px;margin:0 auto}
+.muted{color:#6b7280}
+.badge{display:inline-block;font-size:11px;padding:1px 7px;border-radius:10px}
+.ok{background:#dcfce7;color:#166534}.no{background:#fee2e2;color:#991b1b}
+.card{background:#fff;border:1px solid #e5e7eb;border-radius:8px;padding:12px;margin:12px 0;display:flex;gap:12px}
+.card img{width:160px;height:110px;object-fit:cover;object-position:top;border:1px solid #e5e7eb;border-radius:4px;background:#f3f4f6}
+.card .meta{flex:1;min-width:0}
+.card .u{font-weight:600;word-break:break-all}
+.tags{margin-top:6px}
+.tag{display:inline-block;background:#eef2ff;color:#3730a3;font-size:11px;padding:1px 7px;border-radius:10px;margin:2px 4px 2px 0}
+.links a{margin-right:10px;font-size:12px}
+table{border-collapse:collapse;width:100%;background:#fff}
+td,th{border:1px solid #e5e7eb;padding:6px 8px;text-align:left;font-size:13px}
+th{background:#f3f4f6}
+a.grid{display:inline-block;margin:4px 12px 4px 0}
+"""
+
+
+def _esc(s) -> str:
+    return "" if s is None else html.escape(str(s))  # None renders as empty text
+
+
+def _page(title: str, body: str, rel_root: str) -> str:
+    home = f"<a href='{rel_root}catalog/index.html'>← catalog home</a>"  # header link
+    return (
+        "<!doctype html><html><head><meta charset='utf-8'>"
+        f"<title>{_esc(title)}</title><style>{_CSS}</style></head><body>"
+        f"<header><h1>Source Archive</h1>{home}</header>"
+        f"<div class='wrap'>{body}</div></body></html>"
+    )
+
+
+class Linker:
+    """Turns a store-relative blob key into a link a coworker can open."""
+
+    def __init__(self, store: BlobStore, config: ArchiveConfig):
+        from forecasting_tools.agents_and_tools.source_archive.storage import (
+            S3BlobStore,
+        )
+
+        self.is_s3 = isinstance(store, S3BlobStore)
+        self.bucket = config.s3_bucket
+        self.region = config.aws_region
+        self.prefix = config.s3_prefix.rstrip("/")
+
+    def url(self, rel_key: str | None, rel_root: str) -> str | None:
+        """Link for ``rel_key``, or ``None`` when there is no key.
+
+        S3 stores get ``https://<bucket>.s3[.<region>].amazonaws.com/<prefix>/<key>``;
+        any other store gets ``rel_root`` + key (relative within the prefix dir)."""
+        if not rel_key:
+            return None
+        if not self.is_s3:
+            return f"{rel_root}{rel_key}"
+        region = f".{self.region}" if self.region else ""
+        return f"https://{self.bucket}.s3{region}.amazonaws.com/{self.prefix}/{rel_key}"
+
+
+def _source_card(s: Source, linker: Linker, rel_root: str) -> str:
+    """Render one source as an HTML card.
+
+    Shows a screenshot thumbnail (or an empty placeholder), the canonical URL,
+    domain, capture badge, bot/tool tags, and links to each stored artifact
+    that exists plus the live page."""
+    shot = linker.url(s.screenshot_key, rel_root)
+    artifacts = [
+        ("HTML", linker.url(s.html_key, rel_root)),
+        ("markdown", linker.url(s.markdown_key, rel_root)),
+        ("screenshot", shot),
+    ]
+    links = [f"<a href='{_esc(href)}'>{label}</a>" for label, href in artifacts if href]
+    links.append(f"<a href='{_esc(s.canonical_url)}'>live ↗</a>")
+    if s.captured:
+        badge = "<span class='badge ok'>captured</span>"
+    else:
+        badge = "<span class='badge no'>not captured</span>"
+    if shot:
+        img = f"<a href='{_esc(shot)}'><img src='{_esc(shot)}' alt='screenshot'></a>"
+    else:
+        img = "<div class='card-img'></div>"
+
+    tools = sorted({c.tool_name for c in s.citations if c.tool_name})
+    tags = "".join(f"<span class='tag'>{_esc(t)}</span>" for t in [*s.bots, *tools])
+    return (
+        f"<div class='card'>{img}<div class='meta'>"
+        f"<div class='u'>{_esc(s.canonical_url)}</div>"
+        f"<div class='muted'>{_esc(s.domain)} · {badge}</div>"
+        f"<div class='tags'>{tags}</div>"
+        f"<div class='links'>{' '.join(links)}</div>"
+        f"</div></div>"
+    )
+
+
+def _question_csv(sources: list[Source]) -> str:
+    """Serialize ``sources`` to CSV text: a header row, then one row per source."""
+    header = ["url", "domain", "captured", "bots", "tools", "screenshot_key"]
+    def row(src: Source) -> list[str]:
+        tool_names = {c.tool_name for c in src.citations if c.tool_name}
+        return [
+            src.canonical_url,
+            src.domain,
+            "yes" if src.captured else "no",
+            "; ".join(src.bots),
+            "; ".join(sorted(tool_names)),
+            src.screenshot_key or "",
+        ]
+
+    out = io.StringIO()
+    csv.writer(out).writerows([header, *map(row, sources)])
+    return out.getvalue()
+
+
+# --------------------------------------------------------------------------- #
+# Write
+# --------------------------------------------------------------------------- #
+class CatalogSummary(BaseModel):
+    sources: int = 0
+    captured: int = 0
+    questions: int = 0
+    bots: int = 0
+    domains: int = 0
+    excluded: dict[str, int] = {}
+
+    def __str__(self) -> str:
+        """One-line summary: page-source totals, then the excluded-URL count with
+        a per-reason breakdown (reasons sorted by name) when any were excluded."""
+        reasons = [f"{k}={v}" for k, v in sorted(self.excluded.items())]
+        breakdown = f" ({', '.join(reasons)})" if reasons else ""
+        n_excluded = sum(self.excluded.values())
+        pages = f"{self.sources} page sources ({self.captured} captured)"
+        spread = f"{self.questions} questions, {self.bots} bots, {self.domains} domains"
+        return (
+            f"Catalog: {pages} across {spread} "
+            f"— {n_excluded} non-page URLs excluded{breakdown}"
+        )
+
+
+def _slug(value: str) -> str:
+    # Keep dots so domains stay readable (a.test.html); other chars become "-".
+    keep = "".join(c if c.isalnum() or c in "-_." else "-" for c in value)
+    out = ".".join(p for p in keep.split(".") if p).strip("-.")[:80].rstrip("-.")
+    return out or "x"  # dot runs collapse to one; never empty, no edge separators
+
+
+def write_catalog(
+    store: BlobStore,
+    config: ArchiveConfig,
+    out_store: BlobStore | None = None,
+) -> CatalogSummary:
+    """Build the catalog from ``store`` and write it to ``out_store`` (default:
+    ``store``). Pass a separate ``out_store`` to preview a live bucket's catalog
+    into a local directory without mutating the bucket."""
+    prefix = config.s3_prefix.rstrip("/")
+    data = build_catalog(store, config)
+    out = out_store or store  # write target; falls back to the source store
+    linker = Linker(out, config)  # NOTE(review): link style follows out, not store
+
+    def put(rel: str, body: str, ctype: str) -> None:
+        out.put(f"{prefix}/catalog/{rel}", body.encode("utf-8"), content_type=ctype)
+
+    by_q = data.by_question()
+    by_b = data.by_bot()
+    by_d = data.by_domain()
+
+    # Per-question pages (the encyclopedia) + CSVs. rel_root: catalog/<view>/ -> ../../
+    rr2 = "../../"
+    for qid, sources in sorted(by_q.items()):
+        sources = sorted(sources, key=lambda s: s.canonical_url)
+        qurl = data.question_url(qid)
+        head = f"<h1>Question {_esc(qid)}</h1>"
+        if qurl:
+            head += f"<p><a href='{_esc(qurl)}'>{_esc(qurl)} ↗</a></p>"
+        head += (
+            f"<p class='muted'>{len(sources)} source(s); "
+            f"{sum(s.captured for s in sources)} captured · "
+            f"<a href='{_slug(qid)}.csv'>download CSV</a></p>"
+        )
+        cards = "".join(_source_card(s, linker, rr2) for s in sources)
+        put(
+            f"by-question/{_slug(qid)}.html",
+            _page(f"Question {qid}", head + cards, rr2),
+            "text/html",
+        )
+        put(f"by-question/{_slug(qid)}.csv", _question_csv(sources), "text/csv")
+
+    # Per-bot and per-domain cross-views.
+    for bot, sources in sorted(by_b.items()):
+        sources = sorted(sources, key=lambda s: s.canonical_url)
+        body = f"<h1>Bot: {_esc(bot)}</h1><p class='muted'>{len(sources)} source(s)</p>"
+        body += "".join(_source_card(s, linker, rr2) for s in sources)
+        put(f"by-bot/{_slug(bot)}.html", _page(f"Bot {bot}", body, rr2), "text/html")
+
+    for domain, sources in sorted(by_d.items()):
+        sources = sorted(sources, key=lambda s: s.canonical_url)
+        body = f"<h1>Site: {_esc(domain)}</h1><p class='muted'>{len(sources)} source(s)</p>"
+        body += "".join(_source_card(s, linker, rr2) for s in sources)
+        put(
+            f"by-domain/{_slug(domain)}.html",
+            _page(f"Site {domain}", body, rr2),
+            "text/html",
+        )
+
+    # Landing + readme. rel_root: catalog/ -> ../
+    rr1 = "../"
+    index_body = _index_body(data, by_q, by_b, by_d)
+    put("index.html", _page("Catalog", index_body, rr1), "text/html")
+    put("READ_ME_FIRST.html", _page("Read me first", _readme_body(), rr1), "text/html")
+
+    return CatalogSummary(  # counts cover catalog pages only; exclusions are tallied apart
+        sources=len(data.sources),
+        captured=sum(s.captured for s in data.sources),
+        questions=len(by_q),
+        bots=len(by_b),
+        domains=len(by_d),
+        excluded=data.excluded,
+    )
+
+
+def _index_body(data, by_q, by_b, by_d) -> str:
+    """Landing-page body: headline counts, then a link grid for each view."""
+    def links(items: dict, view: str) -> str:
+        ordered = sorted(items, key=lambda k: (-len(items[k]), k))  # biggest first
+        return "".join(
+            f"<a class='grid' href='{view}/{_slug(k)}.html'>"
+            f"{_esc(k)} <span class='muted'>({len(items[k])})</span></a>"
+            for k in ordered
+        )
+
+    n_captured = sum(s.captured for s in data.sources)
+    hidden_note = ""
+    if data.hidden_total:
+        reasons = ", ".join(f"{k} {v}" for k, v in sorted(data.excluded.items()))
+        hidden_note = f" · {data.hidden_total} non-page URLs hidden ({reasons})"
+    counts = (
+        f"{len(data.sources)} page sources ({n_captured} captured) · "
+        f"{len(by_q)} questions · {len(by_b)} bots · {len(by_d)} sites{hidden_note}"
+    )
+    return (
+        "<p><a href='READ_ME_FIRST.html'>What is this? →</a></p>"
+        f"<p class='muted'>{counts}</p>"
+        "<h1>By question</h1><p class='muted'>The encyclopedia of sources per "
+        f"question — start here.</p>{links(by_q, 'by-question')}"
+        f"<h1>By bot</h1>{links(by_b, 'by-bot')}"
+        f"<h1>By site</h1>{links(by_d, 'by-domain')}"
+    )
+
+
+def _readme_body() -> str:
+    sections = [  # adjacent literals concatenate; commas separate the sections
+        "<h1>What is this bucket?</h1>"
+        "<p>This is a <b>source archive</b>: for every web page a forecasting bot "
+        "cited, we save a snapshot — the page's <b>HTML</b>, a full-page "
+        "<b>screenshot</b>, and a clean <b>markdown</b> copy — so a forecast can be "
+        "audited later even if the original page changes or disappears.</p>",
+        "<h1>How to browse it</h1><ul>"
+        "<li>Open <b>index.html</b> (the catalog home).</li>"
+        "<li><b>By question</b> is the main view: pick a question to see every "
+        "source used for it, who used it, and a screenshot of each.</li>"
+        "<li><b>By bot</b> shows one bot's sources across questions; <b>By site</b> "
+        "groups sources by website.</li>"
+        "<li>Each question also has a <b>CSV</b> you can open in a spreadsheet.</li>"
+        "</ul>",
+        "<p class='muted'>The folders with long hash names (content/, index/) are "
+        "the machine-readable store — you don't need to open those.</p>",
+    ]
+    return "".join(sections)
diff --git a/forecasting_tools/agents_and_tools/source_archive/cli.py b/forecasting_tools/agents_and_tools/source_archive/cli.py
index c2eed8db..d5ec2545 100644
--- a/forecasting_tools/agents_and_tools/source_archive/cli.py
+++ b/forecasting_tools/agents_and_tools/source_archive/cli.py
@@ -24,7 +24,6 @@
 from forecasting_tools.agents_and_tools.source_archive.fetchers import (
     build_default_fetcher,
 )
-from forecasting_tools.agents_and_tools.source_archive.pipeline import CapturePipeline
 
 
 def _load_dotenv() -> None:
@@ -69,6 +68,11 @@ def _cmd_check(config: ArchiveConfig) -> int:
     print(f"  AWS profile          : {config.aws_profile or '(default chain)'}")
     print(f"  AWS region           : {config.aws_region or '(default)'}")
     print(f"  Firecrawl API key    : {_mask(config.firecrawl_api_key)}")
+    print(f"  Firecrawl proxy mode : {config.firecrawl_proxy}")
+    print(f"  Hyperbrowser API key : {_mask(config.hyperbrowser_api_key)}")
+    print(f"  Hyperbrowser proxy   : {config.hyperbrowser_use_proxy}")
+    print(f"  CloakBrowser module  : {config.cloakbrowser_import}")
+    print(f"  PDF max pages        : {config.pdf_max_pages}")
     print(f"  TTL (days)           : {config.ttl_days}")
     print(f"  Screenshot format    : {config.screenshot_format}")
     print(f"  Screenshot max height: {config.screenshot_max_height}")
@@ -76,19 +80,64 @@ def _cmd_check(config: ArchiveConfig) -> int:
 
 
 def _cmd_capture(args, config: ArchiveConfig) -> int:
+    from forecasting_tools.agents_and_tools.source_archive.manifest import unique_urls
+    from forecasting_tools.agents_and_tools.source_archive.pipeline import (
+        capture_urls_concurrent,
+    )
+
     records = manifest_io.read_file(args.manifest)
+
+    overrides = {}
+    if getattr(args, "no_hyperbrowser", False):
+        overrides["hyperbrowser_api_key"] = None
+    if getattr(args, "concurrency", None):
+        overrides["concurrency"] = args.concurrency
+    if overrides:
+        config = config.model_copy(update=overrides)
+    if "hyperbrowser_api_key" in overrides:
+        print("Hyperbrowser fallback DISABLED for this run.")
+
     store = ContentStore(_make_blob_store(config, args.local, args.bucket), config)
 
+    urls = list(unique_urls(records))
+    if args.limit:
+        urls = urls[: args.limit]
     target = args.local or f"s3://{args.bucket or config.s3_bucket}/{config.s3_prefix}"
-    print(f"Capturing {len(records)} citation record(s) -> {target}")
+    print(
+        f"Capturing {len(urls)} URL(s) at concurrency {config.concurrency} -> {target}"
+    )
 
-    with build_default_fetcher(config) as fetcher:
-        pipeline = CapturePipeline(fetcher, store)
-        summary = pipeline.run_manifest(records)
+    summary = capture_urls_concurrent(urls, store, config, build_default_fetcher)
     print(summary)
 
+    run_id = args.run_id or (records[0].run_id if records else None)
+    if run_id:
+        from forecasting_tools.agents_and_tools.source_archive import reports
+
+        reports.write_run_report(store.blobs, run_id, summary, config)
+        print(f"Wrote run outcomes -> {config.s3_prefix}/reports/{run_id}.json")
+
+    # Failures leave no cache entry, so re-running retries exactly them. Write a
+    # retry manifest (with provenance) so coming back — e.g. with hyperbrowser
+    # re-enabled — is one command over only the sites that still need it.
+    failed = {
+        o.url for o in summary.outcomes if o.status in ("quality_failed", "error")
+    }
+    if failed:
+        from forecasting_tools.agents_and_tools.source_archive.ingest import (
+            dedupe_records,
+        )
+
+        retry_records = dedupe_records(r for r in records if r.url in failed)
+        retry_path = args.retry_out or f"{run_id or 'run'}_needs_retry.jsonl"
+        manifest_io.write_file(retry_path, retry_records)
+        print(
+            f"{len(failed)} URL(s) failed -> retry manifest {retry_path}\n"
+            f"  come back later with:  source-archive capture {retry_path} "
+            f"--run-id {run_id or '<run-id>'}   (hyperbrowser on by default)"
+        )
+
     if args.upload_manifest:
-        run_id = args.run_id or (records[0].run_id if records else None)
         if not run_id:
             sys.exit("--upload-manifest needs --run-id (no run_id found in records)")
         manifest_io.write_blob(store.blobs, run_id, records, config)
@@ -96,19 +145,71 @@ def _cmd_capture(args, config: ArchiveConfig) -> int:
     return 0
 
 
-def _cmd_harvest(args, config: ArchiveConfig) -> int:
+def _cmd_ingest_traces(args, config: ArchiveConfig) -> int:
     from forecasting_tools.agents_and_tools.source_archive.ingest import (
-        MetaculusCommentHarvester,
+        dedupe_records,
+        harvest_run,
     )
 
-    run_id = args.run_id or f"metaculus-comments-{args.project_id}"
-    harvester = MetaculusCommentHarvester()
-    records = harvester.harvest_project(args.project_id, run_id=run_id)
-    print(
-        f"Harvested {len(records)} citation record(s) from project "
-        f"{args.project_id}"
+    run_id = args.run_id  # None -> derived from the run dir name
+    records = harvest_run(args.run_dir, run_id=run_id, bot=args.bot)
+    if args.dedupe:
+        records = dedupe_records(records)
+    run_id = run_id or (records[0].run_id if records else None)
+    print(f"Ingested {len(records)} citation record(s) from traces in {args.run_dir}")
+
+    out_path = args.out or f"{run_id or 'traces'}.jsonl"
+    if not args.upload or args.out:
+        manifest_io.write_file(out_path, records)
+        print(f"Wrote manifest -> {out_path}")
+    if args.upload:
+        if not run_id:
+            sys.exit("--upload needs a run id (pass --run-id; none found in records)")
+        store = _make_blob_store(config, None, args.bucket)
+        manifest_io.write_blob(store, run_id, records, config)
+        print(f"Uploaded manifest -> {config.s3_prefix}/manifests/{run_id}.jsonl")
+    return 0
+
+
+def _cmd_catalog(args, config: ArchiveConfig) -> int:
+    from forecasting_tools.agents_and_tools.source_archive.catalog import write_catalog
+
+    store = _make_blob_store(config, args.local, args.bucket)
+    target = args.local or f"s3://{args.bucket or config.s3_bucket}/{config.s3_prefix}"
+    print(f"Building catalog from manifests + index -> {target}/catalog/")
+    summary = write_catalog(store, config)
+    print(summary)
+    print(f"Open {config.s3_prefix}/catalog/index.html")
+    return 0
+
+
+def _cmd_harvest_db(args, config: ArchiveConfig) -> int:
+    from forecasting_tools.agents_and_tools.source_archive.ingest import (
+        MetaculusDbHarvester,
+        dedupe_records,
+        resolve_dsn,
     )
 
+    dsn = resolve_dsn(args.dsn)
+    include_private = not args.public_only
+    harvester = MetaculusDbHarvester.from_dsn(dsn)
+    if args.post:
+        records = harvester.harvest_post(
+            args.post, run_id=args.run_id, include_private=include_private
+        )
+        run_id = args.run_id or f"metaculus-db-post-{args.post}"
+    else:
+        records = harvester.harvest_recent(
+            days=args.days,
+            limit=args.limit,
+            run_id=args.run_id,
+            include_private=include_private,
+        )
+        run_id = args.run_id or f"metaculus-db-recent-{args.days}d"
+    if args.dedupe:
+        records = dedupe_records(records)
+    print(f"Harvested {len(records)} citation record(s) from the Metaculus DB")
+
     out_path = args.out or f"{run_id}.jsonl"
     if not args.upload or args.out:
         manifest_io.write_file(out_path, records)
@@ -120,6 +221,35 @@ def _cmd_harvest(args, config: ArchiveConfig) -> int:
     return 0
 
 
+def _cmd_coverage(args, config: ArchiveConfig) -> int:
+    from pathlib import Path
+
+    from forecasting_tools.agents_and_tools.source_archive import reports
+    from forecasting_tools.agents_and_tools.source_archive.catalog import build_sources
+    from forecasting_tools.agents_and_tools.source_archive.coverage import (
+        MODES,
+        coverage_from_sources,
+    )
+
+    store = _make_blob_store(config, args.local, args.bucket)
+    sources = build_sources(store, config)  # read manifests + index once
+    outcomes = reports.read_outcomes(store, config) or None  # falsy result -> None
+    modes = MODES if args.mode == "both" else (args.mode,)  # "both" -> every mode
+    for mode in modes:
+        report = coverage_from_sources(sources, mode, outcomes)
+        print(report)
+        print()
+        if args.csv:  # explicit utf-8: don't let the locale pick the file encoding
+            Path(f"{args.csv}_{mode}.csv").write_text(report.to_csv(), encoding="utf-8")
+            print(f"Wrote {args.csv}_{mode}.csv")
+            if report.missing_urls:
+                Path(f"{args.csv}_{mode}_missing.txt").write_text(
+                    "\n".join(report.missing_urls), encoding="utf-8"
+                )
+                print(f"Wrote {args.csv}_{mode}_missing.txt")
+    return 0
+
+
 def main(argv: list[str] | None = None) -> int:
     _load_dotenv()
     parser = argparse.ArgumentParser(
@@ -145,22 +275,110 @@ def main(argv: list[str] | None = None) -> int:
         help="also upload the manifest itself to manifests/<run_id>.jsonl",
     )
     cap.add_argument("--run-id", help="run id for the uploaded manifest")
+    cap.add_argument(
+        "--no-hyperbrowser",
+        action="store_true",
+        help="disable the Hyperbrowser fallback for this run (others still run)",
+    )
+    cap.add_argument(
+        "--retry-out",
+        metavar="FILE",
+        help="where to write the failed-URL retry manifest "
+        "(default: <run_id>_needs_retry.jsonl)",
+    )
+    cap.add_argument(
+        "--concurrency",
+        type=int,
+        metavar="N",
+        help="parallel browser workers for this run (overrides WEB_ARCHIVE_CONCURRENCY)",
+    )
+    cap.add_argument(
+        "--limit",
+        type=int,
+        metavar="N",
+        help="capture only the first N URLs (chunk a big manifest; resume via cache)",
+    )
 
-    harv = sub.add_parser(
-        "harvest",
-        help="harvest cited URLs from bot comments on a Metaculus project",
+    ing = sub.add_parser(
+        "ingest-traces",
+        help="build a manifest from a traced bot run directory (bot_*/q_*/traces_*.jsonl)",
     )
-    harv.add_argument("project_id", help="Metaculus project / tournament id")
-    harv.add_argument(
+    ing.add_argument("run_dir", help="path to a traced run directory")
+    ing.add_argument(
         "--out", metavar="FILE", help="write the manifest to this .jsonl file"
     )
-    harv.add_argument(
-        "--run-id", help="run id (default: metaculus-comments-<project_id>)"
+    ing.add_argument("--run-id", help="run id (default: the run dir's name)")
+    ing.add_argument(
+        "--bot",
+        help="bot name for a flat (no bot_*/) layout (default: the run dir's name)",
+    )
+    ing.add_argument(
+        "--dedupe", action="store_true", help="keep one record per URL (first seen)"
     )
-    harv.add_argument(
+    ing.add_argument(
         "--upload", action="store_true", help="upload the manifest to S3 manifests/"
     )
-    harv.add_argument("--bucket", help="override the S3 bucket")
+    ing.add_argument("--bucket", help="override the S3 bucket")
+
+    cat = sub.add_parser(
+        "catalog",
+        help="generate a coworker-legible HTML/CSV catalog (by question/bot/site)",
+    )
+    cat.add_argument(
+        "--local", metavar="DIR", help="read/write the catalog in this directory"
+    )
+    cat.add_argument("--bucket", help="override the S3 bucket")
+
+    hdb = sub.add_parser(
+        "harvest-db",
+        help="read a bot's cited URLs from the platform Postgres database (operator)",
+    )
+    grp = hdb.add_mutually_exclusive_group(required=True)
+    grp.add_argument("--post", help="harvest one post id")
+    grp.add_argument("--days", type=int, help="harvest the most recent N days")
+    hdb.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="cap rows when using --days (default: uncapped — a daily sweep wants all)",
+    )
+    hdb.add_argument(
+        "--public-only",
+        action="store_true",
+        help="read only public comments (default: read all of a bot's comments)",
+    )
+    hdb.add_argument(
+        "--dsn",
+        help="libpq DSN or postgresql:// URL. Default resolution: --dsn > "
+        "$METACULUS_DB_DSN > macOS Keychain item 'metaculus-db-dsn' > "
+        "dbname=metaculus. Prefer the Keychain for the real secret "
+        "(a --dsn value lands in shell history).",
+    )
+    hdb.add_argument("--out", metavar="FILE", help="write the manifest to this .jsonl")
+    hdb.add_argument("--run-id", help="run id (default derived from the slice)")
+    hdb.add_argument(
+        "--dedupe", action="store_true", help="keep one record per URL (first seen)"
+    )
+    hdb.add_argument(
+        "--upload", action="store_true", help="upload the manifest to S3 manifests/"
+    )
+    hdb.add_argument("--bucket", help="override the S3 bucket")
+
+    cov = sub.add_parser(
+        "coverage",
+        help="report what %% of cited sources were archived (trace vs comments)",
+    )
+    cov.add_argument(
+        "--mode",
+        choices=["trace", "comments", "both"],
+        default="both",
+        help="which report(s) to print (default: both)",
+    )
+    cov.add_argument(
+        "--csv", metavar="PREFIX", help="write PREFIX_<mode>.csv (+ _missing.txt)"
+    )
+    cov.add_argument("--local", metavar="DIR", help="read from this directory")
+    cov.add_argument("--bucket", help="override the S3 bucket")
 
     args = parser.parse_args(argv)
     config = ArchiveConfig.from_env()
@@ -169,8 +387,14 @@ def main(argv: list[str] | None = None) -> int:
         return _cmd_check(config)
     if args.command == "capture":
         return _cmd_capture(args, config)
-    if args.command == "harvest":
-        return _cmd_harvest(args, config)
+    if args.command == "ingest-traces":
+        return _cmd_ingest_traces(args, config)
+    if args.command == "harvest-db":
+        return _cmd_harvest_db(args, config)
+    if args.command == "catalog":
+        return _cmd_catalog(args, config)
+    if args.command == "coverage":
+        return _cmd_coverage(args, config)
     return 1
 
 
diff --git a/forecasting_tools/agents_and_tools/source_archive/config.py b/forecasting_tools/agents_and_tools/source_archive/config.py
index 2572ffc4..cfb643ef 100644
--- a/forecasting_tools/agents_and_tools/source_archive/config.py
+++ b/forecasting_tools/agents_and_tools/source_archive/config.py
@@ -19,17 +19,41 @@ def _get_int(name: str, default: int) -> int:
     return int(raw)
 
 
+def _get_bool(name: str, default: bool) -> bool:
+    raw = os.environ.get(name)
+    if raw is None or raw == "":  # unset or empty -> fall back to the default
+        return default
+    return raw.strip().lower() in ("1", "true", "yes", "on")  # anything else -> False
+
+
 class ArchiveConfig(BaseModel):
     """Runtime configuration. Construct directly in tests, or ``from_env()``."""
 
     s3_bucket: str | None = None
     s3_prefix: str = "source-archive"
+    # Local archive directory. When set, the viewer reads captures from here
+    # instead of S3 — handy for inspecting a local capture run with no AWS.
+    local_dir: str | None = None
     aws_profile: str | None = None
     aws_region: str | None = None
     firecrawl_api_key: str | None = None
+    # Firecrawl proxy mode for the anti-bot path: "basic" (1 credit) | "auto"
+    # (1 credit, escalates to 5 on fallback) | "stealth"/"enhanced" (5 credits).
+    # Only the fallback Firecrawl tier pays this; basic is the default.
+    firecrawl_proxy: str = "basic"
+    hyperbrowser_api_key: str | None = None
+    # Hyperbrowser session knobs for the anti-bot path. Proxy turns a 1-credit
+    # scrape into a 10-credit one, so leave it on only for the Cloudflare tier.
+    hyperbrowser_use_proxy: bool = True
+    hyperbrowser_use_stealth: bool = True
+    hyperbrowser_solve_captchas: bool = True
+    # CloakBrowser exposes ``cloakbrowser.launch() -> Browser``; the module name
+    # is overridable in case the package is renamed.
+    cloakbrowser_import: str = "cloakbrowser"
+    pdf_max_pages: int = 50  # cap PDF parsing so a huge report can't blow latency/cost
     ttl_days: int = 14
     screenshot_format: str = "webp"  # webp | jpeg | png
-    screenshot_max_height: int = 4000  # px; cap full-page captures
+    screenshot_max_height: int = 16_000  # px; safety cap (under WebP's 16383 limit)
     nav_timeout_ms: int = 30_000
     concurrency: int = 5
 
@@ -38,13 +62,27 @@ def from_env(cls) -> "ArchiveConfig":
         return cls(
             s3_bucket=os.environ.get("WEB_ARCHIVE_S3_BUCKET"),
             s3_prefix=os.environ.get("WEB_ARCHIVE_S3_PREFIX", "source-archive"),
+            local_dir=os.environ.get("WEB_ARCHIVE_LOCAL_DIR"),
             aws_profile=os.environ.get("WEB_ARCHIVE_AWS_PROFILE"),
             aws_region=os.environ.get("AWS_REGION")
             or os.environ.get("AWS_DEFAULT_REGION"),
             firecrawl_api_key=os.environ.get("FIRECRAWL_API_KEY"),
+            firecrawl_proxy=os.environ.get("WEB_ARCHIVE_FIRECRAWL_PROXY", "basic"),
+            hyperbrowser_api_key=os.environ.get("HYPERBROWSER_API_KEY"),
+            hyperbrowser_use_proxy=_get_bool("WEB_ARCHIVE_HYPERBROWSER_PROXY", True),
+            hyperbrowser_use_stealth=_get_bool(
+                "WEB_ARCHIVE_HYPERBROWSER_STEALTH", True
+            ),
+            hyperbrowser_solve_captchas=_get_bool(
+                "WEB_ARCHIVE_HYPERBROWSER_CAPTCHA", True
+            ),
+            cloakbrowser_import=os.environ.get(
+                "WEB_ARCHIVE_CLOAKBROWSER_IMPORT", "cloakbrowser"
+            ),
+            pdf_max_pages=_get_int("WEB_ARCHIVE_PDF_MAX_PAGES", 50),
             ttl_days=_get_int("WEB_ARCHIVE_TTL_DAYS", 14),
             screenshot_format=os.environ.get("WEB_ARCHIVE_SCREENSHOT_FORMAT", "webp"),
-            screenshot_max_height=_get_int("WEB_ARCHIVE_SCREENSHOT_MAX_HEIGHT", 4000),
+            screenshot_max_height=_get_int("WEB_ARCHIVE_SCREENSHOT_MAX_HEIGHT", 16_000),
             nav_timeout_ms=_get_int("WEB_ARCHIVE_NAV_TIMEOUT_MS", 30_000),
             concurrency=_get_int("WEB_ARCHIVE_CONCURRENCY", 5),
         )
diff --git a/forecasting_tools/agents_and_tools/source_archive/content_store.py b/forecasting_tools/agents_and_tools/source_archive/content_store.py
index 7481ab93..800ead70 100644
--- a/forecasting_tools/agents_and_tools/source_archive/content_store.py
+++ b/forecasting_tools/agents_and_tools/source_archive/content_store.py
@@ -12,17 +12,33 @@
     already stored, skip the write (dedup identical re-fetches) and just refresh
     timestamps.
 
+**Redirect aliasing.** A capture is keyed by its *final* URL (after redirects),
+so a link shortener (``bit.ly/x``) and the page it resolves to collapse onto one
+capture instead of two. The original cited URL gets a tiny **alias index** that
+points at the final URL's index, and the final URL's index lists its aliases for
+provenance. So ``lookup(bit.ly/x)`` and ``lookup(final)`` both hit the same
+stored page, and we never store the destination twice.
+
+**Cross-URL content dedup.** Different URLs that return byte-identical content
+share one set of blobs instead of each storing its own html/markdown/screenshot.
+The first URL to store a given content owns the blobs; later URLs get a capture
+whose blob keys point back at them and whose ``content_alias_of`` names the owner.
+A reverse ``index/by-content/<content_hash>.json`` tracks the owner and all member URLs.
+
 Object layout (under ``config.s3_prefix``)::
 
-    index/<url_hash>.json                       per-URL index + capture history
-    content/<url_hash>/<content_hash>.html
-    content/<url_hash>/<content_hash>.<img_ext>
-    content/<url_hash>/<content_hash>.md
+    index/<final_url_hash>.json        canonical: capture history + "aliases"
+    index/<orig_url_hash>.json         alias: {"alias_of": <final_url_hash>}
+    index/by-content/<content_hash>.json   reverse: owner + member urls
+    content/<final_url_hash>/<content_hash>.html
+    content/<final_url_hash>/<content_hash>.<img_ext>
+    content/<final_url_hash>/<content_hash>.md
 """
 
 from __future__ import annotations
 
 import json
+import threading
 from datetime import datetime, timedelta, timezone
 
 from pydantic import BaseModel
@@ -58,6 +74,11 @@ def __init__(self, blob_store: BlobStore, config: ArchiveConfig | None = None):
         self.blobs = blob_store
         self.config = config or ArchiveConfig()
         self.prefix = self.config.s3_prefix.rstrip("/")
+        # Serializes the shared by-content reverse index across capture threads
+        # (concurrent runs). Per-URL index files are written by a single thread
+        # each, so they don't need it; the by-content index can be contended when
+        # different URLs return identical content.
+        self._content_lock = threading.Lock()
 
     # --- key helpers -------------------------------------------------------
     def _index_key(self, uh: str) -> str:
@@ -66,6 +87,9 @@ def _index_key(self, uh: str) -> str:
     def _content_key(self, uh: str, ch: str, ext: str) -> str:
         return f"{self.prefix}/content/{uh}/{ch}.{ext}"
 
+    def _content_index_key(self, ch: str) -> str:
+        return f"{self.prefix}/index/by-content/{ch}.json"
+
     # --- index io ----------------------------------------------------------
     def _read_index(self, uh: str) -> dict | None:
         key = self._index_key(uh)
@@ -77,16 +101,61 @@ def _write_index(self, uh: str, index: dict) -> None:
         data = json.dumps(index, indent=2, sort_keys=True).encode("utf-8")
         self.blobs.put(self._index_key(uh), data, content_type="application/json")
 
+    def _read_content_index(self, ch: str) -> dict | None:
+        key = self._content_index_key(ch)
+        if not self.blobs.exists(key):
+            return None
+        try:
+            return json.loads(self.blobs.get(key).decode("utf-8"))
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            # A concurrent writer may have left a partial local file mid-write;
+            # treat as absent rather than crash. The locked path below is authoritative.
+            return None
+
+    def _register_content(
+        self, ch: str, uh: str, url: str, blob_keys: dict | None
+    ) -> None:
+        """Record that ``uh`` produced content ``ch`` in the reverse index.
+
+        The first URL to store a given content becomes its ``canonical_url_hash``
+        and owns the blob keys; later URLs with identical content are added as
+        ``members`` and reuse those blobs (see :meth:`store`). Locked so concurrent
+        capture threads with identical content don't clobber each other's members.
+        """
+        with self._content_lock:
+            reverse = self._read_content_index(ch)
+            if reverse is None:
+                reverse = {
+                    "content_hash": ch,
+                    "canonical_url_hash": uh,
+                    "blob_keys": blob_keys or {},
+                    "members": [],
+                }
+            members = reverse.setdefault("members", [])
+            if not any(m.get("url_hash") == uh for m in members):
+                members.append({"url_hash": uh, "url": url})
+            data = json.dumps(reverse, indent=2, sort_keys=True).encode("utf-8")
+            self.blobs.put(
+                self._content_index_key(ch), data, content_type="application/json"
+            )
+
     # --- public api --------------------------------------------------------
     def lookup(self, url: str) -> StoredCapture | None:
         """Return the latest stored capture if within the TTL, else ``None``.
 
-        A non-``None`` return means callers can skip fetching this URL.
+        A non-``None`` return means callers can skip fetching this URL. If ``url``
+        is an alias of a previously-redirected target, the alias is followed to
+        the canonical capture.
         """
         uh = url_hash(url)
         index = self._read_index(uh)
         if not index:
             return None
+        alias_of = index.get("alias_of")  # own captures outrank a stale pointer
+        if alias_of and not index.get("captures"):  # pure alias -> canonical index
+            index = self._read_index(alias_of)
+            if not index:  # dangling alias: the canonical index is missing
+                return None
         latest_ch = index.get("latest_content_hash")
         captures = index.get("captures", {})
         latest = captures.get(latest_ch)
@@ -100,13 +169,20 @@ def lookup(self, url: str) -> StoredCapture | None:
         return StoredCapture.model_validate(latest)
 
     def store(self, result: CaptureResult) -> StoreResult:
-        """Persist a capture, deduping by content hash. Always updates the index."""
-        uh = url_hash(result.url)
+        """Persist a capture, deduping by content hash. Always updates the index.
+
+        The capture is keyed by the *final* URL (after redirects). If the cited
+        URL differs from the final one, an alias index is written so the cited
+        URL still resolves here, and the cited URL is recorded under the
+        canonical index's ``aliases``.
+        """
+        final_url = result.final_url or result.url
+        uh = url_hash(final_url)
         ch = result.content_hash
         now = utcnow_iso()
 
         index = self._read_index(uh) or {
-            "url": result.url,
+            "url": final_url,
             "url_hash": uh,
             "first_seen": now,
             "captures": {},
@@ -116,33 +192,48 @@ def store(self, result: CaptureResult) -> StoreResult:
 
         created = existing is None
         if existing is not None:
-            # Identical content already stored — skip blob writes, refresh time.
+            # Identical content already stored for THIS url — skip writes, touch.
             existing["last_seen"] = now
             stored = StoredCapture.model_validate(existing)
         else:
-            html_key = screenshot_key = markdown_key = None
-            if result.html is not None:
-                html_key = self._content_key(uh, ch, "html")
-                self.blobs.put(
-                    html_key, result.html.encode("utf-8"), content_type="text/html"
-                )
-            if result.markdown is not None:
-                markdown_key = self._content_key(uh, ch, "md")
-                self.blobs.put(
-                    markdown_key,
-                    result.markdown.encode("utf-8"),
-                    content_type="text/markdown",
-                )
-            if result.screenshot is not None:
-                ext = _IMG_EXT.get(result.screenshot_content_type or "", "png")
-                screenshot_key = self._content_key(uh, ch, ext)
-                self.blobs.put(
-                    screenshot_key,
-                    result.screenshot,
-                    content_type=result.screenshot_content_type,
-                )
+            reverse = self._read_content_index(ch)
+            reuse = bool(
+                reverse and reverse.get("canonical_url_hash") not in (None, uh)
+            )
+            if reuse:
+                # Byte-identical content already stored under a DIFFERENT url —
+                # point at its blobs instead of writing three more (cross-URL
+                # content dedup); each url still keeps its own index history.
+                bk = reverse.get("blob_keys", {})
+                html_key = bk.get("html")
+                markdown_key = bk.get("markdown")
+                screenshot_key = bk.get("screenshot")
+                content_alias_of = reverse["canonical_url_hash"]
+            else:
+                html_key = screenshot_key = markdown_key = None
+                if result.html is not None:
+                    html_key = self._content_key(uh, ch, "html")
+                    self.blobs.put(
+                        html_key, result.html.encode("utf-8"), content_type="text/html"
+                    )
+                if result.markdown is not None:
+                    markdown_key = self._content_key(uh, ch, "md")
+                    self.blobs.put(
+                        markdown_key,
+                        result.markdown.encode("utf-8"),
+                        content_type="text/markdown",
+                    )
+                if result.screenshot is not None:
+                    ext = _IMG_EXT.get(result.screenshot_content_type or "", "png")
+                    screenshot_key = self._content_key(uh, ch, ext)
+                    self.blobs.put(
+                        screenshot_key,
+                        result.screenshot,
+                        content_type=result.screenshot_content_type,
+                    )
+                content_alias_of = None
             stored = StoredCapture(
-                url=result.url,
+                url=final_url,
                 url_hash=uh,
                 content_hash=ch,
                 status_code=result.status_code,
@@ -151,12 +242,62 @@ def store(self, result: CaptureResult) -> StoreResult:
                 html_key=html_key,
                 screenshot_key=screenshot_key,
                 markdown_key=markdown_key,
+                content_alias_of=content_alias_of,
                 first_seen=now,
                 last_seen=now,
             )
             captures[ch] = stored.model_dump()
+            self._register_content(
+                ch,
+                uh,
+                final_url,
+                blob_keys=(
+                    None
+                    if reuse
+                    else {
+                        "html": html_key,
+                        "markdown": markdown_key,
+                        "screenshot": screenshot_key,
+                    }
+                ),
+            )
 
         index["latest_content_hash"] = ch
         index["last_checked"] = now
+
+        # If the cited URL redirected to a different final URL, record the alias.
+        orig_uh = url_hash(result.url)
+        if orig_uh != uh:
+            aliases = index.setdefault("aliases", [])
+            if result.url not in aliases:
+                aliases.append(result.url)
+
         self._write_index(uh, index)
+
+        if orig_uh != uh:
+            self._write_alias(orig_uh, result.url, uh, now)
+
         return StoreResult(capture=stored, created=created)
+
+    def _write_alias(
+        self, orig_uh: str, orig_url: str, final_uh: str, now: str
+    ) -> None:
+        """Write/refresh a pointer from a cited URL's hash to its final capture.
+
+        Never clobbers a canonical index (one that already holds captures), so a
+        URL fetched directly in the past keeps its own history.
+        """
+        existing = self._read_index(orig_uh)
+        if existing and existing.get("captures"):
+            return
+        first_seen = existing.get("first_seen", now) if existing else now
+        self._write_index(
+            orig_uh,
+            {
+                "url": orig_url,
+                "url_hash": orig_uh,
+                "alias_of": final_uh,
+                "first_seen": first_seen,
+                "last_checked": now,
+            },
+        )
diff --git a/forecasting_tools/agents_and_tools/source_archive/coverage.py b/forecasting_tools/agents_and_tools/source_archive/coverage.py
new file mode 100644
index 00000000..9b812e9b
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/coverage.py
@@ -0,0 +1,237 @@
+"""Coverage reports: what fraction of cited sources did we actually archive?
+
+Two **separate** reports, split by ingestion path — they have different
+denominators and different notions of ground truth, so they must not be blurred:
+
+- ``trace`` — the complex/template bot's own instrumented runs (metac-ai-sdk).
+  Traces record *every* URL the bot touched, so this is a true archival
+  success-rate against ground truth.
+- ``comments`` — every bot (Metaculus's own + outside bots) harvested from public
+  Metaculus comments. Comments are length-truncated, so the denominator is itself
+  incomplete: coverage here means "of the links we could *see* in comments, how
+  many we archived" — a weaker claim than the trace report.
+
+For each mode: denominator = distinct canonical **page** sources cited (tool/API
+calls excluded, same rule as the catalog); numerator = those with a successful
+capture in the index. Misses are bucketed by site. When per-run outcomes have
+been saved (``reports/``), misses are also split into "never fetched" (the
+collection gap) and "fetched but failed"; without them only the site is known.
+"""
+
+from __future__ import annotations
+
+import csv
+import io
+from collections import defaultdict
+
+from pydantic import BaseModel
+
+from forecasting_tools.agents_and_tools.source_archive.catalog import (
+    Source,
+    build_sources,
+    exclusion_reason,
+)
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
+    BlobStore,
+)
+
+MODES = ("trace", "comments")
+_COMMENT_ORIGINS = {"metaculus_comment"}
+
+
+def citation_mode(citation) -> str:
+    return "comments" if (citation.origin or "") in _COMMENT_ORIGINS else "trace"
+
+
+class CoverageRow(BaseModel):
+    label: str
+    cited: int = 0
+    captured: int = 0
+
+    @property
+    def pct(self) -> float:
+        return round(100 * self.captured / self.cited, 1) if self.cited else 0.0
+
+
+class CoverageReport(BaseModel):
+    mode: str
+    cited: int = 0
+    captured: int = 0
+    excluded: dict[str, int] = {}  # non-source reason -> count
+    by_question: list[CoverageRow] = []
+    by_bot: list[CoverageRow] = []
+    by_tool: list[CoverageRow] = []
+    missed_by_domain: list[CoverageRow] = []
+    missing_urls: list[str] = []
+    # Populated only when per-run outcomes (reports/) exist:
+    has_outcomes: bool = False
+    missing_never_fetched: int = 0  # the real collection gap
+    missing_fetch_failed: int = 0  # tried, failed (Cloudflare/PDF/…)
+
+    @property
+    def pct(self) -> float:
+        return round(100 * self.captured / self.cited, 1) if self.cited else 0.0
+
+    @property
+    def missing(self) -> int:
+        return self.cited - self.captured
+
+    def __str__(self) -> str:
+        title = {
+            "trace": "Trace coverage — complex/template bot (ground truth)",
+            "comments": "Comment coverage — all bots (truncated denominator)",
+        }.get(self.mode, self.mode)
+        excl = (
+            "  (excluded as non-sources: "
+            + ", ".join(f"{k} {v}" for k, v in sorted(self.excluded.items()))
+            + ")"
+            if self.excluded
+            else ""
+        )
+        lines = [
+            title,
+            "=" * len(title),
+            # Lead with the collection gap: this report exists to tell us whether
+            # there are sources bots are using that we are NOT yet archiving.
+            f"{self.missing} of {self.cited} cited page sources are NOT yet in the "
+            f"archive — candidates to collect  ({self.captured} archived, "
+            f"{self.pct}%).",
+            excl,
+        ]
+        if self.has_outcomes:
+            lines.append(
+                f"  of those {self.missing}: {self.missing_never_fetched} were "
+                f"never fetched (collection gap), {self.missing_fetch_failed} "
+                f"were fetched but failed."
+            )
+        if self.mode == "comments":
+            lines.append(
+                "  note: comments are length-truncated, so even this denominator "
+                "under-counts what bots actually read — the true gap is larger."
+            )
+
+        def table(header: str, rows: list[CoverageRow], n: int = 8) -> None:
+            if not rows:
+                return
+            lines.append("")
+            lines.append(f"--- {header} ---")
+            for r in rows[:n]:
+                lines.append(f"  {r.captured:>4}/{r.cited:<4} {r.pct:>5}%  {r.label}")
+            if len(rows) > n:
+                lines.append(f"  … +{len(rows) - n} more")
+
+        table("by question (most-cited first)", self.by_question)
+        table("by bot", self.by_bot)
+        if self.mode == "trace":
+            table("by tool", self.by_tool)
+        table("biggest collection gaps by site (captured/cited)", self.missed_by_domain)
+        if self.missing_urls:
+            lines.append("")
+            lines.append(
+                f"--- {len(self.missing_urls)} source(s) to collect (first 10) ---"
+            )
+            for u in self.missing_urls[:10]:
+                lines.append(f"  {u}")
+        return "\n".join(lines)
+
+    def to_csv(self) -> str:  # header row, overall row, then one row per group entry
+        out = io.StringIO()
+        writer = csv.writer(out)
+        writer.writerow(["group", "label", "cited", "captured", "pct"])
+        writer.writerow(["overall", self.mode, self.cited, self.captured, self.pct])
+        sections = {
+            "question": self.by_question,
+            "bot": self.by_bot,
+            "tool": self.by_tool,
+            "missed_domain": self.missed_by_domain,
+        }
+        for name, rows in sections.items():
+            writer.writerows([name, r.label, r.cited, r.captured, r.pct] for r in rows)
+        return out.getvalue()
+
+
+def _grouped(scoped: list[tuple[Source, list]], key_of) -> list[CoverageRow]:
+    # Per key: [sources citing it, of those captured]; keyless sources -> "(none)".
+    hits: dict[str, list[int]] = defaultdict(lambda: [0, 0])
+    for source, cits in scoped:
+        found = {key for key in map(key_of, cits) if key}
+        for key in found or {"(none)"}:
+            hits[key][0] += 1
+            hits[key][1] += 1 if source.captured else 0
+    rows = [CoverageRow(label=k, cited=c, captured=n) for k, (c, n) in hits.items()]
+    return sorted(rows, key=lambda row: (-row.cited, row.label))
+
+
+def coverage_from_sources(
+    sources: list[Source], mode: str, outcomes: dict[str, str] | None = None
+) -> CoverageReport:
+    """Summarize how many ``mode``-cited, non-excluded sources are captured.
+
+    ``outcomes`` maps canonical URL -> fetch status; ``None`` skips the gap split.
+    """
+    scoped: list[tuple[Source, list]] = []
+    excluded: dict[str, int] = defaultdict(int)
+    for s in sources:
+        cits = [c for c in s.citations if citation_mode(c) == mode]
+        if not cits:
+            continue
+        reason = exclusion_reason(s.canonical_url, cits)
+        if reason:
+            excluded[reason] += 1
+            continue
+        scoped.append((s, cits))
+
+    captured = sum(1 for s, _ in scoped if s.captured)
+
+    never_fetched = failed = 0
+    if outcomes is not None:
+        # No recorded status -> never attempted. Any recorded status on a
+        # still-uncaptured source counts as fetched-but-failed, so only the
+        # presence of a status matters here, not its value.
+        for s, _ in scoped:
+            if s.captured:
+                continue
+            if outcomes.get(s.canonical_url) is None:
+                never_fetched += 1
+            else:
+                failed += 1
+
+    domain_agg: dict[str, list[int]] = defaultdict(lambda: [0, 0])
+    for s, _ in scoped:
+        domain_agg[s.domain][0] += 1
+        if s.captured:
+            domain_agg[s.domain][1] += 1
+    missed_by_domain = sorted(
+        (
+            CoverageRow(label=d, cited=c, captured=cap)
+            for d, (c, cap) in domain_agg.items()
+            if cap < c
+        ),
+        key=lambda r: (-(r.cited - r.captured), r.label),
+    )
+
+    return CoverageReport(
+        mode=mode,
+        cited=len(scoped),
+        captured=captured,
+        excluded=dict(excluded),
+        by_question=_grouped(scoped, lambda c: c.question_id),
+        by_bot=_grouped(scoped, lambda c: c.bot),
+        by_tool=_grouped(scoped, lambda c: c.tool_name),
+        missed_by_domain=missed_by_domain,
+        missing_urls=sorted(s.canonical_url for s, _ in scoped if not s.captured),
+        has_outcomes=outcomes is not None,
+        missing_never_fetched=never_fetched,
+        missing_fetch_failed=failed,
+    )
+
+
+def build_coverage(
+    store: BlobStore, config: ArchiveConfig, mode: str
+) -> CoverageReport:
+    # Function-local import; an empty (falsy) outcomes result is passed as None.
+    from forecasting_tools.agents_and_tools.source_archive.reports import read_outcomes
+
+    catalog = build_sources(store, config)
+    return coverage_from_sources(catalog, mode, read_outcomes(store, config) or None)
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py
index 758aa87e..b136ea66 100644
--- a/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py
@@ -1,19 +1,32 @@
 """Fetchers turn a URL into a CaptureResult (HTML + screenshot + markdown).
 
 Most callers want :func:`build_default_fetcher`, which wires the recommended
-tiered setup: self-hosted Playwright primary, Firecrawl fallback.
+cost-ordered tiered setup: self-hosted Playwright primary, then CloakBrowser,
+PDF, Hyperbrowser, and Firecrawl backups.
 """
 
 from __future__ import annotations
 
+import logging
+
 from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
 from forecasting_tools.agents_and_tools.source_archive.fetchers.base import (
     Fetcher,
     FetchError,
 )
+from forecasting_tools.agents_and_tools.source_archive.fetchers.cloakbrowser_fetcher import (
+    CloakBrowserFetcher,
+)
 from forecasting_tools.agents_and_tools.source_archive.fetchers.firecrawl_fetcher import (
     FirecrawlFetcher,
 )
+from forecasting_tools.agents_and_tools.source_archive.fetchers.hyperbrowser_fetcher import (
+    HyperbrowserFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.pdf_fetcher import (
+    PdfFetcher,
+    looks_like_pdf,
+)
 from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import (
     PlaywrightFetcher,
 )
@@ -21,12 +34,18 @@
     TieredFetcher,
 )
 
+logger = logging.getLogger(__name__)
+
 __all__ = [
     "Fetcher",
     "FetchError",
+    "CloakBrowserFetcher",
     "FirecrawlFetcher",
+    "HyperbrowserFetcher",
+    "PdfFetcher",
     "PlaywrightFetcher",
     "TieredFetcher",
+    "looks_like_pdf",
     "build_default_fetcher",
 ]
 
@@ -39,29 +58,64 @@ def build_default_fetcher(config: ArchiveConfig | None = None) -> PlaywrightFetc
         with build_default_fetcher(config) as fetcher:
             fetcher.fetch(url)
 
-    Playwright runs first; if a page fails to render or trips the quality gate
-    and a Firecrawl API key is configured, Firecrawl is tried as a fallback.
-
-    The returned object is a :class:`PlaywrightFetcher` so the browser lifecycle
-    is managed by ``with``. On ``__enter__`` it transparently composes itself
-    with Firecrawl (when available) behind a :class:`TieredFetcher`.
+    Backends are tried in **cost order** — the first capture that passes the
+    quality gate wins, so the cheap tiers absorb most of the tail and the paid
+    ones only fire on what survives them:
+
+    1. **Self-hosted browser** (~free) — the primary; ~70% of URLs. Uses
+       **CloakBrowser** (patched Chromium; matches-or-beats vanilla Playwright on
+       anti-bot) when installed, else falls back to vanilla **Playwright**. Only
+       one browser is used: two live ``sync_playwright`` instances conflict in a
+       single process, so cloak *replaces* vanilla rather than stacking with it.
+    2. **PdfFetcher** (local, free; Firecrawl OCR fallback) — captures PDFs,
+       which the browsers can't render.
+    3. **Hyperbrowser** (managed) — consolidated anti-bot fallback. Added when
+       ``HYPERBROWSER_API_KEY`` is set.
+    4. **Firecrawl** (managed) — cheapest stealth + native-PDF safety net. Added
+       when ``FIRECRAWL_API_KEY`` is set.
+
+    The returned object is a :class:`PlaywrightFetcher` subclass so the single
+    browser's lifecycle is managed by ``with``.
     """
     config = config or ArchiveConfig()
     return _ManagedTieredFetcher(config)
 
 
 class _ManagedTieredFetcher(PlaywrightFetcher):
-    """PlaywrightFetcher whose ``fetch`` is delegated to a tiered pipeline.
-
-    Subclassing PlaywrightFetcher keeps the browser context-manager lifecycle
-    while letting us add the Firecrawl fallback once the browser is live.
+    """PlaywrightFetcher whose ``fetch`` is delegated to a cost-ordered tiered
+    pipeline. The single self-hosted browser is CloakBrowser when available
+    (overriding ``_launch_browser``), else vanilla Playwright; the extra backends
+    are composed once it is live.
     """
 
+    _primary_name = "playwright"
+
+    def _launch_browser(self):
+        # One self-hosted browser only: try CloakBrowser (patched Chromium) and
+        # fall back to vanilla Playwright, since two live sync_playwright
+        # instances conflict in one process. Returns (playwright_or_none, browser).
+        try:
+            handles = CloakBrowserFetcher(self.config)._launch_browser()
+        except FetchError as e:
+            logger.info("cloakbrowser unavailable, using vanilla Playwright: %s", e)
+            self._primary_name = "playwright"
+            return super()._launch_browser()
+        self._primary_name = "cloakbrowser"
+        return handles
+
     def __enter__(self) -> "_ManagedTieredFetcher":
-        super().__enter__()
-        backends: list[Fetcher] = [_PlaywrightOnly(self)]
+        super().__enter__()  # launches the chosen browser via _launch_browser
+        backends: list[Fetcher] = [_PrimaryBrowser(self, self._primary_name)]
+
+        # PDFs: free local parse (Firecrawl OCR fallback wired internally when a
+        # key is present). Cheap to keep in the chain unconditionally.
+        backends.append(PdfFetcher(self.config))
+
+        if self.config.hyperbrowser_api_key:
+            backends.append(HyperbrowserFetcher(self.config))
         if self.config.firecrawl_api_key:
             backends.append(FirecrawlFetcher(self.config))
+
         self._tiered = TieredFetcher(*backends)
         return self
 
@@ -69,14 +123,16 @@ def fetch(self, url: str):  # type: ignore[override]
         return self._tiered.fetch(url)
 
 
-class _PlaywrightOnly:
-    """Adapts a live PlaywrightFetcher to the Fetcher protocol for tiering,
-    calling the un-overridden ``fetch`` so we don't recurse."""
-
-    name = "playwright"
+class _PrimaryBrowser:
+    """Adapts the live primary browser to the Fetcher protocol for tiering,
+    calling the un-overridden ``fetch`` so we don't recurse, and labelling the
+    capture with the actual browser used (cloakbrowser/playwright)."""
 
-    def __init__(self, owner: PlaywrightFetcher):
+    def __init__(self, owner: PlaywrightFetcher, name: str):
         self._owner = owner
+        self.name = name
 
     def fetch(self, url: str):
-        return PlaywrightFetcher.fetch(self._owner, url)
+        result = PlaywrightFetcher.fetch(self._owner, url)
+        result.fetcher = self.name
+        return result
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/cloakbrowser_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/cloakbrowser_fetcher.py
new file mode 100644
index 00000000..d4164e70
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/cloakbrowser_fetcher.py
@@ -0,0 +1,62 @@
+"""CloakBrowser fetcher — a self-hosted anti-bot upgrade to Playwright.
+
+CloakBrowser (``CloakHQ/CloakBrowser``) is an open-source, patched-Chromium fork
+whose ``cloakbrowser.launch()`` returns a standard Playwright ``Browser`` — so
+this fetcher reuses *all* of ``PlaywrightFetcher``'s capture logic (settle,
+autoscroll, full-page screenshot, trafilatura→markdown) and only overrides how
+the browser is launched. The fork applies source-level fingerprint patches that
+get past Cloudflare Turnstile and similar challenges that plain headless Chromium
+trips; in the one rigorous 2026 anti-detect benchmark it cleared more Cloudflare
+targets than vanilla Playwright.
+
+It runs on your own compute, so the marginal service cost is ~$0/page. The
+patched Chromium binary (~200MB) is downloaded automatically on first launch.
+
+Install separately (it is not in the ``source-archive`` extra because of the
+binary): ``pip install cloakbrowser``. The package module is configurable via
+``config.cloakbrowser_import`` (default ``cloakbrowser``) in case it is renamed.
+"""
+
+from __future__ import annotations
+
+import importlib
+import logging
+
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError
+from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import (
+    PlaywrightFetcher,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class CloakBrowserFetcher(PlaywrightFetcher):
+    name = "cloakbrowser"
+
+    def _launch_browser(self):
+        module = self._import_module()
+        launch = getattr(module, "launch", None)
+        if launch is None:
+            raise FetchError(
+                f"{module.__name__} has no launch(); the CloakBrowser API may "
+                "have changed. Expected `cloakbrowser.launch() -> Browser`."
+            )
+        # Wrap launch errors in FetchError so callers' FetchError fallbacks fire.
+        try:  # stealth_args=True applies the fingerprint patches
+            return None, launch(headless=True, stealth_args=True)  # no pw handle
+        except Exception as e:
+            raise FetchError(f"cloakbrowser failed to launch: {e}") from e
+
+    def _import_module(self):  # configured module name first, then the default
+        candidates = [self.config.cloakbrowser_import, "cloakbrowser"]
+        tried: list[str] = []
+        for mod_name in dict.fromkeys(c for c in candidates if c):
+            try:
+                return importlib.import_module(mod_name)
+            except ImportError:
+                tried.append(mod_name)
+        raise FetchError(
+            "cloakbrowser is not installed. Install it with "
+            "`pip install cloakbrowser` (or set WEB_ARCHIVE_CLOAKBROWSER_IMPORT "
+            f"to the right module). Tried: {', '.join(tried)}."
+        )
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py
index 22aa1a55..622d51ff 100644
--- a/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py
@@ -1,8 +1,14 @@
-"""Firecrawl fetcher — the FALLBACK backend.
+"""Firecrawl fetcher — a managed FALLBACK backend.
 
-Reserved for sites that block headless Chromium. It costs ~1 credit/page even
-with a screenshot, so it only runs when the primary backend fails or its capture
-fails the quality gate.
+Reserved for sites that block headless Chromium. A basic scrape costs 1 credit/
+page even with a screenshot, so it only runs when the primary backend fails or
+its capture fails the quality gate.
+
+For hardened anti-bot sites, set ``config.firecrawl_proxy`` to ``"auto"`` or
+``"stealth"`` (a.k.a. "enhanced") — this routes through residential proxies and
+is billed at ~5 credits/page, so it is opt-in and reserved for the Cloudflare
+tier. Firecrawl also natively parses PDFs to markdown (1 credit per PDF page),
+which is why it is the fallback for the tiered ``PdfFetcher``.
 
 The Firecrawl SDK is optional and imported lazily. The screenshot comes back as
 a hosted URL, which we download to bytes.
@@ -50,10 +56,24 @@ def _get_client(self):
         self._client = Firecrawl(api_key=self.config.firecrawl_api_key)
         return self._client
 
+    def _scrape_kwargs(self, formats: list[str]) -> dict:
+        """Build the keyword arguments for ``client.scrape``.
+
+        ``proxy`` is sent only when the configured mode (trimmed, lower-cased)
+        is something other than the implicit default "basic", so a scrape stays
+        1-credit unless the operator opts in; legacy spellings are left to the SDK.
+        """
+        mode = (self.config.firecrawl_proxy or "basic").strip().lower()
+        if mode in ("", "basic"):
+            return {"formats": formats}
+        return {"formats": formats, "proxy": mode}
+
     def fetch(self, url: str) -> CaptureResult:
         client = self._get_client()
         try:
-            doc = client.scrape(url, formats=["markdown", "html", "screenshot"])
+            doc = client.scrape(
+                url, **self._scrape_kwargs(["markdown", "html", "screenshot"])
+            )
         except Exception as e:
             raise FetchError(f"firecrawl scrape failed for {url}: {e}") from e
 
@@ -75,9 +95,23 @@ def fetch(self, url: str) -> CaptureResult:
             screenshot=screenshot,
             screenshot_content_type=content_type,
             fetcher=self.name,
-            metadata={"title": _attr(metadata, "title")},
+            metadata={
+                "title": _attr(metadata, "title"),
+                "firecrawl_proxy": self.config.firecrawl_proxy,
+            },
         )
 
+    def fetch_pdf_markdown(self, url: str) -> str | None:
+        """Scrape just the markdown for a PDF URL via Firecrawl's native PDF
+        parser. Used as the fallback inside :class:`PdfFetcher` when local
+        extraction yields thin text (e.g. a scanned PDF needing OCR)."""
+        client = self._get_client()  # runs outside the try: its errors pass through
+        try:
+            doc = client.scrape(url, **self._scrape_kwargs(["markdown"]))
+        except Exception as e:  # whatever scrape raises is re-raised as FetchError
+            raise FetchError(f"firecrawl pdf scrape failed for {url}: {e}") from e
+        return _attr(doc, "markdown")
+
     @staticmethod
     def _download(src_url: str) -> tuple[bytes | None, str | None]:
         try:
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/hyperbrowser_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/hyperbrowser_fetcher.py
new file mode 100644
index 00000000..ce728abd
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/hyperbrowser_fetcher.py
@@ -0,0 +1,149 @@
+"""Hyperbrowser fetcher — a managed FALLBACK backend.
+
+Hyperbrowser exposes a Firecrawl-style ``scrape`` endpoint that returns
+markdown + HTML + a screenshot in one call, with optional stealth, residential
+proxy, and CAPTCHA-solving session options for getting past Cloudflare and other
+anti-bot filters.
+
+Why it's here even though Firecrawl already is: forecasting-tools already uses
+Hyperbrowser elsewhere (``research/computer_use.py``), so routing the anti-bot
+tail through it consolidates spend onto one vendor/bill.
+
+Cost note: a basic scrape is 1 credit ($0.001); enabling ``use_proxy`` makes it
+10 credits ($0.01) plus proxy bandwidth ($10/GB). So the proxy/stealth session
+is opt-in and meant for the small hardened-Cloudflare residual, not every URL.
+Hyperbrowser has no documented PDF→markdown path, so PDFs go to the dedicated
+``PdfFetcher`` instead of here.
+
+The SDK is optional and imported lazily; a screenshot may come back as a hosted
+URL (downloaded to bytes) or inline base64.
+"""
+
+from __future__ import annotations
+
+import base64
+import binascii
+import logging
+import urllib.request
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+
+logger = logging.getLogger(__name__)
+
+
+def _attr(obj, key, default=None):
+    """Read ``key`` from a dict item or an object attribute, else ``default``."""
+    if isinstance(obj, dict):
+        return obj.get(key, default)
+    # getattr on None would consult NoneType attributes, so short-circuit it.
+    return default if obj is None else getattr(obj, key, default)
+
+
+class HyperbrowserFetcher:
+    name = "hyperbrowser"  # also stamped on each CaptureResult as ``fetcher``
+
+    def __init__(self, config: ArchiveConfig | None = None, client=None):
+        self.config = config or ArchiveConfig()
+        self._client = client  # an injected client skips the lazy SDK setup below
+
+    def _get_client(self):
+        if self._client is not None:
+            return self._client
+        if not self.config.hyperbrowser_api_key:
+            raise FetchError("HYPERBROWSER_API_KEY is not set")
+        try:
+            from hyperbrowser import Hyperbrowser
+        except ImportError as e:
+            raise FetchError(
+                "hyperbrowser is not installed. Install it with "
+                "`pip install forecasting-tools[source-archive]`."
+            ) from e
+        self._client = Hyperbrowser(api_key=self.config.hyperbrowser_api_key)  # cached
+        return self._client
+
+    def _params(self, url: str):
+        """Build the SDK request objects. Imported here (not at module top) so
+        importing this module never requires the SDK."""
+        from hyperbrowser.models import (
+            CreateSessionParams,
+            ScrapeOptions,
+            StartScrapeJobParams,
+        )
+
+        return StartScrapeJobParams(
+            url=url,
+            scrape_options=ScrapeOptions(
+                formats=["markdown", "html", "screenshot"],
+                only_main_content=False,
+            ),
+            session_options=CreateSessionParams(
+                use_proxy=self.config.hyperbrowser_use_proxy,
+                use_stealth=self.config.hyperbrowser_use_stealth,
+                solve_captchas=self.config.hyperbrowser_solve_captchas,
+            ),
+        )
+
+    def fetch(self, url: str) -> CaptureResult:
+        client = self._get_client()  # raises FetchError when key or SDK is missing
+        try:
+            resp = client.scrape.start_and_wait(self._params(url))
+        except Exception as e:  # includes an ImportError raised inside _params
+            raise FetchError(f"hyperbrowser scrape failed for {url}: {e}") from e
+
+        # The job wrapper carries status/error; the payload is on ``.data``.
+        if _attr(resp, "status") == "failed":
+            raise FetchError(
+                f"hyperbrowser scrape failed for {url}: {_attr(resp, 'error')}"
+            )
+        data = _attr(resp, "data", resp)  # no ``.data`` -> treat resp as the payload
+
+        metadata = _attr(data, "metadata", {}) or {}  # None -> {}
+        status = _attr(metadata, "statusCode") or _attr(metadata, "status_code")
+        final_url = _attr(metadata, "sourceURL") or _attr(metadata, "url") or url
+
+        screenshot, content_type = self._coerce_screenshot(_attr(data, "screenshot"))
+
+        return CaptureResult(
+            url=url,  # the URL as requested
+            final_url=final_url,  # taken from the scrape metadata, else ``url``
+            status_code=int(status) if status is not None else None,
+            html=_attr(data, "html"),
+            markdown=_attr(data, "markdown"),
+            screenshot=screenshot,
+            screenshot_content_type=content_type,
+            fetcher=self.name,
+            metadata={
+                "title": _attr(metadata, "title"),
+                "used_proxy": self.config.hyperbrowser_use_proxy,
+            },
+        )
+
+    @classmethod
+    def _coerce_screenshot(cls, value) -> tuple[bytes | None, str | None]:
+        """A screenshot may arrive as a hosted URL, a data: URI, or raw base64."""
+        if not value or not isinstance(value, str):
+            return None, None  # nothing usable: (bytes, content_type) both None
+        if value.startswith("http://") or value.startswith("https://"):
+            return cls._download(value)
+        if value.startswith("data:"):
+            try:
+                header, b64 = value.split(",", 1)  # ValueError if there is no comma
+                ctype = header[5:].split(";", 1)[0] or "image/png"  # drop "data:"
+                return base64.b64decode(b64), ctype
+            except (ValueError, binascii.Error):
+                return None, None
+        try:
+            return base64.b64decode(value, validate=True), "image/png"  # bare base64
+        except (binascii.Error, ValueError):
+            return None, None
+
+    @staticmethod
+    def _download(src_url: str) -> tuple[bytes | None, str | None]:
+        try:
+            with urllib.request.urlopen(src_url, timeout=30) as resp:
+                return resp.read(), resp.headers.get("Content-Type", "image/png")
+        except Exception as e:  # best effort: log and return no screenshot
+            logger.warning("failed to download hyperbrowser screenshot: %s", e)
+            return None, None
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/pdf_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/pdf_fetcher.py
new file mode 100644
index 00000000..0977605c
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/pdf_fetcher.py
@@ -0,0 +1,146 @@
+"""PDF fetcher — closes the gap Playwright can't.
+
+Headless Chromium *downloads* a PDF instead of rendering it (``page.goto`` raises
+"Download is starting"), and trafilatura doesn't parse PDFs, so a cited ``.pdf``
+URL produces nothing today. This fetcher fills that hole with a two-tier strategy:
+
+  1. Download the PDF bytes and parse locally with **PyMuPDF4LLM** — free, fast,
+     CPU-only, and excellent on text-layer PDFs (most government/legal reports).
+     The first page is rendered to an image so the viewer still has a screenshot.
+  2. If local extraction yields thin text (a scanned PDF that needs OCR), fall
+     back to **Firecrawl's** native PDF parser (~1 credit/page, OCR included).
+
+Both parsers are optional and imported lazily. Use :func:`looks_like_pdf` /
+:meth:`PdfFetcher.is_pdf` to decide whether a URL should be routed here.
+"""
+
+from __future__ import annotations
+
+import logging
+import urllib.request
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError
+from forecasting_tools.agents_and_tools.source_archive.fetchers.firecrawl_fetcher import (
+    FirecrawlFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+from forecasting_tools.agents_and_tools.source_archive.quality import MIN_TEXT_LEN
+
+logger = logging.getLogger(__name__)
+
+_PDF_MAGIC = b"%PDF-"
+
+
+def looks_like_pdf(url: str) -> bool:
+    """Return True when the URL, minus any ``?query`` / ``#fragment``, ends in
+    ``.pdf`` (case-insensitive). Shape check only; no bytes are inspected."""
+    without_query, _, _ = url.partition("?")
+    return without_query.partition("#")[0].lower().endswith(".pdf")
+
+
+class PdfFetcher:
+    name = "pdf"  # recorded on each CaptureResult as ``fetcher``
+
+    def __init__(
+        self,
+        config: ArchiveConfig | None = None,
+        *,
+        firecrawl: FirecrawlFetcher | None = None,
+        downloader=None,
+    ):
+        self.config = config or ArchiveConfig()
+        # Reuse the configured Firecrawl client for the OCR fallback when a key
+        # is present; otherwise the fallback is simply skipped.
+        if firecrawl is not None:
+            self._firecrawl = firecrawl
+        elif self.config.firecrawl_api_key:
+            self._firecrawl = FirecrawlFetcher(self.config)
+        else:
+            self._firecrawl = None
+        self._download = downloader or _download_bytes  # (url, timeout_ms) callable
+
+    def is_pdf(self, url: str, data: bytes | None = None) -> bool:
+        if data is not None:
+            return data[:5] == _PDF_MAGIC  # real bytes outrank the URL-shape guess
+        return looks_like_pdf(url)
+
+    def fetch(self, url: str) -> CaptureResult:
+        data, final_url, status = self._download(url, self.config.nav_timeout_ms)
+        if not data or data[:5] != _PDF_MAGIC:
+            raise FetchError(f"not a PDF (no %PDF- magic) for {url}")
+
+        markdown, screenshot, ctype, pages, engine = self._parse_local(data)
+
+        thin = not markdown or len(markdown.strip()) < MIN_TEXT_LEN
+        if thin and self._firecrawl is not None:
+            logger.info("local PDF parse thin for %s; trying Firecrawl OCR", url)
+            try:
+                fc_md = self._firecrawl.fetch_pdf_markdown(url)
+            except FetchError as e:  # keep whatever the local parse produced
+                logger.info("firecrawl PDF fallback failed for %s: %s", url, e)
+            else:
+                if fc_md and len(fc_md.strip()) >= MIN_TEXT_LEN:
+                    markdown, engine = fc_md, "firecrawl"  # keep local screenshot
+
+        return CaptureResult(
+            url=url,
+            final_url=final_url or url,
+            status_code=status,
+            html=None,  # no HTML for a PDF capture
+            markdown=markdown,
+            screenshot=screenshot,
+            screenshot_content_type=ctype,
+            fetcher=self.name,
+            metadata={"pdf_engine": engine, "pdf_pages": pages},
+        )
+
+    def _parse_local(
+        self, data: bytes
+    ) -> tuple[str | None, bytes | None, str | None, int, str]:
+        """Return (markdown, screenshot_png, content_type, pages, engine)."""
+        try:
+            import pymupdf  # PyMuPDF (a.k.a. fitz)
+            import pymupdf4llm
+        except ImportError:
+            logger.warning(
+                "pymupdf4llm not installed; local PDF parsing unavailable. "
+                "Install with `pip install forecasting-tools[source-archive]`."
+            )
+            return None, None, None, 0, "none"  # fetch() treats None markdown as thin
+
+        try:
+            doc = pymupdf.open(stream=data, filetype="pdf")
+        except Exception as e:
+            raise FetchError(f"could not open PDF: {e}") from e
+
+        try:
+            total = doc.page_count
+            limit = min(total, self.config.pdf_max_pages) or total  # 0 -> no page cap
+            markdown = pymupdf4llm.to_markdown(doc, pages=list(range(limit)))
+            screenshot, ctype = self._render_first_page(doc)
+            return markdown or None, screenshot, ctype, total, "pymupdf4llm"
+        finally:
+            doc.close()  # runs on both the return path and on parser errors
+
+    @staticmethod
+    def _render_first_page(doc) -> tuple[bytes | None, str | None]:
+        try:
+            pix = doc[0].get_pixmap(dpi=110)
+            return pix.tobytes("png"), "image/png"
+        except Exception as e:  # best effort: a capture without a screenshot is kept
+            logger.info("could not render PDF first page: %s", e)
+            return None, None
+
+
+def _download_bytes(
+    url: str, timeout_ms: int
+) -> tuple[bytes | None, str | None, int | None]:
+    if not url.lower().startswith(("http://", "https://")):  # no file://, ftp://
+        raise FetchError(f"could not download PDF for {url}: unsupported scheme")
+    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
+    try:
+        with urllib.request.urlopen(req, timeout=max(timeout_ms / 1000, 1)) as resp:
+            return resp.read(), resp.geturl(), getattr(resp, "status", 200)
+    except Exception as e:
+        raise FetchError(f"could not download PDF for {url}: {e}") from e
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py
index ee9900b7..efba5575 100644
--- a/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py
@@ -27,6 +27,13 @@
 
 logger = logging.getLogger(__name__)
 
+# WebP's hard per-side pixel limit; taller captures must be cropped before encode.
+_WEBP_MAX_DIM = 16383
+# Above this total pixel count, skip the screenshot rather than decode it: a
+# pathological full-page render (very tall × wide) costs minutes of CPU in Pillow
+# for a screenshot that's nice-to-have, not essential.
+_MAX_SCREENSHOT_PIXELS = 200_000_000
+
 
 def _to_markdown(html: str, url: str) -> str | None:
     try:
@@ -39,21 +46,57 @@ def _to_markdown(html: str, url: str) -> str | None:
     )
 
 
-def _encode_screenshot(png_bytes: bytes, fmt: str) -> tuple[bytes, str]:
-    """Re-encode a PNG screenshot to the requested format using Pillow.
+# Scroll the document top-to-bottom (triggering lazy-loaded content) then back
+# up, so a subsequent full-page screenshot captures the fully-rendered page.
+_AUTOSCROLL_JS = """
+async () => {
+  await new Promise((resolve) => {
+    let y = 0;
+    const step = () => {
+      window.scrollTo(0, y);
+      y += 1000;
+      if (y < document.body.scrollHeight) setTimeout(step, 40);
+      else resolve();
+    };
+    step();
+  });
+  window.scrollTo(0, 0);
+}
+"""
+
+
+def _encode_screenshot(
+    png_bytes: bytes, fmt: str, max_height: int = 0
+) -> tuple[bytes, str]:
+    """Crop (to ``max_height``) and re-encode a PNG screenshot using Pillow.
 
     Pillow is already a forecasting-tools dependency, so true WebP is available
-    here (Playwright itself only emits PNG/JPEG).
+    here (Playwright itself only emits PNG/JPEG). The height cap is enforced by
+    cropping the *full-page* render to its top ``max_height`` pixels — never via
+    Playwright's ``clip`` (which, without ``full_page``, is bounded by the
+    viewport and silently truncates tall pages to a single screen).
     """
     fmt = fmt.lower()
-    if fmt == "png":
-        return png_bytes, "image/png"
     try:
         from PIL import Image
     except ImportError:
+        # No Pillow: can't crop or transcode; hand back the raw full-page PNG.
         return png_bytes, "image/png"
 
-    image = Image.open(io.BytesIO(png_bytes))
+    image = Image.open(io.BytesIO(png_bytes))  # lazy: reads size, doesn't decode
+    if image.width * image.height > _MAX_SCREENSHOT_PIXELS:
+        raise ValueError(
+            f"screenshot too large to encode ({image.width}x{image.height}px)"
+        )
+    # WebP cannot encode beyond 16383px on a side. Clamp the effective cap for
+    # webp so an over-tall page degrades to a top-crop instead of crashing the
+    # encoder mid-run (which would propagate out of fetch() and abort the URL).
+    limit = max_height or 0
+    if fmt == "webp":
+        limit = min(limit or _WEBP_MAX_DIM, _WEBP_MAX_DIM)
+    if limit and image.height > limit:
+        image = image.crop((0, 0, image.width, limit))
+
     out = io.BytesIO()
     if fmt == "webp":
         image.save(out, format="WEBP", quality=80, method=6)
@@ -61,7 +104,8 @@ def _encode_screenshot(png_bytes: bytes, fmt: str) -> tuple[bytes, str]:
     if fmt in ("jpeg", "jpg"):
         image.convert("RGB").save(out, format="JPEG", quality=80, optimize=True)
         return out.getvalue(), "image/jpeg"
-    return png_bytes, "image/png"
+    image.save(out, format="PNG", optimize=True)
+    return out.getvalue(), "image/png"
 
 
 class PlaywrightFetcher:
@@ -82,7 +126,12 @@ def __init__(self, config: ArchiveConfig | None = None):
         self._playwright = None
         self._browser = None
 
-    def __enter__(self) -> "PlaywrightFetcher":
+    def _launch_browser(self):
+        """Start the browser. Returns ``(playwright_or_none, browser)`` where
+        ``browser`` is a Playwright ``Browser``. Subclasses override this to swap
+        in a different stealth browser (see ``CloakBrowserFetcher``) while reusing
+        all of the capture logic. A backend that manages its own driver returns
+        ``None`` for the first element."""
         try:
             from playwright.sync_api import sync_playwright
         except ImportError as e:
@@ -91,8 +140,12 @@ def __enter__(self) -> "PlaywrightFetcher":
                 "`pip install forecasting-tools[source-archive]` and then run "
                 "`playwright install chromium`."
             ) from e
-        self._playwright = sync_playwright().start()
-        self._browser = self._playwright.chromium.launch(headless=True)
+        playwright = sync_playwright().start()
+        browser = playwright.chromium.launch(headless=True)
+        return playwright, browser
+
+    def __enter__(self) -> "PlaywrightFetcher":
+        self._playwright, self._browser = self._launch_browser()
         return self
 
     def __exit__(self, *exc) -> None:
@@ -103,6 +156,32 @@ def __exit__(self, *exc) -> None:
             self._playwright.stop()
             self._playwright = None
 
+    def _settle(self, page) -> None:
+        """Best-effort: let the page finish rendering before the screenshot.
+
+        ``page.goto`` only waits for ``domcontentloaded``, which fires before
+        CSS/images/lazy content have laid out — capturing then yields a short,
+        half-built page. Wait for the load/network to quiesce and scroll the
+        document to force lazy content in, so the full-page capture is complete.
+        Each step is bounded and swallows timeouts: rendering aids are
+        nice-to-have, never fatal to the capture.
+        """
+        try:
+            page.wait_for_load_state("load", timeout=self.config.nav_timeout_ms)
+        except Exception:
+            pass
+        try:
+            page.wait_for_load_state(
+                "networkidle", timeout=min(self.config.nav_timeout_ms, 10_000)
+            )
+        except Exception:
+            pass
+        try:
+            page.evaluate(_AUTOSCROLL_JS)
+            page.wait_for_timeout(500)
+        except Exception:
+            pass
+
     def fetch(self, url: str) -> CaptureResult:
         if self._browser is None:
             raise FetchError("PlaywrightFetcher must be used as a context manager")
@@ -119,26 +198,33 @@ def fetch(self, url: str) -> CaptureResult:
             except Exception as e:
                 raise FetchError(f"navigation failed for {url}: {e}") from e
 
+            self._settle(page)
+
             status = response.status if response is not None else None
             html = page.content()
 
-            shot_kwargs: dict = {"type": "png"}
-            cap = self.config.screenshot_max_height
-            dims = page.evaluate(
-                "() => ({w: document.documentElement.scrollWidth,"
-                " h: document.documentElement.scrollHeight})"
-            )
-            width = max(int(dims.get("w") or 0), 1)
-            height = int(dims.get("h") or 0)
-            if cap and height > cap:
-                shot_kwargs["clip"] = {"x": 0, "y": 0, "width": width, "height": cap}
-            else:
-                shot_kwargs["full_page"] = True
-
-            png = page.screenshot(**shot_kwargs)
-            screenshot, content_type = _encode_screenshot(
-                png, self.config.screenshot_format
-            )
+            # Always capture the entire scrollable page in one shot — Playwright
+            # stitches it internally. The height cap is applied afterward by
+            # cropping in Pillow (see ``_encode_screenshot``). Fall back to a
+            # viewport capture only if a full-page shot fails (e.g. a page taller
+            # than Chromium's screenshot limit).
+            try:
+                png = page.screenshot(full_page=True)
+            except Exception as e:
+                logger.info("full-page screenshot failed for %s: %s", url, e)
+                png = page.screenshot()
+            # Encoding can fail on pathological pages (e.g. a 400M-pixel full-page
+            # render trips Pillow's decompression-bomb guard). A screenshot is
+            # nice-to-have — never lose the whole capture over it.
+            try:
+                screenshot, content_type = _encode_screenshot(
+                    png,
+                    self.config.screenshot_format,
+                    self.config.screenshot_max_height,
+                )
+            except Exception as e:
+                logger.info("screenshot encode failed for %s: %s", url, e)
+                screenshot, content_type = None, None
 
             return CaptureResult(
                 url=url,
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py b/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py
index 26b54831..8b689781 100644
--- a/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py
+++ b/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py
@@ -4,11 +4,19 @@
 from a bot's published reasoning:
 
   - :mod:`url_extraction` — pull URLs out of free text / markdown.
-  - :mod:`metaculus_comments` — harvest bot comments via the public Metaculus API.
+  - :mod:`metaculus_db` — read a bot's cited URLs from the platform database.
+  - :mod:`trace_extraction` — build a manifest from a traced bot run (fullest path).
 """
 
-from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_comments import (
-    MetaculusCommentHarvester,
+from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_db import (
+    MetaculusDbHarvester,
+    resolve_dsn,
+)
+from forecasting_tools.agents_and_tools.source_archive.ingest.trace_extraction import (
+    extract_records_from_events,
+    extract_records_from_question_dir,
+    extract_records_from_trace_file,
+    harvest_run,
 )
 from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import (
     dedupe_records,
@@ -17,8 +25,13 @@
 )
 
 __all__ = [
-    "MetaculusCommentHarvester",
+    "MetaculusDbHarvester",
     "dedupe_records",
     "extract_citation_records",
+    "extract_records_from_events",
+    "extract_records_from_question_dir",
+    "extract_records_from_trace_file",
     "extract_urls",
+    "harvest_run",
+    "resolve_dsn",
 ]
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py
deleted file mode 100644
index 0aff84a9..00000000
--- a/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py
+++ /dev/null
@@ -1,180 +0,0 @@
-"""Harvest the URLs bots cite, from their public Metaculus comments.
-
-Both first-party and third-party bots publish their reasoning — with the source
-links they used — as comments on the questions they forecast. The public,
-no-auth Metaculus API is therefore the one mechanism that works across *every*
-bot on the platform, which is why this is the general ingestion path.
-
-Flow:
-
-  1. Enumerate the bots participating in a project (tournament) leaderboard.
-  2. Page through each bot's comments.
-  3. Extract the URLs from each comment and emit CitationRecords.
-
-The result is a citation manifest you can feed straight to the capture pipeline.
-
-Caveat: comments are length-truncated when posted, so a comment-harvested URL
-list can be incomplete versus the bot's full research. For bots you control, an
-instrumented trace gives a fuller list; this path is the universal baseline.
-"""
-
-from __future__ import annotations
-
-import logging
-import os
-from collections.abc import Iterator
-from typing import Any, Callable
-
-from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import (
-    extract_citation_records,
-)
-from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
-
-logger = logging.getLogger(__name__)
-
-DEFAULT_BASE_URL = "https://www.metaculus.com/api"
-PAGE_LIMIT = 100
-
-
-def _first(d: dict, *keys, default=None):
-    for k in keys:
-        if k in d and d[k] is not None:
-            return d[k]
-    return default
-
-
-class MetaculusCommentHarvester:
-    """Reads bot comments via the public Metaculus API.
-
-    HTTP is injectable for testing: pass ``fetch_json=callable(path, params) ->
-    dict`` to avoid real network calls.
-    """
-
-    def __init__(
-        self,
-        base_url: str | None = None,
-        *,
-        session: Any = None,
-        timeout: int = 30,
-        fetch_json: Callable[[str, dict], dict] | None = None,
-    ):
-        self.base_url = (
-            base_url or os.environ.get("METACULUS_API_BASE_URL") or DEFAULT_BASE_URL
-        ).rstrip("/")
-        self.web_base = (
-            self.base_url[:-4] if self.base_url.endswith("/api") else self.base_url
-        )
-        self.timeout = timeout
-        self._session = session
-        self._fetch_json = fetch_json
-
-    # --- http --------------------------------------------------------------
-    def _get(self, path: str, params: dict) -> dict:
-        if self._fetch_json is not None:
-            return self._fetch_json(path, params)
-        try:
-            import requests
-        except ImportError as e:  # pragma: no cover - requests is a core dep
-            raise ImportError("requests is required for comment harvesting") from e
-        if self._session is None:
-            self._session = requests.Session()
-        resp = self._session.get(
-            f"{self.base_url}{path}", params=params, timeout=self.timeout
-        )
-        resp.raise_for_status()
-        return resp.json()
-
-    # --- bots --------------------------------------------------------------
-    def enumerate_bots(self, project_id: int | str) -> list[dict]:
-        """Return the bot ``user`` records on a project's leaderboard."""
-        data = self._get(
-            f"/leaderboards/project/{project_id}/", {"with_entries": "true"}
-        )
-        entries = _first(data, "leaderboard_entries", "entries", "results", default=[])
-        bots: list[dict] = []
-        seen: set[Any] = set()
-        for entry in entries:
-            user = entry.get("user") if isinstance(entry, dict) else None
-            if not user or not user.get("is_bot"):
-                continue
-            uid = user.get("id")
-            if uid in seen:
-                continue
-            seen.add(uid)
-            bots.append(user)
-        return bots
-
-    # --- comments ----------------------------------------------------------
-    def iter_comments(
-        self, author_id: int | str, post_id: int | str | None = None
-    ) -> Iterator[dict]:
-        """Yield every comment authored by ``author_id`` (optionally on one post)."""
-        offset = 0
-        while True:
-            params = {"author": author_id, "limit": PAGE_LIMIT, "offset": offset}
-            if post_id is not None:
-                params["post"] = post_id
-            data = self._get("/comments/", params)
-            results = (
-                _first(data, "results", default=[]) if isinstance(data, dict) else data
-            )
-            if not results:
-                break
-            yield from results
-            if len(results) < PAGE_LIMIT:
-                break
-            offset += PAGE_LIMIT
-
-    # --- harvesting --------------------------------------------------------
-    def _records_from_comment(
-        self, comment: dict, *, run_id: str | None, bot: str | None
-    ) -> list[CitationRecord]:
-        post_id = _first(comment, "on_post", "post", "post_id")
-        post_id_str = str(post_id) if post_id is not None else None
-        question_url = (
-            f"{self.web_base}/questions/{post_id}/" if post_id is not None else None
-        )
-        comment_id = comment.get("id")
-        return extract_citation_records(
-            comment.get("text"),
-            run_id=run_id,
-            bot=bot,
-            question_id=post_id_str,
-            metaculus_id=post_id_str,
-            question_url=question_url,
-            trace=f"comment:{comment_id}" if comment_id is not None else None,
-            origin="metaculus_comment",
-        )
-
-    def harvest_author(
-        self,
-        author_id: int | str,
-        *,
-        run_id: str | None = None,
-        bot: str | None = None,
-        post_id: int | str | None = None,
-    ) -> list[CitationRecord]:
-        """All citation records from one bot's comments."""
-        records: list[CitationRecord] = []
-        for comment in self.iter_comments(author_id, post_id=post_id):
-            records.extend(self._records_from_comment(comment, run_id=run_id, bot=bot))
-        return records
-
-    def harvest_project(
-        self, project_id: int | str, *, run_id: str | None = None
-    ) -> list[CitationRecord]:
-        """All citation records from every bot on a project's leaderboard.
-
-        Records are kept per-citation (duplicates across bots are preserved as
-        distinct provenance); the capture pipeline dedupes URLs before fetching.
-        """
-        run_id = run_id or f"metaculus-comments-{project_id}"
-        records: list[CitationRecord] = []
-        bots = self.enumerate_bots(project_id)
-        logger.info("project %s: %d bot(s) on leaderboard", project_id, len(bots))
-        for user in bots:
-            bot_name = user.get("username") or str(user.get("id"))
-            bot_records = self.harvest_author(user["id"], run_id=run_id, bot=bot_name)
-            logger.info("  bot %s: %d cited URL(s)", bot_name, len(bot_records))
-            records.extend(bot_records)
-        return records
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_db.py b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_db.py
new file mode 100644
index 00000000..c0221bbf
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_db.py
@@ -0,0 +1,215 @@
+"""Read a bot's cited URLs from the platform Postgres database (operator tooling).
+
+For operators with database access, this reads the URLs a forecasting bot cited
+straight from Postgres (``comments_comment`` joined to ``users_user``, bots only) and
+emits the same :class:`CitationRecord` objects as every other ingestion path, so the
+catalog / coverage / capture stages downstream are unchanged. By default it reads
+all of a bot's comments (``include_private=True``); pass ``include_private=False``
+for the public ones only. Only ``u.is_bot`` accounts are ever read.
+
+The DB call is **injected** (``query``) so the core is driver-agnostic and unit
+testable; :meth:`from_dsn` wires a psycopg2 connection for real use (a libpq DSN
+or a ``postgresql://…`` URL — e.g. a Neon connection string). Reads only; no
+secrets are stored — the DSN comes from the caller / environment.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Callable, Mapping, Sequence
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import (
+    extract_citation_records,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
+
+QueryFn = Callable[[str, Sequence[Any]], list[dict]]
+
+# Keychain service name the DSN is stored under (see resolve_dsn / README).
+KEYCHAIN_SERVICE = "metaculus-db-dsn"
+LOCAL_DEFAULT_DSN = "dbname=metaculus"
+
+_WEB = "https://www.metaculus.com"
+
+# The windowed/post-scoped comment set is computed in a MATERIALIZED CTE so
+# Postgres evaluates it FIRST, then joins users_user by primary key. Without the
+# CTE the planner's stale stats misjudge the date window at ~300k rows (it is
+# really ~2k/day) and pick a join order that times out on the remote pooler.
+_OUTER = (
+    "select r.id as comment_id, r.on_post_id, r.text, "
+    "u.username, r.author_id "
+    "from recent r join users_user u on u.id = r.author_id where u.is_bot"
+)
+
+
+def _recent_cte(scope: str, include_private: bool) -> str:
+    """A MATERIALIZED ``recent`` CTE of link-bearing, non-deleted comments.
+
+    ``scope`` is the row-narrowing predicate (a post id or a created_at window).
+    Private comments are included unless ``include_private`` is False.
+
+    ``strpos(text,'http') > 0`` is a cheap substring pre-filter (a regex ``~`` scan
+    times out on the pooler; ``like`` would need ``%%`` escaping under psycopg2).
+    The real URL parsing happens in extract_citation_records, so over-matching
+    here just costs a few empty rows.
+    """
+    clauses = ["not c.is_soft_deleted", "strpos(c.text, 'http') > 0", scope]
+    if not include_private:
+        clauses.append("not c.is_private")
+    where = " and ".join(clauses)
+    return (
+        "with recent as materialized ("
+        "select c.id, c.on_post_id, c.text, c.author_id, c.created_at "
+        f"from comments_comment c where {where}) "
+    )
+
+
+def _dsn_from_keychain(service: str = KEYCHAIN_SERVICE) -> str | None:
+    """Read the DSN from the macOS login Keychain, or ``None`` if unavailable.
+
+    Uses ``security find-generic-password -w`` so the credential lives only in
+    the Keychain — never in ``.env``, a shell rc, or shell history. If the
+    Keychain item's ACL is set to confirm on access, this call triggers a GUI
+    prompt: a human can approve it, an automated agent driving the shell cannot.
+    Returns ``None`` off macOS or when the item is absent / access is denied, so
+    callers fall through to the next source.
+    """
+    import shutil
+    import subprocess
+
+    if not shutil.which("security"):  # not macOS
+        return None
+    try:
+        proc = subprocess.run(
+            ["security", "find-generic-password", "-s", service, "-w"],
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+    except (OSError, subprocess.SubprocessError):
+        return None
+    if proc.returncode != 0:
+        return None
+    return proc.stdout.strip() or None
+
+
+def resolve_dsn(
+    explicit: str | None = None,
+    *,
+    env: Mapping[str, str] | None = None,
+    keychain_reader: Callable[[], str | None] | None = None,
+) -> str:
+    """Resolve the DB DSN without ever persisting it to disk.
+
+    Resolution order, first hit wins:
+      1. ``explicit`` (e.g. a ``--dsn`` flag — convenient, but lands in shell
+         history, so prefer the Keychain for the real secret),
+      2. ``$METACULUS_DB_DSN``,
+      3. the macOS Keychain item ``metaculus-db-dsn`` (the private path),
+      4. the local default ``dbname=metaculus``.
+    ``env`` / ``keychain_reader`` are injectable for tests.
+    """
+    if explicit:
+        return explicit
+    environ = env if env is not None else os.environ
+    from_env = environ.get("METACULUS_DB_DSN")
+    if from_env:
+        return from_env
+    reader = keychain_reader or _dsn_from_keychain
+    from_keychain = reader()
+    if from_keychain:
+        return from_keychain
+    return LOCAL_DEFAULT_DSN
+
+
+class MetaculusDbHarvester:
+    """Reads bot comments from Postgres. ``query(sql, params) -> list[dict]``."""
+
+    def __init__(self, query: QueryFn):
+        self._query = query
+
+    @classmethod
+    def from_dsn(cls, dsn: str = "dbname=metaculus") -> "MetaculusDbHarvester":
+        try:
+            import psycopg2
+            import psycopg2.extras
+        except ImportError as e:  # pragma: no cover - optional operator dep
+            raise ImportError(
+                "psycopg2 is required for DB harvesting "
+                "(`pip install psycopg2-binary`)."
+            ) from e
+        conn = psycopg2.connect(dsn)
+
+        def query(sql: str, params: Sequence[Any]) -> list[dict]:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(sql, params)
+                return [dict(r) for r in cur.fetchall()]
+
+        return cls(query)
+
+    def _records(self, rows: list[dict], run_id: str | None) -> list[CitationRecord]:
+        out: list[CitationRecord] = []
+        for r in rows:
+            post_id = r.get("on_post_id")
+            pid = str(post_id) if post_id is not None else None
+            comment_id = r.get("comment_id")
+            out.extend(
+                extract_citation_records(
+                    r.get("text"),
+                    run_id=run_id,
+                    bot=r.get("username") or str(r.get("author_id")),
+                    question_id=pid,
+                    metaculus_id=pid,
+                    question_url=(
+                        f"{_WEB}/questions/{post_id}/" if post_id is not None else None
+                    ),
+                    comment_id=str(comment_id) if comment_id is not None else None,
+                    origin="metaculus_comment",
+                )
+            )
+        return out
+
+    def harvest_post(
+        self,
+        post_id: int | str,
+        *,
+        run_id: str | None = None,
+        include_private: bool = True,
+    ) -> list[CitationRecord]:
+        """Every bot-cited URL in the comments on one post."""
+        run_id = run_id or f"metaculus-db-post-{post_id}"
+        sql = (
+            _recent_cte("c.on_post_id = %s", include_private)
+            + _OUTER
+            + " order by r.created_at"
+        )
+        return self._records(self._query(sql, (post_id,)), run_id)
+
+    def harvest_recent(
+        self,
+        *,
+        days: int = 1,
+        limit: int | None = None,
+        run_id: str | None = None,
+        include_private: bool = True,
+    ) -> list[CitationRecord]:
+        """Bot-cited URLs from the most recent ``days`` of comments.
+
+        "Recent" is measured against ``max(created_at)`` in the table, not wall
+        clock, so a replica that lags real time by a day still returns its latest
+        day with ``days=1``. ``limit`` caps the row count; ``None`` (the default)
+        or ``0`` is uncapped, which is what a daily sweep wants.
+        """
+        run_id = run_id or f"metaculus-db-recent-{days}d"
+        scope = (
+            "c.created_at >= "
+            "(select max(created_at) from comments_comment) - (%s * interval '1 day')"
+        )
+        sql = (
+            _recent_cte(scope, include_private) + _OUTER + " order by r.created_at desc"
+        )
+        params: list[Any] = [days]
+        if limit:
+            sql += " limit %s"
+            params.append(limit)
+        return self._records(self._query(sql, tuple(params)), run_id)
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/trace_extraction.py b/forecasting_tools/agents_and_tools/source_archive/ingest/trace_extraction.py
new file mode 100644
index 00000000..c330eccb
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/ingest/trace_extraction.py
@@ -0,0 +1,380 @@
+"""Build a citation manifest from a bot's run traces.
+
+When the template bot is run with tracing enabled it writes one JSONL trace per
+forecast attempt, recording the agent loop step by step. Those traces are the
+*fullest* record of what the bot actually looked at — richer than the reasoning
+comment it posts, which is length-truncated (see :mod:`metaculus_db` for the
+shallower comment path).
+
+This module walks those traces and pulls out every external URL the bot touched,
+turning each into a :class:`CitationRecord` with provenance (which trace, which
+tool, the search query that surfaced it). That manifest is the input to the
+capture pipeline, exactly like the comment-harvested one.
+
+Trace layout
+------------
+A traced run is a directory tree::
+
+    <run_dir>/
+        bot_<name>/
+            q_<question_id>/
+                question.json
+                traces_forecast_1_attempt_1.jsonl
+                traces_summarize.jsonl
+                ...
+
+Each ``traces_*.jsonl`` file is a stream of newline-delimited event objects. The
+events that can carry external links are:
+
+- ``tool_call``  — the arguments the bot passed to a tool (e.g. a search query,
+  or a ``url`` handed to a page fetcher). Carries ``name`` and ``call_id``.
+- ``tool_result`` — what the tool returned. Search tools inline their citations
+  here as ``[n](url)`` or as a list of result URLs. Carries ``call_id`` so the
+  result can be attributed back to the originating ``tool_call``.
+- ``initial_prompt`` — the first prompt of a trace. Only scanned for the
+  ``summarize`` trace: the template bot runs research *outside* the agent loop
+  and pastes the research blob verbatim into the summarizer's first prompt, so
+  that is the one place those URLs are recoverable. Other traces' initial
+  prompts just echo the question text (background, resolution criteria), whose
+  URLs aren't research, so they're skipped.
+
+Search provenance (``query`` / ``tool_args``) only exists in these instrumented
+traces — it is populated here from each ``tool_call`` and carried onto the URLs
+that the matching ``tool_result`` returned.
+"""
+
+from __future__ import annotations
+
+import glob
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import (
+    extract_urls,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
+
+METACULUS_QUESTION_URL_FMT = "https://www.metaculus.com/questions/{}/"
+
+# Event type -> the field on that event that carries the URL-bearing payload.
+_SCANNABLE_FIELDS: dict[str, str] = {
+    "tool_call": "args",
+    "tool_result": "content",
+    "initial_prompt": "prompt",
+}
+# The trace whose initial prompt holds pasted-in research (see module docstring).
+_SUMMARIZE_TRACE_LABEL = "summarize"
+# Keys a tool's input commonly uses for the search string, best-effort.
+_QUERY_KEYS = ("query", "q", "search_query", "search", "queries", "question")
+
+
+def _urls_in(value: Any) -> list[str]:
+    """Return URLs found anywhere in a string / dict / list, in first-seen order.
+
+    Tool args are structured (a dict) and tool results may be either a blob of
+    text or a structured payload, so we walk the whole value and run the shared
+    :func:`extract_urls` over every string we reach — keeping markdown-link and
+    trailing-punctuation handling identical to the comment path.
+    """
+    urls: list[str] = []
+
+    def walk(v: Any) -> None:
+        if v is None:
+            return
+        if isinstance(v, str):
+            urls.extend(extract_urls(v))
+            return
+        if isinstance(v, dict):
+            for key, val in v.items():
+                walk(key)
+                walk(val)
+            return
+        if isinstance(v, (list, tuple, set, frozenset)):
+            for item in v:
+                walk(item)
+            return
+        walk(str(v))
+
+    walk(value)
+    return urls
+
+
+def _query_from_args(args: Any) -> str | None:
+    """Pull the search string out of a tool's arguments, if recognisable."""
+    if not isinstance(args, dict):
+        return None
+    for key in _QUERY_KEYS:
+        val = args.get(key)
+        if isinstance(val, str) and val.strip():
+            return val
+        if isinstance(val, (list, tuple)) and val:
+            joined = " ".join(str(item) for item in val if item)
+            if joined.strip():
+                return joined
+    return None
+
+
+def trace_label(trace_path: str) -> str:
+    """``traces_forecast_1_attempt_1.jsonl`` -> ``forecast_1_attempt_1``."""
+    name = os.path.basename(trace_path)
+    if name.startswith("traces_"):
+        name = name[len("traces_") :]
+    if name.endswith(".jsonl"):
+        name = name[: -len(".jsonl")]
+    return name
+
+
+def extract_records_from_events(
+    events: Any,
+    *,
+    trace: str | None = None,
+    include_initial_prompt: bool = False,
+    run_id: str | None = None,
+    bot: str | None = None,
+    question_id: str | None = None,
+    metaculus_id: str | None = None,
+    question_url: str | None = None,
+) -> list[CitationRecord]:
+    """Turn one trace's event stream into CitationRecords.
+
+    ``events`` is any iterable of event dicts (already parsed from JSONL). The
+    given provenance is stamped onto every record; per-event provenance
+    (``trace``, ``tool_name``, ``origin``, ``query``, ``tool_args``,
+    ``first_seen``) is filled in here.
+
+    Set ``include_initial_prompt`` to scan ``initial_prompt`` events — callers
+    should only do this for the ``summarize`` trace (see module docstring).
+    """
+    records: list[CitationRecord] = []
+    # Attribute tool_result events (which only carry call_id) back to the
+    # originating tool_call's name and arguments.
+    tool_name_by_call_id: dict[str, str] = {}
+    tool_args_by_call_id: dict[str, Any] = {}
+
+    for event in events:
+        if not isinstance(event, dict):
+            continue
+        event_type = event.get("type")
+
+        if event_type == "tool_call":
+            call_id = str(event.get("call_id") or "").strip()
+            name = event.get("name") or ""
+            if call_id:
+                if name:
+                    tool_name_by_call_id[call_id] = name
+                if "args" in event:
+                    tool_args_by_call_id[call_id] = event.get("args")
+
+        field = _SCANNABLE_FIELDS.get(event_type or "")
+        if field is None:
+            continue
+        if event_type == "initial_prompt" and not include_initial_prompt:
+            continue
+
+        urls = _urls_in(event.get(field))
+        if not urls:
+            continue
+
+        if event_type == "tool_call":
+            tool_name = event.get("name") or ""
+            origin = "tool_call"
+            tool_args = (
+                event.get("args") if isinstance(event.get("args"), dict) else None
+            )
+        elif event_type == "tool_result":
+            call_id = str(event.get("call_id") or "").strip()
+            tool_name = tool_name_by_call_id.get(call_id, "")
+            origin = "tool_result"
+            originating_args = tool_args_by_call_id.get(call_id)
+            tool_args = originating_args if isinstance(originating_args, dict) else None
+        else:  # initial_prompt
+            tool_name = ""
+            origin = event_type or ""
+            tool_args = None
+
+        query = _query_from_args(tool_args)
+        timestamp = event.get("timestamp")
+        for url in urls:
+            record = CitationRecord(
+                url=url,
+                run_id=run_id,
+                bot=bot,
+                question_id=question_id,
+                metaculus_id=metaculus_id,
+                question_url=question_url,
+                trace=trace,
+                tool_name=tool_name,
+                origin=origin,
+                query=query,
+                tool_args=tool_args,
+            )
+            if timestamp:
+                record.first_seen = str(timestamp)
+            records.append(record)
+
+    return records
+
+
+def _read_jsonl(path: str) -> list[dict]:
+    """Read a JSONL file, keeping only lines that parse to a JSON object."""
+    events: list[dict] = []
+    for raw_line in Path(path).read_text(encoding="utf-8").splitlines():
+        try:
+            event = json.loads(raw_line.strip())  # blank lines fail to parse too
+        except json.JSONDecodeError:
+            continue
+        # Valid non-object JSON (list, number…) would break dict-based callers.
+        if isinstance(event, dict):
+            events.append(event)
+    return events
+
+
+def extract_records_from_trace_file(
+    trace_path: str,
+    *,
+    run_id: str | None = None,
+    bot: str | None = None,
+    question_id: str | None = None,
+    metaculus_id: str | None = None,
+    question_url: str | None = None,
+) -> list[CitationRecord]:
+    """Extract CitationRecords from the events of one ``traces_*.jsonl`` file."""
+    label = trace_label(trace_path)  # passed on as each record's ``trace``
+    return extract_records_from_events(
+        _read_jsonl(trace_path),
+        trace=label,
+        include_initial_prompt=(label == _SUMMARIZE_TRACE_LABEL),  # else skipped
+        run_id=run_id,  # provenance keywords are forwarded unchanged
+        bot=bot,
+        question_id=question_id,
+        metaculus_id=metaculus_id,
+        question_url=question_url,
+    )
+
+
+def _read_question_metadata(question_dir: str) -> tuple[str | None, str | None]:
+    """Return ``(question_id, metaculus_id)`` from ``question.json`` in the dir.
+
+    Read as a plain dict with flexible keys so the ingest stays decoupled from
+    any particular question model. Missing/unparsable metadata is non-fatal:
+    ``(None, None)`` is returned and the caller decides how to proceed.
+    """
+    question_path = os.path.join(question_dir, "question.json")
+    if not os.path.exists(question_path):
+        return None, None  # no question.json in this dir
+    try:
+        data = json.loads(Path(question_path).read_text(encoding="utf-8"))
+    except (OSError, json.JSONDecodeError):
+        return None, None  # unreadable, or not valid JSON
+    if not isinstance(data, dict):
+        return None, None  # valid JSON, but not an object
+
+    def _str_or_none(*keys: str) -> str | None:  # first non-None value, as str
+        for key in keys:
+            val = data.get(key)
+            if val is not None:
+                return str(val)
+        return None
+
+    question_id = _str_or_none("question_id", "id", "post_id")  # tried in order
+    metaculus_id = _str_or_none("metaculus_id", "post_id", "id")  # tried in order
+    return question_id, metaculus_id
+
+
+def extract_records_from_question_dir(
+    question_dir: str,
+    *,
+    run_id: str | None = None,
+    bot: str | None = None,
+    question_id: str | None = None,
+    metaculus_id: str | None = None,
+    question_url: str | None = None,
+) -> list[CitationRecord]:
+    """Aggregate CitationRecords across every trace in one ``q_*`` dir.
+
+    Question provenance is read from ``question.json`` in the dir; pass any of
+    ``question_id`` / ``metaculus_id`` / ``question_url`` to override what's
+    found there (or to supply it when the file is absent).
+    """
+    found_qid, found_mid = _read_question_metadata(question_dir)
+    question_id = question_id or found_qid  # a truthy argument wins over the file
+    metaculus_id = metaculus_id or found_mid  # same rule as question_id
+    if question_url is None and metaculus_id is not None:  # derive only if not given
+        question_url = METACULUS_QUESTION_URL_FMT.format(metaculus_id)
+
+    records: list[CitationRecord] = []
+    for trace_path in sorted(glob.glob(os.path.join(question_dir, "traces_*.jsonl"))):
+        records.extend(  # traces are visited in sorted path order
+            extract_records_from_trace_file(
+                trace_path,
+                run_id=run_id,
+                bot=bot,
+                question_id=question_id,
+                metaculus_id=metaculus_id,
+                question_url=question_url,
+            )
+        )
+    return records
+
+
+def _bot_name_from_dir(bot_dir: str) -> str:
+    """Basename minus a leading ``bot_``: ``.../bot_complex`` -> ``complex``."""
+    name = os.path.basename(bot_dir)  # NOTE: "" when bot_dir ends with a separator
+    return name[len("bot_") :] if name.startswith("bot_") else name
+
+
+def _question_dirs_flat(run_dir: str) -> list[str]:
+    """Return question dirs directly under ``run_dir`` (no ``bot_*`` level), sorted.
+
+    A "question dir" is any immediate subdirectory that contains at least one
+    ``traces_*.jsonl`` file. This handles flatter layouts (e.g. a backfill of one
+    bot's runs as ``<run_dir>/<question>/traces_*.jsonl``) where the ``bot_*``
+    grouping is absent. Hidden (dot-prefixed) entries are not matched by ``*``.
+    """
+    dirs: list[str] = []
+    for entry in sorted(glob.glob(os.path.join(run_dir, "*"))):
+        if os.path.isdir(entry) and glob.glob(os.path.join(entry, "traces_*.jsonl")):
+            dirs.append(entry)
+    return dirs
+
+
+def harvest_run(
+    run_dir: str, *, run_id: str | None = None, bot: str | None = None
+) -> list[CitationRecord]:
+    """Build a citation manifest from a whole traced run directory.
+
+    Primary layout is ``<run_dir>/bot_*/q_*/traces_*.jsonl``, deriving ``run_id``
+    from the run dir's name and ``bot`` from each ``bot_*`` subdir. If no
+    ``bot_*`` subdirs exist (stray ``bot_*`` *files* do not count), falls back to
+    a **flat layout** — ``<run_dir>/<question>/traces_*.jsonl`` — attributing
+    every question to a single bot (the ``bot`` argument, else the run dir's
+    name). Question provenance still comes from each dir's ``question.json``.
+
+    Returns the flat list of CitationRecords (one per URL occurrence); feed it
+    through :func:`url_extraction.dedupe_records` before capture for one row per
+    URL.
+    """
+    run_id = run_id or os.path.basename(os.path.normpath(run_dir))
+    records: list[CitationRecord] = []
+    # Directories only: a stray ``bot_*`` file must not suppress the flat fallback.
+    bot_dirs = sorted(filter(os.path.isdir, glob.glob(os.path.join(run_dir, "bot_*"))))
+    if bot_dirs:
+        for bot_dir in bot_dirs:
+            bot_name = _bot_name_from_dir(bot_dir)
+            for question_dir in sorted(glob.glob(os.path.join(bot_dir, "q_*"))):
+                records.extend(
+                    extract_records_from_question_dir(
+                        question_dir, run_id=run_id, bot=bot_name
+                    )
+                )
+        return records
+
+    # Flat fallback: no bot_* grouping. One bot, question dirs directly below.
+    bot_name = bot or os.path.basename(os.path.normpath(run_dir))
+    for question_dir in _question_dirs_flat(run_dir):
+        records.extend(
+            extract_records_from_question_dir(question_dir, run_id=run_id, bot=bot_name)
+        )
+    return records
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/url_extraction.py b/forecasting_tools/agents_and_tools/source_archive/ingest/url_extraction.py
index f97def1c..b8b06d3b 100644
--- a/forecasting_tools/agents_and_tools/source_archive/ingest/url_extraction.py
+++ b/forecasting_tools/agents_and_tools/source_archive/ingest/url_extraction.py
@@ -14,6 +14,9 @@
 import re
 from collections.abc import Iterable
 
+from forecasting_tools.agents_and_tools.source_archive.canonicalize import (
+    canonicalize_url,
+)
 from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
 
 # Markdown link target: [label](url) or [label](<url>), optionally with a title.
@@ -24,13 +27,30 @@
 # unbalanced, so trailing prose parens drop but ``..._(disambiguation)`` survives.
 _BARE = re.compile(r"(https?://[^\s<>\"'\]]+)", re.IGNORECASE)
 
-# Characters commonly stuck to the end of a URL in prose.
-_TRAILING = ".,;:!?'\""
+# Characters commonly stuck to the end of a URL in prose (incl. markdown-escape
+# residue: a trailing backslash or backtick).
+_TRAILING = ".,;:!?'\"\\`"
+
+
+def _cut_markdown_tail(url: str) -> str:
+    """Cut a URL at a markdown reference/link tail the bare-URL scan can swallow.
+
+    Bots sometimes emit ``…/story?id=123)[10](https://other…)`` where ``)[10](…``
+    is a markdown reference glued onto a real URL. The leading ``)`` was never
+    part of the URL, so cut at the first ``)[`` or ``](`` boundary.
+    """
+    cut = len(url)  # default: keep the whole string
+    for marker in (")[", "]("):
+        i = url.find(marker)
+        if i > 0:  # skips "not found" (-1) and a match at index 0
+            cut = min(cut, i)  # the earliest boundary wins
+    return url[:cut]
 
 
 def _trim(url: str) -> str:
     """Strip trailing punctuation, and a closing bracket/paren only when it is
     unbalanced (so Wikipedia-style ``..._(disambiguation)`` URLs survive)."""
+    url = _cut_markdown_tail(url)
     while url:
         last = url[-1]
         if last in _TRAILING:
@@ -45,7 +65,12 @@ def _trim(url: str) -> str:
 
 
 def extract_urls(text: str | None) -> list[str]:
-    """Return the distinct http(s) URLs in ``text``, in first-seen order."""
+    """Return the distinct http(s) URLs in ``text``, in first-seen order.
+
+    Distinctness is by *canonical* URL (see :func:`canonicalize_url`), so
+    ``…/x`` and ``…/x?utm_source=…`` count once; the original first-seen string
+    is returned.
+    """
     if not text:
         return []
     seen: set[str] = set()
@@ -53,8 +78,11 @@ def extract_urls(text: str | None) -> list[str]:
     for pattern in (_MD_LINK, _AUTOLINK, _BARE):
         for match in pattern.finditer(text):
             url = _trim(match.group(1))
-            if url and url not in seen:
-                seen.add(url)
+            if not url:
+                continue
+            key = canonicalize_url(url)
+            if key not in seen:
+                seen.add(key)
                 ordered.append(url)
     return ordered
 
@@ -67,6 +95,7 @@ def extract_citation_records(
     question_id: str | None = None,
     metaculus_id: str | None = None,
     question_url: str | None = None,
+    comment_id: str | None = None,
     trace: str | None = None,
     tool_name: str | None = None,
     origin: str | None = None,
@@ -81,6 +110,7 @@ def extract_citation_records(
             question_id=question_id,
             metaculus_id=metaculus_id,
             question_url=question_url,
+            comment_id=comment_id,
             trace=trace,
             tool_name=tool_name,
             origin=origin,
@@ -90,11 +120,14 @@ def extract_citation_records(
 
 
 def dedupe_records(records: Iterable[CitationRecord]) -> list[CitationRecord]:
-    """Keep the first record per URL, preserving order."""
+    """Keep the first record per *canonical* URL, preserving order."""
     seen: set[str] = set()
     out: list[CitationRecord] = []
     for r in records:
-        if r.url and r.url not in seen:
-            seen.add(r.url)
+        if not r.url:
+            continue
+        key = canonicalize_url(r.url)
+        if key not in seen:
+            seen.add(key)
             out.append(r)
     return out
diff --git a/forecasting_tools/agents_and_tools/source_archive/manifest.py b/forecasting_tools/agents_and_tools/source_archive/manifest.py
index 609c74d7..880ab161 100644
--- a/forecasting_tools/agents_and_tools/source_archive/manifest.py
+++ b/forecasting_tools/agents_and_tools/source_archive/manifest.py
@@ -10,6 +10,9 @@
 from collections.abc import Iterable, Iterator
 from pathlib import Path
 
+from forecasting_tools.agents_and_tools.source_archive.canonicalize import (
+    canonicalize_url,
+)
 from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
 from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
 from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
@@ -31,11 +34,19 @@ def loads(text: str) -> list[CitationRecord]:
 
 
 def unique_urls(records: Iterable[CitationRecord]) -> Iterator[str]:
-    """Yield each distinct URL once, preserving first-seen order."""
+    """Yield each distinct URL once, preserving first-seen order.
+
+    Distinctness is by *canonical* URL (see :func:`canonicalize_url`), so
+    near-duplicate links collapse to a single fetch; the original first-seen URL
+    string is what's yielded, for provenance.
+    """
     seen: set[str] = set()
     for r in records:
-        if r.url and r.url not in seen:
-            seen.add(r.url)
+        if not r.url:
+            continue
+        key = canonicalize_url(r.url)
+        if key not in seen:
+            seen.add(key)
             yield r.url
 
 
diff --git a/forecasting_tools/agents_and_tools/source_archive/models.py b/forecasting_tools/agents_and_tools/source_archive/models.py
index 8caad9ac..08c63cd6 100644
--- a/forecasting_tools/agents_and_tools/source_archive/models.py
+++ b/forecasting_tools/agents_and_tools/source_archive/models.py
@@ -8,14 +8,24 @@
 
 from pydantic import BaseModel, Field
 
+from forecasting_tools.agents_and_tools.source_archive.canonicalize import (
+    canonicalize_url,
+)
+
 
 def utcnow_iso() -> str:
     return datetime.now(timezone.utc).isoformat()
 
 
 def url_hash(url: str) -> str:
-    """Stable key for a URL — groups every capture of that URL together."""
-    return hashlib.sha256(url.encode("utf-8")).hexdigest()
+    """Stable key for a URL — groups every capture of that URL together.
+
+    The URL is canonicalized first (see :func:`canonicalize_url`) so trivially
+    different links — tracking params, a trailing slash, a ``#fragment``,
+    query-param order, host case — collapse onto one key instead of being
+    stored and counted as separate sources.
+    """
+    return hashlib.sha256(canonicalize_url(url).encode("utf-8")).hexdigest()
 
 
 def content_hash(html: str | bytes) -> str:
@@ -56,6 +66,9 @@ class StoredCapture(BaseModel):
     html_key: str | None = None
     screenshot_key: str | None = None
     markdown_key: str | None = None
+    # Set when this capture reuses another URL's blobs because the fetched
+    # content was byte-identical (cross-URL content dedup); holds that URL's hash.
+    content_alias_of: str | None = None
     first_seen: str = Field(default_factory=utcnow_iso)
     last_seen: str = Field(default_factory=utcnow_iso)
 
@@ -74,7 +87,11 @@ class CitationRecord(BaseModel):
     question_id: str | None = None
     metaculus_id: str | None = None
     question_url: str | None = None
+    comment_id: str | None = None  # Metaculus comment the URL was cited in
     trace: str | None = None
     tool_name: str | None = None
     origin: str | None = None
+    # Search provenance (populated by instrumented trace ingest, not comments):
+    query: str | None = None  # the search query the bot ran, if known
+    tool_args: dict[str, Any] | None = None  # full tool input (query + filters…)
     first_seen: str = Field(default_factory=utcnow_iso)
diff --git a/forecasting_tools/agents_and_tools/source_archive/pipeline.py b/forecasting_tools/agents_and_tools/source_archive/pipeline.py
index 1855f039..67f5817b 100644
--- a/forecasting_tools/agents_and_tools/source_archive/pipeline.py
+++ b/forecasting_tools/agents_and_tools/source_archive/pipeline.py
@@ -11,6 +11,7 @@
 from __future__ import annotations
 
 import logging
+import threading
 from collections.abc import Iterable
 
 from pydantic import BaseModel
@@ -71,6 +72,9 @@ def capture_url(self, url: str) -> CaptureOutcome:
         except FetchError as e:
             logger.info("fetch error for %s: %s", url, e)
             return CaptureOutcome(url=url, status="error", reason=str(e))
+        except Exception as e:  # never let one bad URL abort the whole run
+            logger.warning("unexpected error capturing %s: %s", url, e)
+            return CaptureOutcome(url=url, status="error", reason=f"unexpected: {e}")
 
         # Gate here so any fetcher is covered; the tiered fetcher also gates
         # internally to decide fallback, but this is the authoritative check.
@@ -92,3 +96,169 @@ def run(self, urls: Iterable[str]) -> PipelineSummary:
 
     def run_manifest(self, records: Iterable[CitationRecord]) -> PipelineSummary:
         return self.run(unique_urls(records))
+
+
+# An outcome whose error reason contains one of these means the browser itself
+# died (crash, OOM, or the machine slept and severed the CDP pipe) — not a
+# problem with the URL. Without recovery, every later URL in that worker's shard
+# would error against the dead browser, so we rebuild the browser and retry.
+_DEAD_BROWSER_MARKERS = (  # matched as case-sensitive substrings of the reason
+    "has been closed",
+    "Target page, context or browser",
+    "Browser.new_context",
+    "Connection closed",
+    "browser has been closed",  # already covered by "has been closed" above
+)
+
+
+def _browser_died(reason: str | None) -> bool:  # None or "" -> False
+    return any(m in (reason or "") for m in _DEAD_BROWSER_MARKERS)
+
+
+def _close_quietly(cm, timeout_s: float = 15.0) -> None:
+    """Tear down a fetcher context manager, but never block on it: a wedged
+    browser's ``close()`` can itself hang, so run it in a daemon thread and give
+    up after ``timeout_s`` (the leftover process is reaped at the end of the run).
+    """
+    done = threading.Event()  # set once ``__exit__`` has returned or raised
+
+    def _close() -> None:
+        try:
+            cm.__exit__(None, None, None)  # exit as if no exception were in flight
+        except Exception:
+            pass  # best-effort teardown: a failing close must not fail the caller
+        finally:
+            done.set()
+
+    threading.Thread(target=_close, daemon=True).start()
+    done.wait(timeout_s)  # returns at the timeout even if the close is still running
+
+
+def _reap_browser_descendants() -> None:
+    """Best-effort: kill automation Chromium descending from this process. Used
+    both to recover a wedged worker (kill its browser so the blocked sync call
+    errors out) and to sweep leftovers at end of run. No-op without psutil so it
+    never becomes a hard dependency.
+    """
+    try:
+        import os
+
+        import psutil
+    except Exception:
+        return  # psutil unavailable: silently do nothing
+    try:
+        for child in psutil.Process(os.getpid()).children(recursive=True):
+            try:
+                if "chrom" in (child.name() or "").lower():  # name-substring match
+                    child.kill()
+            except Exception:
+                pass  # best-effort: ignore a child we cannot inspect or kill
+    except Exception:
+        pass  # best-effort: enumerating descendants failed, nothing to reap
+
+
+def capture_urls_concurrent(
+    urls: Iterable[str],
+    store: ContentStore,
+    config,
+    fetcher_factory,
+    per_url_timeout: float | None = None,
+    reaper=_reap_browser_descendants,
+) -> PipelineSummary:
+    """Capture ``urls`` across ``config.concurrency`` worker threads.
+
+    Headless Chromium's sync API is **thread-affine** — a browser must be used on
+    the thread that created it — so each worker opens its **own** browser via
+    ``fetcher_factory(config)`` and runs all captures inline on its own thread.
+    The content store is shared (writes are keyed by URL hash and idempotent, so
+    shards never collide). Order of outcomes is not preserved.
+
+    Hang protection runs *out of band*: a supervisor thread watches each worker's
+    heartbeat and, if one is stuck on a single URL past ``per_url_timeout`` (a
+    wedged sync call whose Playwright timeout never fires — e.g. the machine
+    slept and severed the CDP pipe), it **kills the browser processes**. That is
+    an OS-level action (safe across threads, unlike touching Playwright objects),
+    so the blocked call errors out and the worker rebuilds via the same
+    dead-browser path — no single stuck worker can freeze the whole run.
+    """
+    import time
+    from concurrent.futures import ThreadPoolExecutor
+
+    url_list = list(urls)
+    workers = max(1, int(getattr(config, "concurrency", 1) or 1))
+    if per_url_timeout is None:
+        nav_s = float(getattr(config, "nav_timeout_ms", 30000)) / 1000.0
+        per_url_timeout = max(90.0, nav_s * 4)
+
+    # worker index -> monotonic start of its current URL (None when idle or done)
+    heartbeats: dict[int, float | None] = {}
+    hb_lock = threading.Lock()
+    stop = threading.Event()
+
+    def supervisor() -> None:
+        interval = max(0.5, min(per_url_timeout / 2, 30.0))
+        while not stop.wait(interval):
+            now = time.monotonic()
+            with hb_lock:
+                stalled = [
+                    w
+                    for w, t in heartbeats.items()
+                    if t is not None and now - t > per_url_timeout
+                ]
+            if stalled:
+                logger.warning(
+                    "worker(s) %s stuck > %.0fs on one URL; killing browsers to recover",
+                    stalled,
+                    per_url_timeout,
+                )
+                reaper()
+                with hb_lock:  # grace: don't reap again before workers rebuild
+                    for w in list(heartbeats):
+                        if heartbeats[w] is not None:
+                            heartbeats[w] = time.monotonic()  # fresh: reaper took time
+
+    def work(idx: int, shard: list[str]) -> list[CaptureOutcome]:
+        outcomes: list[CaptureOutcome] = []
+        cm = fetcher_factory(config)
+        pipeline = CapturePipeline(cm.__enter__(), store)
+        try:
+            for url in shard:
+                with hb_lock:
+                    heartbeats[idx] = time.monotonic()
+                outcome = pipeline.capture_url(url)
+                if outcome.status == "error" and _browser_died(outcome.reason):
+                    logger.warning(
+                        "browser died; rebuilding worker %d, retrying %s", idx, url
+                    )
+                    _close_quietly(cm)
+                    cm = fetcher_factory(config)
+                    pipeline = CapturePipeline(cm.__enter__(), store)
+                    with hb_lock:
+                        heartbeats[idx] = time.monotonic()
+                    outcome = pipeline.capture_url(url)  # one retry on a fresh browser
+                outcomes.append(outcome)
+        finally:
+            # Clear on every exit path: a worker that raised (e.g. a rebuild failed)
+            # must not look stalled forever and get healthy browsers reaped.
+            with hb_lock:
+                heartbeats[idx] = None
+            _close_quietly(cm)
+        return outcomes
+
+    supervisor_thread = threading.Thread(target=supervisor, daemon=True)
+    supervisor_thread.start()
+    try:
+        if workers == 1:
+            heartbeats[0] = None
+            return PipelineSummary(outcomes=work(0, url_list))
+
+        shards = [url_list[i::workers] for i in range(workers)]
+        for i in range(workers):
+            heartbeats[i] = None
+        summary = PipelineSummary()
+        with ThreadPoolExecutor(max_workers=workers) as pool:
+            futures = [pool.submit(work, i, shards[i]) for i in range(workers)]
+            for future in futures:
+                summary.outcomes.extend(future.result())
+        return summary
+    finally:
+        stop.set()
+        reaper()
diff --git a/forecasting_tools/agents_and_tools/source_archive/reindex.py b/forecasting_tools/agents_and_tools/source_archive/reindex.py
new file mode 100644
index 00000000..5a7472e6
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/reindex.py
@@ -0,0 +1,278 @@
+"""One-off reindex / dedup audit for an existing archive.
+
+This walks the canonical per-URL indexes already in a store and reports how much
+the smarter-dedup work (see ``ROADMAP.md`` Plan 1) would collapse, **without
+mutating anything by default**. It answers the practical question: *after exact
+canonicalization and content dedup, are there still many URLs that look like the
+same page?* — i.e. whether the fuzzy near-dup phase (D) is worth building.
+
+Three lenses:
+
+  - **Canonicalization (Phase A):** group stored URLs by :func:`canonicalize_url`.
+    Any group with >1 distinct raw URL is a set that *now* shares one key.
+  - **Content (Phase C):** group distinct canonical URLs by their latest content
+    hash. A group with >1 URL is byte-identical pages reachable at different URLs.
+  - **Near-dup signal (Phase D candidate):** of the URLs surviving both dedups,
+    group by ``scheme://host/path`` ignoring the query string. Big groups mean
+    "same path, differing query" pages that exact dedup leaves separate — the
+    cases fuzzy matching would target.
+
+Run it::
+
+    # against the configured S3 bucket (read-only audit)
+    WEB_ARCHIVE_S3_BUCKET=metaculus-web-archive WEB_ARCHIVE_AWS_PROFILE=default \\
+        python -m forecasting_tools.agents_and_tools.source_archive.reindex
+
+    # against a local capture dir
+    python -m forecasting_tools.agents_and_tools.source_archive.reindex --local ./archive
+
+    # additionally (re)build the content reverse index for existing captures
+    python -m forecasting_tools.agents_and_tools.source_archive.reindex --apply
+
+``--apply`` only writes the additive ``index/by-content/`` reverse index (safe,
+idempotent). It does **not** move blobs or re-key the per-URL indexes; that
+heavier migration is intentionally deferred (the archive is young).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from collections import defaultdict
+from urllib.parse import urlsplit
+
+from pydantic import BaseModel
+
+from forecasting_tools.agents_and_tools.source_archive.canonicalize import (
+    canonicalize_url,
+)
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
+from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
+    BlobStore,
+)
+
+
+class Cluster(BaseModel):  # one group of URLs that collapse together in the audit
+    key: str  # shared key: canonical URL, content hash, or scheme://host/path
+    urls: list[str]  # the group's member URLs
+
+
+class AnalysisReport(BaseModel):  # audit result; ``str()`` renders the text report
+    total_url_indexes: int = 0  # every per-URL index read, aliases included
+    alias_indexes: int = 0  # already-collapsed redirects (Phase B)
+    canonical_captures: int = 0  # distinct stored URLs with content
+    distinct_after_canonicalization: int = 0  # number of distinct canonical URLs
+    distinct_after_content_dedup: int = 0  # the above minus identical-content merges
+    canonicalization_clusters: list[Cluster] = []  # raw URLs that now share a key
+    content_clusters: list[Cluster] = []  # different URLs, identical content
+    near_dup_clusters: list[Cluster] = []  # same host+path, differing query
+
+    def __str__(self) -> str:
+        merged_a = sum(len(c.urls) - 1 for c in self.canonicalization_clusters)
+        merged_c = sum(len(c.urls) - 1 for c in self.content_clusters)
+        lines = [  # each cluster of n URLs counts as n - 1 merged in the sums above
+            "Source-archive dedup audit",
+            "=" * 40,
+            f"URL indexes scanned          : {self.total_url_indexes}",
+            f"  of which alias (redirect)  : {self.alias_indexes}",
+            f"  of which canonical capture : {self.canonical_captures}",
+            "",
+            f"Distinct URLs (raw)          : {self.canonical_captures}",
+            f"After canonicalization (A)   : {self.distinct_after_canonicalization}"
+            f"   (−{merged_a} merged)",
+            f"After content dedup (C)      : {self.distinct_after_content_dedup}"
+            f"   (−{merged_c} byte-identical)",
+            "",
+            f"Canonicalization clusters    : {len(self.canonicalization_clusters)}",
+            f"Identical-content clusters   : {len(self.content_clusters)}",
+            f"Near-dup candidates (D)      : {len(self.near_dup_clusters)}"
+            "  (same host+path, differing query)",
+        ]
+
+        def _show(title: str, clusters: list[Cluster], limit: int = 5) -> None:
+            if not clusters:
+                return  # empty sections are omitted from the report entirely
+            lines.append("")
+            lines.append(f"--- top {title} ---")
+            for c in sorted(clusters, key=lambda x: len(x.urls), reverse=True)[:limit]:
+                lines.append(f"  [{len(c.urls)}] {c.key}")  # largest clusters first
+                for u in c.urls[:4]:  # show at most four member URLs per cluster
+                    lines.append(f"        {u}")
+                if len(c.urls) > 4:
+                    lines.append(f"        … +{len(c.urls) - 4} more")
+
+        _show("canonicalization clusters", self.canonicalization_clusters)
+        _show("identical-content clusters", self.content_clusters)
+        _show("near-dup candidates (Phase D signal)", self.near_dup_clusters)
+        return "\n".join(lines)
+
+
+def _host_path(url: str) -> str:  # canonical URL with query and fragment dropped
+    parts = urlsplit(canonicalize_url(url))
+    return f"{parts.scheme}://{parts.netloc}{parts.path}"
+
+
+def iter_url_indexes(store: BlobStore, prefix: str):
+    """Yield ``(key, index_dict)`` for each per-URL index, skipping the reverse
+    content index under ``index/by-content/`` and blobs that are not JSON objects."""
+    index_prefix = f"{prefix.rstrip('/')}/index/"
+    for key in store.list_keys(index_prefix):
+        if key.endswith(".json") and not key.startswith(f"{index_prefix}by-content/"):
+            try:
+                index = json.loads(store.get(key).decode("utf-8"))
+            except (json.JSONDecodeError, UnicodeDecodeError):
+                continue  # unreadable index blob: skip it rather than abort the scan
+            if isinstance(index, dict):  # consumers call ``.get`` on every index
+                yield key, index
+
+
+def analyze(store: BlobStore, config: ArchiveConfig) -> AnalysisReport:
+    """Audit the store's per-URL indexes and report what each dedup lens merges."""
+    report = AnalysisReport()
+    by_canonical: dict[str, list[str]] = defaultdict(list)  # canonical -> raw URLs
+    by_content: dict[str, list[str]] = defaultdict(list)  # hash -> canonical URLs
+    for _key, index in iter_url_indexes(store, config.s3_prefix):
+        report.total_url_indexes += 1
+        if index.get("alias_of"):
+            report.alias_indexes += 1
+            continue
+        url = index.get("url")
+        if not url or not index.get("captures"):
+            continue
+        report.canonical_captures += 1
+        canon = canonicalize_url(url)
+        by_canonical[canon].append(url)
+        ch = index.get("latest_content_hash")
+        if ch:
+            by_content[ch].append(canon)
+
+    report.distinct_after_canonicalization = len(by_canonical)
+    report.canonicalization_clusters = [
+        Cluster(key=k, urls=sorted(set(v)))
+        for k, v in by_canonical.items()
+        if len(set(v)) > 1
+    ]
+
+    # Content dedup runs on canonical URLs, so variants merged by (A) aren't recounted.
+    content_groups = {k: sorted(set(v)) for k, v in by_content.items()}
+    report.content_clusters = [
+        Cluster(key=k, urls=v) for k, v in content_groups.items() if len(v) > 1
+    ]
+    # distinct pages after content dedup = canonical URLs minus those merged away
+    merged_by_content = sum(len(v) - 1 for v in content_groups.values() if len(v) > 1)
+    report.distinct_after_content_dedup = max(
+        0, report.distinct_after_canonicalization - merged_by_content
+    )
+
+    # Phase D signal: among canonical URLs, same host+path but differing query.
+    by_host_path: dict[str, set[str]] = defaultdict(set)
+    for u in by_canonical:  # its keys are exactly the surviving canonical URLs
+        by_host_path[_host_path(u)].add(u)
+    report.near_dup_clusters = [
+        Cluster(key=k, urls=sorted(v)) for k, v in by_host_path.items() if len(v) > 1
+    ]
+    return report
+
+
+def rebuild_content_index(
+    store: BlobStore, config: ArchiveConfig, *, apply: bool
+) -> int:
+    """(Re)build ``index/by-content/`` from existing captures. Returns the number
+    of content groups (that would be) written. Additive and idempotent."""
+    cstore = ContentStore(store, config)
+    groups: dict[str, list[tuple[str, str]]] = defaultdict(list)  # hash -> (uh, url)
+    for _key, index in iter_url_indexes(store, config.s3_prefix):
+        if index.get("alias_of") or not index.get("captures"):
+            continue  # aliases and capture-less indexes contribute nothing
+        uh = index.get("url_hash")
+        url = index.get("url")
+        ch = index.get("latest_content_hash")
+        if uh and url and ch:  # only fully-described indexes are grouped
+            groups[ch].append((uh, url))
+
+    written = 0
+    for ch, members in groups.items():
+        written += 1  # counted even on a dry run (``apply=False``)
+        if not apply:
+            continue
+        owner_uh, owner_url = members[0]  # ``owner_url`` is not used below
+        # Re-register every member; the first becomes canonical owner.
+        for uh, url in members:
+            blob_keys = None  # non-owner members register without blob keys
+            if uh == owner_uh:
+                cap = index_blob_keys(store, config, owner_uh, ch)
+                blob_keys = cap
+            cstore._register_content(ch, uh, url, blob_keys)
+    return written
+
+
+def index_blob_keys(
+    store: BlobStore, config: ArchiveConfig, uh: str, ch: str
+) -> dict | None:  # blob keys of capture ``ch`` under URL hash ``uh``, else None
+    cstore = ContentStore(store, config)
+    index = cstore._read_index(uh)
+    if not index:
+        return None  # no (or empty) index for this URL hash
+    cap = (index.get("captures") or {}).get(ch)
+    if not cap:
+        return None  # this URL has no capture recorded under that content hash
+    return {
+        "html": cap.get("html_key"),
+        "markdown": cap.get("markdown_key"),
+        "screenshot": cap.get("screenshot_key"),
+    }
+
+
+def _build_store(local_dir: str | None, bucket: str | None, config: ArchiveConfig):
+    if local_dir:  # a local dir takes precedence over any bucket setting
+        from forecasting_tools.agents_and_tools.source_archive.storage import (
+            LocalBlobStore,
+        )
+
+        return LocalBlobStore(local_dir)
+    bucket = bucket or config.s3_bucket  # explicit argument wins over the config
+    if not bucket:
+        sys.exit(  # raises SystemExit carrying this message
+            "No S3 bucket configured. Set WEB_ARCHIVE_S3_BUCKET (or pass --bucket), "
+            "or use --local DIR."
+        )
+    from forecasting_tools.agents_and_tools.source_archive.storage import S3BlobStore
+
+    return S3BlobStore(bucket, config=config)
+
+
+def main(argv: list[str] | None = None) -> int:  # CLI entry point; returns exit code 0
+    parser = argparse.ArgumentParser(
+        prog="source-archive-reindex",
+        description="Audit (and optionally rebuild) dedup structures for an "
+        "existing archive.",
+    )
+    parser.add_argument("--local", metavar="DIR", help="audit a local capture dir")
+    parser.add_argument("--bucket", help="override WEB_ARCHIVE_S3_BUCKET")
+    parser.add_argument(
+        "--apply",
+        action="store_true",
+        help="rebuild index/by-content/ for existing captures (additive)",
+    )
+    parser.add_argument("--json", action="store_true", help="emit the report as JSON")
+    args = parser.parse_args(argv)  # ``argv=None`` makes argparse read ``sys.argv``
+
+    config = ArchiveConfig.from_env()
+    store = _build_store(args.local, args.bucket, config)
+
+    report = analyze(store, config)  # the audit always runs, with or without --apply
+    if args.json:
+        print(report.model_dump_json(indent=2))
+    else:
+        print(report)
+
+    if args.apply:  # the rebuild happens only after the report has been printed
+        n = rebuild_content_index(store, config, apply=True)
+        print(f"\nRebuilt index/by-content/ for {n} content group(s).")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/forecasting_tools/agents_and_tools/source_archive/reports.py b/forecasting_tools/agents_and_tools/source_archive/reports.py
new file mode 100644
index 00000000..ba75b4b6
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/reports.py
@@ -0,0 +1,72 @@
+"""Persist each capture run's per-URL outcomes to ``reports/<run_id>.json``.
+
+The coverage report's job is to surface sources we should be collecting. A cited
+source we have not archived falls into two very different buckets:
+
+- **never fetched** — it was harvested into a manifest but no capture run ever
+  attempted it. This is the real "we should go collect this" signal.
+- **fetched but failed** — we tried and the fetch/quality gate rejected it
+  (Cloudflare, PDF, 404…). A capture problem, not a collection problem.
+
+Without persisted run outcomes the two are indistinguishable. Writing each run's
+outcomes here lets coverage tell them apart.
+"""
+
+from __future__ import annotations
+
+import json
+
+from forecasting_tools.agents_and_tools.source_archive.canonicalize import (
+    canonicalize_url,
+)
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
+    BlobStore,
+)
+
+CAPTURED_STATUSES = {"stored", "deduped", "cache_hit"}  # count as a capture
+FAILED_STATUSES = {"quality_failed", "error"}  # tried, but fetch/quality gate rejected
+
+
+def report_key(run_id: str, config: ArchiveConfig) -> str:
+    return "/".join([config.s3_prefix.rstrip("/"), "reports", f"{run_id}.json"])
+
+
+def write_run_report(
+    store: BlobStore, run_id: str, summary, config: ArchiveConfig
+) -> str:
+    """Store a ``PipelineSummary``'s per-URL outcomes as JSON; return the key."""
+    records = []
+    for outcome in summary.outcomes:
+        record = {"url": outcome.url, "status": outcome.status}
+        record["reason"] = getattr(outcome, "reason", "")
+        records.append(record)
+    destination = report_key(run_id, config)
+    payload = json.dumps(records, indent=2).encode("utf-8")
+    store.put(destination, payload, content_type="application/json")
+    return destination
+
+
+def read_outcomes(store: BlobStore, config: ArchiveConfig) -> dict[str, str]:
+    """Map canonical URL -> last known capture status across all run reports.
+
+    A captured status is never replaced by a failed one; otherwise the report
+    listed last wins. Unreadable or malformed reports/rows are skipped.
+    """
+    prefix = config.s3_prefix.rstrip("/")
+    out: dict[str, str] = {}
+    for key in store.list_keys(f"{prefix}/reports/"):
+        if not key.endswith(".json"):
+            continue
+        try:
+            rows = json.loads(store.get(key).decode("utf-8"))
+        except (UnicodeDecodeError, ValueError):
+            continue
+        for r in (rows if isinstance(rows, list) else ()):
+            url = canonicalize_url(r.get("url") or "") if isinstance(r, dict) else ""
+            if not url:
+                continue
+            status = r.get("status") or ""
+            if status in CAPTURED_STATUSES or out.get(url) not in CAPTURED_STATUSES:
+                out[url] = status
+    return out
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py
index c70d676f..7553c972 100644
--- a/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py
+++ b/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py
@@ -6,6 +6,7 @@
 
 from __future__ import annotations
 
+from collections.abc import Iterable
 from typing import Protocol, runtime_checkable
 
 
@@ -18,3 +19,7 @@ def put(
     def get(self, key: str) -> bytes: ...
 
     def exists(self, key: str) -> bool: ...
+
+    def list_keys(self, prefix: str = "") -> Iterable[str]:
+        """Yield every stored key beginning with ``prefix`` (for reindex/audit)."""
+        ...  # implementations may return a list or yield lazily
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py
index 429333ab..d85b0b0b 100644
--- a/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py
+++ b/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py
@@ -22,3 +22,13 @@ def get(self, key: str) -> bytes:
 
     def exists(self, key: str) -> bool:
         return self._path(key).exists()
+
+    def list_keys(self, prefix: str = "") -> list[str]:
+        """Return sorted root-relative POSIX keys of files starting with ``prefix``."""
+        root = self.root
+        if not root.exists():
+            return []
+        # Walk files only, then express each as a root-relative key before filtering.
+        files = (p for p in root.rglob("*") if p.is_file())
+        keys = (f.relative_to(root).as_posix() for f in files)
+        return sorted(k for k in keys if k.startswith(prefix))
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py
index 0d4822b0..10914b94 100644
--- a/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py
+++ b/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py
@@ -58,3 +58,9 @@ def exists(self, key: str) -> bool:
             if code in ("404", "NoSuchKey", "NotFound"):
                 return False
             raise
+
+    def list_keys(self, prefix: str = ""):
+        """Yield each object key under ``prefix``, following list pagination."""
+        pager = self._get_client().get_paginator("list_objects_v2")
+        for page in pager.paginate(Bucket=self.bucket, Prefix=prefix):
+            yield from (entry["Key"] for entry in page.get("Contents", []))
diff --git a/forecasting_tools/agents_and_tools/source_archive/viewer.py b/forecasting_tools/agents_and_tools/source_archive/viewer.py
new file mode 100644
index 00000000..fe604a2b
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/viewer.py
@@ -0,0 +1,409 @@
+"""Streamlit viewer for the source archive.
+
+Browse what the capture pipeline stored (S3 or local): pick a captured URL and see its
+**screenshot, markdown, and HTML** side by side, with the question/bot it came
+from. Reads provenance from the run manifests and resolves each URL's latest
+capture through its per-URL index — no local file wrangling.
+
+Run it::
+
+    # uses the same env as the rest of the archive (WEB_ARCHIVE_S3_BUCKET, etc.)
+    AWS_PROFILE=default WEB_ARCHIVE_S3_BUCKET=metaculus-web-archive \\
+      streamlit run forecasting_tools/agents_and_tools/source_archive/viewer.py
+
+Nothing here is deployment-specific: bucket/prefix/profile come from
+``ArchiveConfig.from_env()``.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+# `streamlit run <file>` puts only the script's own directory on sys.path, not
+# the repo root — so make `import forecasting_tools` work whether the package is
+# pip-installed or just checked out. (viewer.py -> source_archive -> agents_and_tools
+# -> forecasting_tools -> <repo root>.)
+_REPO_ROOT = Path(__file__).resolve().parents[3]
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+
+import pandas as pd  # noqa: E402
+import streamlit as st  # noqa: E402
+
+from forecasting_tools.agents_and_tools.source_archive.config import (  # noqa: E402
+    ArchiveConfig,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import (  # noqa: E402
+    url_hash,
+)
+
+# --- S3 access (cached) ----------------------------------------------------
+
+
+@st.cache_resource(show_spinner=False)
+def _client(profile: str | None, region: str | None):
+    """Return an S3 client for the profile/region (empty values become None)."""
+    import boto3
+
+    session = boto3.Session(profile_name=profile or None, region_name=region or None)
+    return session.client("s3")
+
+
+def _cfg() -> ArchiveConfig:  # rebuilt from the environment on every call
+    return ArchiveConfig.from_env()
+
+
+@st.cache_data(show_spinner=False)
+def _list_keys(bucket: str, prefix: str) -> list[str]:
+    """List archive keys that start with ``prefix``.
+
+    Reads the local capture dir when ``cfg.local_dir`` is set (``bucket`` is then
+    unused); otherwise lists the S3 bucket through the cached client.
+    """
+    cfg = _cfg()
+    if cfg.local_dir:  # filesystem-backed archive — list matching files
+        root = Path(cfg.local_dir)
+        if not root.exists():
+            return []
+        # Compute each file's root-relative key once, then filter on the prefix.
+        rel = (p.relative_to(root).as_posix() for p in root.rglob("*") if p.is_file())
+        return [key for key in rel if key.startswith(prefix)]
+    # Let boto3's paginator follow continuation tokens (as S3BlobStore.list_keys
+    # does) rather than a hand-rolled loop, which would re-request page one
+    # forever if a truncated page arrived without NextContinuationToken.
+    s3 = _client(cfg.aws_profile, cfg.aws_region)
+    keys: list[str] = []
+    for page in s3.get_paginator("list_objects_v2").paginate(
+        Bucket=bucket, Prefix=prefix
+    ):
+        keys.extend(obj["Key"] for obj in page.get("Contents", []))
+    return keys
+
+
+@st.cache_data(show_spinner=False)
+def _get_bytes(bucket: str, key: str) -> bytes | None:  # None == no such blob
+    cfg = _cfg()
+    if cfg.local_dir:
+        p = Path(cfg.local_dir) / key
+        return p.read_bytes() if p.is_file() else None
+    s3 = _client(cfg.aws_profile, cfg.aws_region)
+    try:
+        return s3.get_object(Bucket=bucket, Key=key)["Body"].read()
+    except s3.exceptions.NoSuchKey:  # auth/network errors must surface
+        return None
+
+
+# Metaculus question id -> review URL. Derived at display time (not stored) so
+# there's no redundant, drift-prone URL column in S3.
+_METACULUS_QUESTION_BASE = "https://www.metaculus.com/questions/"
+
+
+def _metaculus_url(metaculus_id) -> str:
+    """Build the question page URL, or an empty string when the id is missing/null."""
+    is_missing = metaculus_id in (None, "", "null")
+    return "" if is_missing else f"{_METACULUS_QUESTION_BASE}{metaculus_id}/"
+
+
+def _comment_url(metaculus_id, comment_id) -> str:
+    """Deep-link to the citing comment; empty string when either id is missing."""
+    question_url = _metaculus_url(metaculus_id)
+    # Short-circuit: the comment id is only inspected once the question resolves.
+    linkable = bool(question_url) and comment_id not in (None, "", "null")
+    return f"{question_url}#comment-{comment_id}" if linkable else ""
+
+
+@st.cache_data(show_spinner="Loading manifests…")
+def _manifest_rows(bucket: str, prefix: str) -> pd.DataFrame:
+    """Every (question, bot, url) the bots cited (bad manifest lines are skipped)."""
+    rows = []
+    for key in _list_keys(bucket, f"{prefix}/manifests/"):
+        text = (_get_bytes(bucket, key) or b"").decode("utf-8", "replace")
+        for line in text.splitlines():
+            try:
+                r = json.loads(line)
+            except ValueError:  # blank or malformed JSONL line
+                continue
+            if not isinstance(r, dict):
+                continue
+            rows.append(
+                {
+                    "question": r.get("question_id") or "(none)",
+                    "bot": r.get("bot") or "(none)",
+                    "run_id": r.get("run_id") or "",
+                    "origin": r.get("origin") or "",
+                    "query": r.get("query") or "",
+                    "metaculus": _metaculus_url(r.get("metaculus_id")),
+                    "comment": _comment_url(r.get("metaculus_id"), r.get("comment_id")),
+                    "url": r.get("url", ""),
+                    "question_url": r.get("question_url") or "",
+                    "tool_args": r.get("tool_args"),
+                }
+            )
+    df = pd.DataFrame(rows)
+    if not df.empty:
+        # Keep distinct provenance (a URL cited via two origins/runs = two rows).
+        df = df.drop_duplicates(
+            subset=["question", "bot", "run_id", "origin", "url"]
+        ).reset_index(drop=True)
+    return df
+
+
+def _scrape_report(bucket: str, prefix: str, view: pd.DataFrame):
+    """Tally, per question, how many cited URLs each fetch backend captured.
+
+    Every row of ``view`` counts toward its question's ``urls``. Rows whose URL has
+    a stored capture also count toward ``captured`` and toward one backend bucket
+    chosen from the capture's ``fetcher``: ``playwright`` (self-hosted, free),
+    ``firecrawl`` (the paid fallback, ~1 credit per page) or ``other``.
+
+    Caveat: the index only records successful captures, so Firecrawl attempts
+    that failed the quality gate are not counted here (billed attempts aren't
+    persisted yet — the UI carries a note about this).
+
+    Returns a dict keyed by question; each value holds the counters above plus
+    the ``question`` itself.
+    """
+    counter_names = ("urls", "captured", "playwright", "firecrawl", "other")
+    per_q: dict[str, dict] = {}
+    for _, row in view.iterrows():
+        capture = _index(bucket, prefix, row["url"])
+        question = row["question"]
+        # First sighting of a question starts all of its counters at zero.
+        if question not in per_q:
+            per_q[question] = {
+                "question": question,
+                **dict.fromkeys(counter_names, 0),
+            }
+        tally = per_q[question]
+        tally["urls"] += 1
+        if capture:
+            tally["captured"] += 1
+            backend = (capture.get("fetcher") or "").lower()
+            # Anything that is not one of the two named backends lands in "other".
+            if backend not in ("playwright", "firecrawl"):
+                backend = "other"
+            tally[backend] += 1
+    return per_q
+
+
+@st.cache_data(show_spinner=False)
+def _index(bucket: str, prefix: str, url: str) -> dict | None:
+    """Return the capture record for a URL's latest content hash, or None if absent."""
+    raw = _get_bytes(bucket, f"{prefix}/index/{url_hash(url)}.json")
+    if not raw:
+        return None
+    entry = json.loads(raw.decode("utf-8"))
+    # ``captures`` maps content hash -> capture record; a null/missing map is empty.
+    by_hash = entry.get("captures") or {}
+    return by_hash.get(entry.get("latest_content_hash"))
+
+
+# --- UI --------------------------------------------------------------------
+
+
+def main() -> None:
+    """Render the viewer: filters, cited-URL table, optional cost report, inspector."""
+    st.set_page_config(page_title="Source Archive Viewer", layout="wide")
+    cfg = _cfg()
+    st.title("📚 Source Archive Viewer")
+    # A configured local capture dir takes precedence over the S3 bucket.
+    location = cfg.local_dir or cfg.s3_bucket
+    if not location:
+        st.error(
+            "No archive configured. Set WEB_ARCHIVE_LOCAL_DIR (a local capture "
+            "directory) or WEB_ARCHIVE_S3_BUCKET (S3), then reload."
+        )
+        st.stop()
+    if cfg.local_dir:
+        st.caption(f"📂 local: {cfg.local_dir}/{cfg.s3_prefix}")
+    else:
+        st.caption(
+            f"s3://{cfg.s3_bucket}/{cfg.s3_prefix}  ·  "
+            f"profile={cfg.aws_profile or 'default'}"
+        )
+
+    with st.sidebar:
+        st.header("Filters")
+        if st.button("🔄 Refresh"):
+            st.cache_data.clear()  # drops cached listings, blobs, manifests, indexes
+            st.rerun()
+
+    df = _manifest_rows(location, cfg.s3_prefix)
+    if df.empty:
+        st.warning("No manifests found under this prefix yet. Run a capture first.")
+        st.stop()
+
+    with st.sidebar:
+        bots = sorted(df["bot"].unique())
+        qs = sorted(df["question"].unique())
+        sel_bots = st.multiselect("Bot", bots, default=bots)
+        sel_qs = st.multiselect("Question", qs, default=qs)
+        search = st.text_input("URL contains")
+
+    view = df[df["bot"].isin(sel_bots) & df["question"].isin(sel_qs)]
+    if search:
+        view = view[view["url"].str.contains(search, case=False, na=False)]
+    view = view.reset_index(drop=True)
+
+    st.subheader(f"{len(view)} cited URL(s)")
+    # Resolve capture status for the filtered rows (cached per-URL).
+    if len(view) > 300:
+        st.info(
+            "Showing 300 of %d — narrow with the filters for capture details."
+            % len(view)
+        )
+    table = []
+    for _, row in view.head(300).iterrows():
+        cap = _index(location, cfg.s3_prefix, row["url"])
+        table.append(
+            {
+                "question": row["question"],
+                "bot": row["bot"],
+                "run_id": row["run_id"],
+                "origin": row["origin"],
+                "captured": "✅" if cap else "—",
+                "fetcher": (cap or {}).get("fetcher", ""),
+                "captured_at": (cap or {}).get("captured_at", "")[:19],
+                "metaculus": row["metaculus"],
+                "comment": row["comment"],
+                "url": row["url"],
+            }
+        )
+    st.dataframe(
+        pd.DataFrame(table),
+        use_container_width=True,
+        hide_index=True,
+        column_config={
+            # Show the full link address as the clickable text (not a label).
+            "url": st.column_config.LinkColumn("url"),
+            "metaculus": st.column_config.LinkColumn(
+                "metaculus", display_text="question ↗"
+            ),
+            "comment": st.column_config.LinkColumn("comment", display_text="comment ↗"),
+        },
+    )
+    # The cost report covers the same first 300 rows that the table resolves.
+    if st.sidebar.checkbox("💸 Show scraping cost"):
+        st.subheader("💸 Scraping cost (filtered set)")
+        rate = st.number_input(
+            "Firecrawl cost per page ($)",
+            min_value=0.0,
+            value=0.001,
+            step=0.0005,
+            format="%.4f",
+            help="Self-hosted Playwright is free; this prices the Firecrawl "
+            "fallback. Adjust to your plan's credit rate.",
+        )
+        per_q = _scrape_report(location, cfg.s3_prefix, view.head(300))
+        rows, t_fc, t_pw, t_cap, t_url = [], 0, 0, 0, 0
+        for agg in sorted(per_q.values(), key=lambda a: a["question"]):
+            rows.append(
+                {
+                    "question": agg["question"],
+                    "urls": agg["urls"],
+                    "captured": agg["captured"],
+                    "playwright (free)": agg["playwright"],
+                    "firecrawl (paid)": agg["firecrawl"],
+                    "firecrawl $": round(agg["firecrawl"] * rate, 4),
+                }
+            )
+            t_fc += agg["firecrawl"]
+            t_pw += agg["playwright"]
+            t_cap += agg["captured"]
+            t_url += agg["urls"]
+        st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)
+        a, b, c = st.columns(3)
+        a.metric("Captured", f"{t_cap}/{t_url}")
+        b.metric("Firecrawl pages", t_fc, help="Playwright pages are free")
+        c.metric("Est. Firecrawl cost", f"${t_fc * rate:.4f}")
+        st.caption(
+            f"Playwright (free): {t_pw} · Firecrawl (paid): {t_fc}.  "
+            "⚠️ Only **successful** captures carry a fetcher in the index, so "
+            "Firecrawl attempts that failed the quality gate aren't counted — "
+            "billed-attempt tracking needs the pipeline to persist fetch attempts."
+        )
+    # The inspector offers every filtered row, not only the first 300.
+    st.divider()
+    st.subheader("Inspect a capture")
+    labels = [f"[{r['question']}] {r['url']}" for _, r in view.iterrows()]
+    if not labels:
+        st.stop()
+    choice = st.selectbox("URL", range(len(labels)), format_func=lambda i: labels[i])
+    row = view.iloc[choice]
+    url = row["url"]
+    cap = _index(location, cfg.s3_prefix, url)
+
+    c1, c2 = st.columns([3, 2])
+    with c1:
+        st.markdown(f"**URL:** [{url}]({url})")
+        st.markdown(
+            f"**Question:** `{row['question']}` · **Bot:** `{row['bot']}` · "
+            f"**Origin:** `{row['origin'] or '—'}`"
+        )
+        st.markdown(f"**Run:** `{row['run_id'] or '—'}`")
+        review = row["metaculus"] or row["question_url"]
+        if review:
+            st.markdown(f"**Metaculus question:** [{review}]({review})")
+        if row["comment"]:
+            st.markdown(f"**Cited in comment:** [{row['comment']}]({row['comment']})")
+        if row["query"]:
+            st.markdown(f"**Search query:** `{row['query']}`")
+        if row.get("tool_args"):
+            st.markdown(f"**Tool args:** `{row['tool_args']}`")
+    with c2:
+        if cap:
+            st.markdown(
+                f"**Captured:** {cap.get('captured_at','')[:19]}  ·  "
+                f"**Fetcher:** {cap.get('fetcher','')}  ·  "
+                f"**HTTP:** {cap.get('status_code','?')}"
+            )
+
+    if not cap:
+        st.warning(
+            "No stored capture for this URL — it failed the quality gate / errored, "
+            "or hasn't been captured yet."
+        )
+        st.stop()
+
+    tab_shot, tab_md, tab_html = st.tabs(["🖼 Screenshot", "📝 Markdown", "🌐 HTML"])
+
+    with tab_shot:
+        key = cap.get("screenshot_key")
+        data = _get_bytes(location, key) if key else None
+        if data:
+            st.download_button("Download .webp", data, file_name="screenshot.webp")
+            st.image(data, use_container_width=True)
+        else:
+            st.info("No screenshot stored.")
+
+    with tab_md:
+        key = cap.get("markdown_key")
+        data = _get_bytes(location, key) if key else None
+        if data:
+            text = data.decode("utf-8", "replace")
+            st.download_button("Download .md", data, file_name="page.md")
+            st.caption(f"{len(text):,} chars")
+            st.markdown(text)
+        else:
+            st.info("No markdown stored.")
+
+    with tab_html:
+        key = cap.get("html_key")
+        data = _get_bytes(location, key) if key else None
+        if data:
+            html = data.decode("utf-8", "replace")
+            st.download_button("Download .html", data, file_name="page.html")
+            st.caption(
+                f"{len(html):,} chars · rendered below (CSS/images load from the "
+                "original site and may not all resolve — the screenshot is the "
+                "faithful visual record)."
+            )
+            st.components.v1.html(html, height=800, scrolling=True)
+        else:
+            st.info("No HTML stored.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/poetry.lock b/poetry.lock
index c0fcff5e..f741fa95 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -867,6 +867,29 @@ files = [
 [package.dependencies]
 colorama = {version = "*", markers = "platform_system == \"Windows\""}
 
+[[package]]
+name = "cloakbrowser"
+version = "0.3.32"
+description = "Stealth Chromium that passes every bot detection test. Drop-in Playwright replacement with source-level fingerprint patches."
+optional = true
+python-versions = ">=3.9"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "cloakbrowser-0.3.32-py3-none-any.whl", hash = "sha256:5a993ee019bfcd00d545d7d6d51837646bcb1e8226545acdf0b543b38a8883df"},
+    {file = "cloakbrowser-0.3.32.tar.gz", hash = "sha256:7361e2f5e366f651b5d54aad3ac13e145462110e0956b538ae3686916c36535a"},
+]
+
+[package.dependencies]
+httpx = ">=0.24"
+playwright = ">=1.40"
+
+[package.extras]
+dev = ["pytest (>=7.0)", "pytest-asyncio (>=0.23)"]
+geoip = ["geoip2 (>=4.0)", "socksio (>=1.0)"]
+patchright = ["patchright (>=1.40)"]
+serve = ["aiohttp (>=3.9)", "websockets (>=12.0)"]
+
 [[package]]
 name = "colorama"
 version = "0.4.6"
@@ -5371,6 +5394,46 @@ dev = ["coverage[toml] (==7.10.7)", "cryptography (>=3.4.0)", "pre-commit", "pyt
 docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"]
 tests = ["coverage[toml] (==7.10.7)", "pytest (>=8.4.2,<9.0.0)"]
 
+[[package]]
+name = "pymupdf"
+version = "1.27.2.3"
+description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents."
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "pymupdf-1.27.2.3-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fc1bc3cae6e9e150b0dbb0a9221bdfd411d65f0db2fe359eaa22467d7cc2a05f"},
+    {file = "pymupdf-1.27.2.3-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:660d93cb6da5bbddf11d3982ae27745dd3a9902d9f24cdb69adab83962294b5a"},
+    {file = "pymupdf-1.27.2.3-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1dd460a3ae4597a755f00a3bd9771f5ebf1531dc111f6a36bf05dd00a6b84425"},
+    {file = "pymupdf-1.27.2.3-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:857842b4888827bd6155a1131341b2822a7ebe9a8c15a975fd7d490d7a64a30c"},
+    {file = "pymupdf-1.27.2.3-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:580983849c64a08d08344ca3d1580e87c01f046a8392421797bc850efd72a5b6"},
+    {file = "pymupdf-1.27.2.3-cp310-abi3-win32.whl", hash = "sha256:a5c1088a87189891a4946ab314a14b7934ac4c5b6077f7e74ebee956f8906d0e"},
+    {file = "pymupdf-1.27.2.3-cp310-abi3-win_amd64.whl", hash = "sha256:d20f68ef15195e073071dbc4ae7455257c7889af7584e39df490c0a92728526e"},
+    {file = "pymupdf-1.27.2.3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:77691604c5d1d0233827139bbcdea61fd57879c84712b8e49b1f45520f7ab9c2"},
+    {file = "pymupdf-1.27.2.3.tar.gz", hash = "sha256:7a92faa25129e8bbec5e50eeb9214f187665428c31b05c4ef6e36c58c0b1c6d2"},
+]
+
+[[package]]
+name = "pymupdf4llm"
+version = "0.3.4"
+description = "PyMuPDF Utilities for LLM/RAG"
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "pymupdf4llm-0.3.4-py3-none-any.whl", hash = "sha256:0517492f82af978541162ade20fc54649cdca52acd478e33b97cb6171d69956f"},
+    {file = "pymupdf4llm-0.3.4.tar.gz", hash = "sha256:48d396a5fb3c14351493c7f1dd25b2a843efdbdc4526e489ee100643a2cebec1"},
+]
+
+[package.dependencies]
+pymupdf = ">=1.27.1"
+tabulate = "*"
+
+[package.extras]
+layout = ["pymupdf-layout (>=1.27.1)"]
+
 [[package]]
 name = "pyparsing"
 version = "3.3.2"
@@ -6659,6 +6722,22 @@ typepy = ">=1.2.0,<3"
 logging = ["loguru (>=0.4.1,<1)"]
 test = ["pytablewriter (>=0.46)", "pytest"]
 
+[[package]]
+name = "tabulate"
+version = "0.10.0"
+description = "Pretty-print tabular data"
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "tabulate-0.10.0-py3-none-any.whl", hash = "sha256:f0b0622e567335c8fabaaa659f1b33bcb6ddfe2e496071b743aa113f8774f2d3"},
+    {file = "tabulate-0.10.0.tar.gz", hash = "sha256:e2cfde8f79420f6deeffdeda9aaec3b6bc5abce947655d17ac662b126e48a60d"},
+]
+
+[package.extras]
+widechars = ["wcwidth"]
+
 [[package]]
 name = "tcolorpy"
 version = "0.1.7"
@@ -7818,9 +7897,9 @@ test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more_it
 type = ["pytest-mypy (>=1.0.1) ; platform_python_implementation != \"PyPy\""]
 
 [extras]
-source-archive = ["boto3", "firecrawl-py", "playwright", "trafilatura"]
+source-archive = ["boto3", "cloakbrowser", "firecrawl-py", "playwright", "pymupdf4llm", "trafilatura"]
 
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.11"
-content-hash = "2c075213be57a94057cbb6ba934e4b0ea8b0df91d052739d2313f6d893a50c0e"
+content-hash = "d9abd6c9194bdd4769704c8c60f48f438f9d77370b35ee739555d3b9fd3e5e22"
diff --git a/pyproject.toml b/pyproject.toml
index d15ad580..c8b322f6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,9 +57,16 @@ boto3 = {version = ">=1.34,<2.0.0", optional = true}
 playwright = {version = ">=1.44,<2.0.0", optional = true}
 firecrawl-py = {version = ">=4.0,<5.0.0", optional = true}
 trafilatura = {version = ">=1.9,<3.0.0", optional = true}
+pymupdf4llm = {version = ">=0.0.17,<1.0.0", optional = true}
+# Self-hosted anti-bot backend (CloakBrowser). Pinned tight to 0.3.x: it's a
+# young, fast-moving 0.x package whose launch() API changed recently, so bump
+# the minor deliberately. The pip wheel is light (httpx + playwright); the
+# ~200MB patched Chromium downloads at first launch, not at install.
+cloakbrowser = {version = ">=0.3.31,<0.4.0", optional = true}
 
 [tool.poetry.extras]
-source-archive = ["boto3", "playwright", "firecrawl-py", "trafilatura"]
+# hyperbrowser is already a core dep (used elsewhere too).
+source-archive = ["boto3", "playwright", "firecrawl-py", "trafilatura", "pymupdf4llm", "cloakbrowser"]
 
 [tool.poetry.scripts]
 source-archive = "forecasting_tools.agents_and_tools.source_archive.cli:main"