diff --git a/.env.template b/.env.template
index 7f167fd0..e96d4a48 100644
--- a/.env.template
+++ b/.env.template
@@ -17,8 +17,40 @@ METACULUS_API_BASE_URL=https://www.metaculus.com/api
# As of Jan 23rd 2025, only used for free semantic similarity calculation in Deduplicator, but defaults to OpenAI if not filled in
HUGGINGFACE_API_KEY=
-# As of Jun 10 2025, used for browser use agents
+# As of Jun 10 2025, used for browser use agents.
+# Also a fallback capture backend for the source archive (see below).
HYPERBROWSER_API_KEY=
+# --- Source archive (agents_and_tools/source_archive) -----------------------
+# Captures HTML + screenshot + markdown for every URL a bot cites. All settings
+# are optional; a blank WEB_ARCHIVE_S3_BUCKET stores captures locally, not in S3.
+WEB_ARCHIVE_S3_BUCKET=
+WEB_ARCHIVE_S3_PREFIX=source-archive
+WEB_ARCHIVE_AWS_PROFILE=
+# Set to a local capture directory to run/view the archive with no S3 (the
+# viewer reads from here when set). E.g. `capture --local ./archive`.
+WEB_ARCHIVE_LOCAL_DIR=
+WEB_ARCHIVE_TTL_DAYS=14
+# Managed fallback backends, tried after self-hosted Playwright, for the anti-bot
+# and PDF long tail. Firecrawl also parses PDFs natively (OCR fallback for PdfFetcher).
+FIRECRAWL_API_KEY=
+# Firecrawl proxy mode for hardened anti-bot sites: basic (1 credit) | auto |
+# stealth/enhanced (5 credits). Leave as "basic" unless you need Cloudflare bypass.
+WEB_ARCHIVE_FIRECRAWL_PROXY=basic
+# Hyperbrowser session knobs (proxy turns a 1-credit scrape into 10 credits).
+WEB_ARCHIVE_HYPERBROWSER_PROXY=true
+WEB_ARCHIVE_HYPERBROWSER_STEALTH=true
+WEB_ARCHIVE_HYPERBROWSER_CAPTCHA=true
+# CloakBrowser (self-hosted anti-bot Playwright fork) module, if installed
+# (`pip install cloakbrowser`). Exposes cloakbrowser.launch() -> Browser.
+WEB_ARCHIVE_CLOAKBROWSER_IMPORT=cloakbrowser
+WEB_ARCHIVE_PDF_MAX_PAGES=50
+# Operator-only: database DSN for `harvest-db` (reads a bot's cited URLs straight
+# from Postgres). libpq DSN or postgresql:// URL — e.g. a Neon connection string.
+# This DSN is a real secret. PREFER the macOS Keychain (item `metaculus-db-dsn`)
+# over this file — see the source_archive README "DSN resolution". Leave blank to
+# use the Keychain / local default.
+METACULUS_DB_DSN=
+
# Disable if in Streamlit Cloud
FILE_WRITING_ALLOWED=TRUE
diff --git a/.gitignore b/.gitignore
index 96b48188..4a05a450 100644
--- a/.gitignore
+++ b/.gitignore
@@ -179,3 +179,7 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
# .idea/
+
+# Private bot trace samples must never land in this public repo (kept locally).
+butler-traces/
+**/butler-traces/
diff --git a/_typos.toml b/_typos.toml
index 0359061b..83ba4592 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -1,6 +1,8 @@
[default]
extend-ignore-identifiers-re = [
"AttributeID.*Supress.*",
+ # Real tracking-query params stripped during URL canonicalization (not typos).
+ "oly_.*",
]
[default.extend-identifiers]
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_backends.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_backends.py
new file mode 100644
index 00000000..3adcf9be
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_backends.py
@@ -0,0 +1,266 @@
+"""Unit tests for the backup capture backends and the bake-off pricing model.
+
+These mock the vendor SDKs so they run without API keys, network, browsers, or
+the optional pymupdf/playwright/cloakbrowser packages installed.
+"""
+
+from __future__ import annotations
+
+import base64
+
+import pytest
+
+from forecasting_tools.agents_and_tools.source_archive import benchmark as B
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.fetchers import (
+ build_default_fetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError
+from forecasting_tools.agents_and_tools.source_archive.fetchers.cloakbrowser_fetcher import (
+ CloakBrowserFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.firecrawl_fetcher import (
+ FirecrawlFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.hyperbrowser_fetcher import (
+ HyperbrowserFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.pdf_fetcher import (
+ PdfFetcher,
+ looks_like_pdf,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+
+
+# --- Firecrawl proxy/stealth wiring ------------------------------------------
+def test_firecrawl_basic_sends_no_proxy_key():
+ f = FirecrawlFetcher(ArchiveConfig(firecrawl_proxy="basic"))
+ assert "proxy" not in f._scrape_kwargs(["markdown"])
+
+
+@pytest.mark.parametrize("mode", ["auto", "stealth", "enhanced"])
+def test_firecrawl_stealth_sends_proxy_key(mode):
+ f = FirecrawlFetcher(ArchiveConfig(firecrawl_proxy=mode))
+ assert f._scrape_kwargs(["markdown"])["proxy"] == mode
+
+
+def test_firecrawl_fetch_pdf_markdown():
+ class FakeClient:
+ def scrape(self, url, **kwargs):
+ assert kwargs["formats"] == ["markdown"]
+ return {"markdown": "# PDF body " + "x " * 200}
+
+ f = FirecrawlFetcher(ArchiveConfig(firecrawl_api_key="k"), client=FakeClient())
+ assert f.fetch_pdf_markdown("https://x/y.pdf").startswith("# PDF body")
+
+
+# --- Hyperbrowser screenshot coercion + result mapping -----------------------
+def test_hyperbrowser_coerce_screenshot_data_uri():
+ raw = b"\x89PNG fake"
+ uri = "data:image/png;base64," + base64.b64encode(raw).decode()
+ shot, ctype = HyperbrowserFetcher._coerce_screenshot(uri)
+ assert shot == raw and ctype == "image/png"
+
+
+def test_hyperbrowser_coerce_screenshot_bare_base64():
+ raw = b"\x89PNG fake"
+ shot, ctype = HyperbrowserFetcher._coerce_screenshot(base64.b64encode(raw).decode())
+ assert shot == raw and ctype == "image/png"
+
+
+def test_hyperbrowser_coerce_screenshot_none():
+ assert HyperbrowserFetcher._coerce_screenshot(None) == (None, None)
+
+
+def test_hyperbrowser_fetch_maps_result(monkeypatch):
+ class Data:
+ metadata = {"statusCode": 200, "title": "T", "sourceURL": "https://final"}
+ html = "ok"
+ markdown = "ok " * 100
+ screenshot = None
+
+ class Resp:
+ status = "completed"
+ error = None
+ data = Data()
+
+ class FakeClient:
+ class scrape:
+ @staticmethod
+ def start_and_wait(params):
+ return Resp()
+
+ f = HyperbrowserFetcher(
+ ArchiveConfig(hyperbrowser_api_key="k"), client=FakeClient()
+ )
+ # Avoid constructing real SDK request models in the unit test.
+ monkeypatch.setattr(f, "_params", lambda url: None)
+ result = f.fetch("https://x")
+ assert result.fetcher == "hyperbrowser"
+ assert result.final_url == "https://final"
+ assert result.status_code == 200
+ assert result.metadata["used_proxy"] is True
+
+
+def test_hyperbrowser_failed_job_raises(monkeypatch):
+ class Resp:
+ status = "failed"
+ error = "blocked"
+ data = None
+
+ class FakeClient:
+ class scrape:
+ @staticmethod
+ def start_and_wait(params):
+ return Resp()
+
+ f = HyperbrowserFetcher(
+ ArchiveConfig(hyperbrowser_api_key="k"), client=FakeClient()
+ )
+ monkeypatch.setattr(f, "_params", lambda url: None)
+ with pytest.raises(FetchError):
+ f.fetch("https://x")
+
+
+def test_hyperbrowser_requires_key():
+ with pytest.raises(FetchError):
+ HyperbrowserFetcher(ArchiveConfig(hyperbrowser_api_key=None)).fetch("https://x")
+
+
+# --- PDF fetcher -------------------------------------------------------------
+def test_looks_like_pdf():
+ assert looks_like_pdf("https://x/report.pdf")
+ assert looks_like_pdf("https://x/report.PDF?v=2")
+ assert not looks_like_pdf("https://x/report.html")
+
+
+def test_pdf_rejects_non_pdf_bytes():
+ f = PdfFetcher(
+ ArchiveConfig(),
+ downloader=lambda url, t: (b"not a pdf", url, 200),
+ )
+ with pytest.raises(FetchError):
+ f.fetch("https://x/fake.pdf")
+
+
+def test_pdf_falls_back_to_firecrawl_when_local_thin(monkeypatch):
+ class FakeFirecrawl:
+ def fetch_pdf_markdown(self, url):
+ return "# Scanned doc recovered by OCR " + "y " * 200
+
+ f = PdfFetcher(
+ ArchiveConfig(),
+ firecrawl=FakeFirecrawl(),
+ downloader=lambda url, t: (b"%PDF- minimal", url, 200),
+ )
+ # Force the local parser to look thin regardless of whether pymupdf is present.
+ monkeypatch.setattr(f, "_parse_local", lambda data: (None, None, None, 3, "none"))
+ result = f.fetch("https://x/scan.pdf")
+ assert result.metadata["pdf_engine"] == "firecrawl"
+ assert "OCR" in result.markdown
+
+
+def test_pdf_uses_local_when_text_is_rich(monkeypatch):
+ f = PdfFetcher(
+ ArchiveConfig(),
+ downloader=lambda url, t: (b"%PDF- minimal", url, 200),
+ )
+ rich = "# Title\n" + "real body text " * 100
+ monkeypatch.setattr(
+ f, "_parse_local", lambda data: (rich, b"png", "image/png", 5, "pymupdf4llm")
+ )
+ result = f.fetch("https://x/clean.pdf")
+ assert result.metadata["pdf_engine"] == "pymupdf4llm"
+ assert result.metadata["pdf_pages"] == 5
+ assert result.screenshot == b"png"
+
+
+# --- CloakBrowser ------------------------------------------------------------
+def test_cloakbrowser_missing_package_gives_clear_error(monkeypatch):
+ # Force every import to fail so this passes whether or not cloakbrowser is
+ # actually installed in the test environment.
+ import forecasting_tools.agents_and_tools.source_archive.fetchers.cloakbrowser_fetcher as cb
+
+ def _boom(name):
+ raise ImportError(name)
+
+ monkeypatch.setattr(cb.importlib, "import_module", _boom)
+ f = CloakBrowserFetcher(ArchiveConfig())
+ with pytest.raises(FetchError) as exc:
+ f._launch_browser()
+ assert "cloakbrowser" in str(exc.value).lower()
+
+
+# --- Pricing model -----------------------------------------------------------
+def test_pricing_self_host_is_floor():
+    r, floor = CaptureResult(url="u", final_url="u"), pytest.approx(0.00001)
+    assert B.estimate_cost("playwright", r, 1_000_000, B.Pricing()) == floor
+    assert B.estimate_cost("cloakbrowser", r, 1_000_000, B.Pricing()) == floor
+
+
+def test_pricing_firecrawl_basic_vs_stealth():
+ basic = CaptureResult(url="u", final_url="u", metadata={"firecrawl_proxy": "basic"})
+ stealth = CaptureResult(
+ url="u", final_url="u", metadata={"firecrawl_proxy": "auto"}
+ )
+ assert B.estimate_cost("firecrawl", basic, 0, B.Pricing()) == pytest.approx(0.00083)
+ assert B.estimate_cost(
+ "firecrawl-stealth", stealth, 0, B.Pricing()
+ ) == pytest.approx(0.00415)
+
+
+def test_pricing_hyperbrowser_proxy_includes_bandwidth():
+ r = CaptureResult(url="u", final_url="u", metadata={"used_proxy": True})
+ # 10 credits ($0.01) + 1MB * $10/GB ($0.01) = $0.02
+ assert B.estimate_cost("hyperbrowser", r, 1_000_000, B.Pricing()) == pytest.approx(
+ 0.02
+ )
+
+
+def test_pricing_pdf_local_is_free_firecrawl_is_per_page():
+ local = CaptureResult(
+ url="u", final_url="u", metadata={"pdf_engine": "pymupdf4llm"}
+ )
+ ocr = CaptureResult(
+ url="u", final_url="u", metadata={"pdf_engine": "firecrawl", "pdf_pages": 10}
+ )
+ assert B.estimate_cost("pdf", local, 0, B.Pricing()) == 0.0
+ assert B.estimate_cost("pdf", ocr, 0, B.Pricing()) == pytest.approx(0.0083)
+
+
+# --- Default tiered chain composition ----------------------------------------
+def _fake_browser():
+ from unittest.mock import MagicMock
+
+ return None, MagicMock() # (playwright_handle, browser) — browser.close() ok
+
+
+def test_default_chain_cloakbrowser_is_primary(monkeypatch):
+ # CloakBrowser available -> it is the single self-hosted browser tier.
+ monkeypatch.setattr(
+ CloakBrowserFetcher, "_launch_browser", lambda self: _fake_browser()
+ )
+ config = ArchiveConfig(hyperbrowser_api_key="h", firecrawl_api_key="f")
+ with build_default_fetcher(config) as fetcher:
+ names = [b.name for b in fetcher._tiered.backends]
+ # Note: exactly one browser tier (cloakbrowser), not vanilla + cloak.
+ assert names == ["cloakbrowser", "pdf", "hyperbrowser", "firecrawl"]
+
+
+def test_default_chain_falls_back_to_playwright_and_skips_unkeyed(monkeypatch):
+ from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import (
+ PlaywrightFetcher,
+ )
+
+ # CloakBrowser not installed -> vanilla Playwright is the browser tier.
+ def raise_unavailable(self):
+ raise FetchError("cloakbrowser not installed")
+
+ monkeypatch.setattr(CloakBrowserFetcher, "_launch_browser", raise_unavailable)
+ monkeypatch.setattr(
+ PlaywrightFetcher, "_launch_browser", lambda self: _fake_browser()
+ )
+ config = ArchiveConfig(hyperbrowser_api_key=None, firecrawl_api_key=None)
+ with build_default_fetcher(config) as fetcher:
+ names = [b.name for b in fetcher._tiered.backends]
+ assert names == ["playwright", "pdf"]
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_canonicalize.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_canonicalize.py
new file mode 100644
index 00000000..e9476409
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_canonicalize.py
@@ -0,0 +1,72 @@
+from __future__ import annotations
+
+import pytest
+
+from forecasting_tools.agents_and_tools.source_archive.canonicalize import (
+ canonicalize_url,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import url_hash
+
+# (raw, expected canonical) — each pair documents one normalization rule.
+CASES = [
+ # fragment dropped
+ ("https://a.test/x#section", "https://a.test/x"),
+ # trailing slash dropped (non-root)
+ ("https://a.test/x/", "https://a.test/x"),
+ # root path collapses (with or without slash) to host only
+ ("https://a.test/", "https://a.test"),
+ ("https://a.test", "https://a.test"),
+ # scheme + host lowercased, path case preserved
+ ("HTTPS://A.TEST/Path", "https://a.test/Path"),
+ # default ports stripped, non-default kept
+ ("http://a.test:80/x", "http://a.test/x"),
+ ("https://a.test:443/x", "https://a.test/x"),
+ ("https://a.test:8443/x", "https://a.test:8443/x"),
+ # tracking params removed, meaningful params kept
+ ("https://a.test/x?utm_source=z&utm_medium=email", "https://a.test/x"),
+ ("https://a.test/x?id=7&fbclid=abc", "https://a.test/x?id=7"),
+ ("https://a.test/x?gclid=abc&igshid=q", "https://a.test/x"),
+ # remaining params sorted (order-independent)
+ ("https://a.test/x?b=2&a=1", "https://a.test/x?a=1&b=2"),
+ # bare "ref"/"source" are intentionally preserved
+ ("https://a.test/x?ref=home", "https://a.test/x?ref=home"),
+ # combination
+ (
+ "HTTPS://A.TEST:443/Path/?b=2&utm_campaign=spring&a=1#frag",
+ "https://a.test/Path?a=1&b=2",
+ ),
+ # non-http(s) left alone
+ ("mailto:someone@a.test", "mailto:someone@a.test"),
+]
+
+
+@pytest.mark.parametrize("raw,expected", CASES)
+def test_canonicalize_cases(raw: str, expected: str):
+ assert canonicalize_url(raw) == expected
+
+
+@pytest.mark.parametrize("raw,_expected", CASES)
+def test_canonicalize_is_idempotent(raw: str, _expected: str):
+ once = canonicalize_url(raw)
+ assert canonicalize_url(once) == once
+
+
+def test_near_duplicates_share_a_url_hash():
+ variants = [
+ "https://a.test/article",
+ "https://a.test/article/",
+ "https://a.test/article#intro",
+ "https://a.test/article?utm_source=newsletter",
+ "HTTPS://A.test/article",
+ ]
+ hashes = {url_hash(v) for v in variants}
+ assert len(hashes) == 1
+
+
+def test_distinct_pages_keep_distinct_hashes():
+ assert url_hash("https://a.test/x?id=1") != url_hash("https://a.test/x?id=2")
+ assert url_hash("https://a.test/x") != url_hash("https://a.test/y")
+
+
+def test_empty_and_none_safe():
+ assert canonicalize_url("") == ""  # NOTE(review): None case is not tested
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_catalog.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_catalog.py
new file mode 100644
index 00000000..e50775ee
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_catalog.py
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive import manifest as manifest_io
+from forecasting_tools.agents_and_tools.source_archive.catalog import (
+ build_catalog,
+ write_catalog,
+)
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
+from forecasting_tools.agents_and_tools.source_archive.models import (
+ CaptureResult,
+ CitationRecord,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
+
+
+def _capture(url: str, html: str) -> CaptureResult:
+ return CaptureResult(
+ url=url,
+ final_url=url,
+ status_code=200,
+ html=html,
+ markdown="md " * 30,
+ screenshot=b"img",
+ screenshot_content_type="image/png",
+ fetcher="fake",
+ )
+
+
+def _seed(tmp_path):
+    store = LocalBlobStore(tmp_path)
+    config = ArchiveConfig(s3_prefix="t")
+    cstore = ContentStore(store, config)
+    cstore.store(_capture("https://a.test/p", "\na\n"))
+    cstore.store(_capture("https://b.test/q", "b\n"))
+    # uncaptured.test/x is cited but never captured.
+    records = [
+        CitationRecord(
+            url="https://a.test/p?utm_source=news",  # canonicalizes to /p
+            run_id="r1",
+            bot="alpha",
+            question_id="100",
+            question_url="https://www.metaculus.com/questions/100/",
+            tool_name="web_search",
+        ),
+        CitationRecord(
+            url="https://b.test/q",
+            run_id="r1",
+            bot="beta",
+            question_id="100",
+            question_url="https://www.metaculus.com/questions/100/",
+            tool_name="page_fetch",
+        ),
+        CitationRecord(
+            url="https://uncaptured.test/x",
+            run_id="r1",
+            bot="alpha",
+            question_id="100",
+        ),
+        # A data/API call made only via run_code -> excluded from the catalog.
+        CitationRecord(
+            url="https://data.test/api?fmt=csv",
+            run_id="r1",
+            bot="beta",
+            question_id="100",
+            tool_name="run_code",
+        ),
+    ]
+    manifest_io.write_blob(store, "r1", records, config)
+    return store, config
+
+
+def test_build_catalog_joins_and_canonicalizes(tmp_path):
+ store, config = _seed(tmp_path)
+ data = build_catalog(store, config)
+
+ # The two a.test variants collapse to one source; the run_code API call is
+ # excluded (tool/API call, not a page).
+ urls = {s.canonical_url for s in data.sources}
+ assert urls == {
+ "https://a.test/p",
+ "https://b.test/q",
+ "https://uncaptured.test/x",
+ }
+ assert data.excluded.get("tool_call") == 1
+ assert "https://data.test/api?fmt=csv" not in urls
+ captured = {s.canonical_url for s in data.sources if s.captured}
+ assert captured == {"https://a.test/p", "https://b.test/q"}
+
+ by_q = data.by_question()
+ assert set(by_q) == {"100"}
+ assert len(by_q["100"]) == 3
+ by_bot = data.by_bot()
+ assert set(by_bot) == {"alpha", "beta"}
+
+
+def test_write_catalog_emits_views(tmp_path):
+ store, config = _seed(tmp_path)
+ summary = write_catalog(store, config)
+
+ assert summary.sources == 3
+ assert summary.captured == 2
+ assert summary.questions == 1
+ assert summary.excluded.get("tool_call") == 1
+
+ keys = set(store.list_keys("t/catalog/"))
+ assert "t/catalog/index.html" in keys
+ assert "t/catalog/READ_ME_FIRST.html" in keys
+ assert "t/catalog/by-question/100.html" in keys
+ assert "t/catalog/by-question/100.csv" in keys
+ assert "t/catalog/by-bot/alpha.html" in keys
+ assert "t/catalog/by-domain/a.test.html" in keys
+
+ q_html = store.get("t/catalog/by-question/100.html").decode("utf-8")
+ assert "https://a.test/p" in q_html
+ assert "alpha" in q_html # bot tag present
+ # Local links are relative into the content store.
+ assert "../../content/" in q_html
+
+ q_csv = store.get("t/catalog/by-question/100.csv").decode("utf-8")
+ assert "https://uncaptured.test/x" in q_csv
+ assert "no" in q_csv # uncaptured row marked; NOTE(review): any "no" substring passes
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py
deleted file mode 100644
index 81874d80..00000000
--- a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py
+++ /dev/null
@@ -1,91 +0,0 @@
-from __future__ import annotations
-
-from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_comments import (
- MetaculusCommentHarvester,
-)
-
-
-def _leaderboard():
- return {
- "leaderboard_entries": [
- {"user": {"id": 1, "username": "botA", "is_bot": True}},
- {"user": {"id": 2, "username": "human", "is_bot": False}},
- {"user": {"id": 3, "username": "botB", "is_bot": True}},
- ]
- }
-
-
-def test_enumerate_bots_filters_non_bots():
- def fetch(path, params):
- assert path == "/leaderboards/project/123/"
- assert params["with_entries"] == "true"
- return _leaderboard()
-
- h = MetaculusCommentHarvester(fetch_json=fetch)
- bots = h.enumerate_bots(123)
- assert [b["id"] for b in bots] == [1, 3]
-
-
-def test_harvest_author_builds_records_with_provenance():
- def fetch(path, params):
- assert path == "/comments/"
- if params["offset"] == 0:
- return {
- "results": [{"id": 10, "on_post": 555, "text": "src https://a.test/x"}]
- }
- return {"results": []}
-
- h = MetaculusCommentHarvester(fetch_json=fetch)
- records = h.harvest_author(1, run_id="r1", bot="botA")
- assert len(records) == 1
- rec = records[0]
- assert rec.url == "https://a.test/x"
- assert rec.bot == "botA"
- assert rec.run_id == "r1"
- assert rec.question_id == "555"
- assert rec.question_url == "https://www.metaculus.com/questions/555/"
- assert rec.trace == "comment:10"
- assert rec.origin == "metaculus_comment"
-
-
-def test_iter_comments_paginates_until_short_page():
- calls = []
-
- def fetch(path, params):
- calls.append(params["offset"])
- if params["offset"] == 0:
- return {"results": [{"id": i, "text": ""} for i in range(100)]}
- return {"results": [{"id": 999, "text": ""}]} # short page -> stop
-
- h = MetaculusCommentHarvester(fetch_json=fetch)
- comments = list(h.iter_comments(1))
- assert len(comments) == 101
- assert calls == [0, 100]
-
-
-def test_harvest_project_aggregates_bots():
- def fetch(path, params):
- if path.startswith("/leaderboards/project/"):
- return _leaderboard()
- # one URL per bot, single page each
- if params["offset"] == 0:
- author = params["author"]
- return {
- "results": [
- {"id": author, "on_post": 1, "text": f"https://bot{author}.test"}
- ]
- }
- return {"results": []}
-
- h = MetaculusCommentHarvester(fetch_json=fetch)
- records = h.harvest_project(123)
- assert {r.url for r in records} == {"https://bot1.test", "https://bot3.test"}
- assert {r.bot for r in records} == {"botA", "botB"}
- assert all(r.run_id == "metaculus-comments-123" for r in records)
-
-
-def test_custom_base_url_drives_web_base():
- h = MetaculusCommentHarvester(
- base_url="https://example.org/api", fetch_json=lambda p, q: {"results": []}
- )
- assert h.web_base == "https://example.org"
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py
index c6f83ef3..a1c1d6c0 100644
--- a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py
@@ -15,10 +15,10 @@ def _store(tmp_path, **cfg) -> ContentStore:
return ContentStore(LocalBlobStore(tmp_path), ArchiveConfig(s3_prefix="t", **cfg))
-def _result(url: str, html: str) -> CaptureResult:
+def _result(url: str, html: str, final_url: str | None = None) -> CaptureResult:
return CaptureResult(
url=url,
- final_url=url,
+ final_url=final_url if final_url is not None else url,
status_code=200,
html=html,
markdown="md " * 50,
@@ -73,3 +73,89 @@ def test_changed_content_creates_new_capture(tmp_path):
second = store.store(_result("https://a.test", "v2 changed
"))
assert second.created is True
assert first.capture.content_hash != second.capture.content_hash
+
+
+# --- Phase B: redirect aliasing -------------------------------------------
+def test_redirect_keys_capture_by_final_url(tmp_path):
+    store = _store(tmp_path)
+    res = store.store(
+        _result("https://bit.ly/x", "dest\n", final_url="https://dest.test/page")
+    )
+    # Capture is stored under the FINAL url's hash, not the shortener's.
+    assert res.capture.url == "https://dest.test/page"
+    assert res.capture.url_hash == url_hash("https://dest.test/page")
+    # The canonical index records the cited shortener as an alias.
+    canonical = store._read_index(url_hash("https://dest.test/page"))
+    assert "https://bit.ly/x" in canonical["aliases"]
+
+
+def test_lookup_via_shortener_and_final_both_hit(tmp_path):
+    store = _store(tmp_path)
+    store.store(
+        _result("https://bit.ly/x", "dest\n", final_url="https://dest.test/page")
+    )
+    via_alias = store.lookup("https://bit.ly/x")
+    via_final = store.lookup("https://dest.test/page")
+    assert via_alias is not None and via_final is not None
+    assert via_alias.content_hash == via_final.content_hash
+    assert via_alias.url == "https://dest.test/page"
+
+
+def test_two_shorteners_to_same_page_store_once(tmp_path):
+    store = _store(tmp_path)
+    first = store.store(
+        _result("https://bit.ly/x", "same\n", final_url="https://dest.test/page")
+    )
+    second = store.store(
+        _result("https://t.co/y", "same\n", final_url="https://dest.test/page")
+    )
+    assert first.created is True
+    assert second.created is False  # identical content deduped, not re-stored
+    canonical = store._read_index(url_hash("https://dest.test/page"))
+    assert set(canonical["aliases"]) == {"https://bit.ly/x", "https://t.co/y"}
+    assert len(canonical["captures"]) == 1
+
+
+# --- Phase C: cross-URL content dedup -------------------------------------
+def test_identical_content_across_distinct_urls_reuses_blobs(tmp_path):
+    store = _store(tmp_path)
+    a = store.store(_result("https://a.test/x", "same\n"))
+    b = store.store(_result("https://b.test/y", "same\n"))
+
+    # Both are real captures (each URL has its own index entry)...
+    assert a.created is True and b.created is True
+    # ...but B reuses A's blobs instead of writing its own.
+    assert a.capture.content_alias_of is None
+    assert b.capture.content_alias_of == url_hash("https://a.test/x")
+    assert b.capture.html_key == a.capture.html_key
+
+    # No duplicate blob was written under B's url hash.
+    b_own_key = (
+        f"t/content/{url_hash('https://b.test/y')}/{b.capture.content_hash}.html"
+    )
+    assert not store.blobs.exists(b_own_key)
+    assert store.blobs.exists(a.capture.html_key)
+
+
+def test_content_reverse_index_tracks_members(tmp_path):
+    store = _store(tmp_path)
+    store.store(_result("https://a.test/x", "same\n"))
+    store.store(_result("https://b.test/y", "same\n"))
+
+    ch = store.store(_result("https://c.test/z", "same\n")).capture.content_hash
+    reverse = store._read_content_index(ch)
+    assert reverse["canonical_url_hash"] == url_hash("https://a.test/x")
+    member_hashes = {m["url_hash"] for m in reverse["members"]}
+    assert member_hashes == {
+        url_hash("https://a.test/x"),
+        url_hash("https://b.test/y"),
+        url_hash("https://c.test/z"),
+    }
+
+
+def test_different_content_not_aliased(tmp_path):
+    store = _store(tmp_path)
+    a = store.store(_result("https://a.test/x", "one\n"))
+    b = store.store(_result("https://b.test/y", "two different\n"))
+    assert b.capture.content_alias_of is None
+    assert b.capture.html_key != a.capture.html_key
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_coverage.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_coverage.py
new file mode 100644
index 00000000..155d3772
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_coverage.py
@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.catalog import Citation, Source
+from forecasting_tools.agents_and_tools.source_archive.coverage import (
+ coverage_from_sources,
+)
+
+
+def _src(url, domain, captured, cits):
+ return Source(canonical_url=url, domain=domain, captured=captured, citations=cits)
+
+
+def _trace(bot, q, tool):
+ return Citation(bot=bot, question_id=q, tool_name=tool, origin="tool_result")
+
+
+def _comment(bot, q):
+ return Citation(bot=bot, question_id=q, origin="metaculus_comment")
+
+
+SOURCES = [
+ _src(
+ "https://a.test/1",
+ "a.test",
+ True,
+ [_trace("template", "100", "scrape_webpage")],
+ ),
+ _src(
+ "https://b.test/2",
+ "b.test",
+ False,
+ [_trace("template", "100", "scrape_webpage")],
+ ),
+ _src("https://c.test/3", "c.test", True, [_comment("otherbot", "200")]),
+ # run_code-only -> excluded as a tool/API call
+ _src(
+ "https://data.test/x",
+ "data.test",
+ False,
+ [_trace("template", "100", "run_code")],
+ ),
+ # search-engine result page -> excluded as a non-source
+ _src(
+ "https://www.google.com/search?q=x",
+ "google.com",
+ False,
+ [_trace("template", "100", "scrape_webpage")],
+ ),
+ # malformed (extractor junk) -> excluded
+ _src(
+ "https://a.test/y%5B1%5D",
+ "a.test",
+ False,
+ [_trace("template", "100", "scrape_webpage")],
+ ),
+]
+
+
+def test_trace_report_excludes_non_sources_and_counts_pages():
+ r = coverage_from_sources(SOURCES, "trace")
+ assert r.cited == 2 # a.test/1 + b.test/2 (data/search/malformed excluded)
+ assert r.captured == 1
+ assert r.pct == 50.0
+ assert r.excluded == {"tool_call": 1, "search": 1, "malformed": 1}
+ assert r.missing == 1
+ assert r.missing_urls == ["https://b.test/2"]
+
+ by_q = {row.label: (row.cited, row.captured) for row in r.by_question}
+ assert by_q == {"100": (2, 1)}
+ by_tool = {row.label: (row.cited, row.captured) for row in r.by_tool}
+ assert by_tool == {"scrape_webpage": (2, 1)}
+ missed = {row.label for row in r.missed_by_domain}
+ assert missed == {"b.test"}
+
+
+def test_comment_report_is_separate():
+ r = coverage_from_sources(SOURCES, "comments")
+ assert r.cited == 1 # only the metaculus_comment source
+ assert r.captured == 1
+ assert r.pct == 100.0
+ assert {row.label for row in r.by_bot} == {"otherbot"}
+
+
+def test_modes_do_not_bleed():
+ trace = coverage_from_sources(SOURCES, "trace")
+ comments = coverage_from_sources(SOURCES, "comments")
+ assert "https://c.test/3" not in trace.missing_urls # comment source not in trace
+ # the trace bot never appears in the comment report
+ assert "template" not in {row.label for row in comments.by_bot}
+
+
+def test_csv_export_has_overall_row():
+ csv_text = coverage_from_sources(SOURCES, "trace").to_csv()
+ assert "group,label,cited,captured,pct" in csv_text
+ assert "overall,trace,2,1,50.0" in csv_text
+
+
+def test_outcomes_split_never_fetched_vs_failed():
+ # b.test/2 is the only missing page source. With no outcome for it, it's a
+ # pure collection gap (never fetched).
+ r = coverage_from_sources(SOURCES, "trace", {"https://a.test/1": "stored"})
+ assert r.has_outcomes is True
+ assert r.missing_never_fetched == 1
+ assert r.missing_fetch_failed == 0
+
+ # If a run report shows b.test/2 was fetched and failed, it's a capture
+ # problem, not a collection gap.
+ r2 = coverage_from_sources(SOURCES, "trace", {"https://b.test/2": "error"})
+ assert r2.missing_never_fetched == 0
+ assert r2.missing_fetch_failed == 1
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_metaculus_db.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_metaculus_db.py
new file mode 100644
index 00000000..54cab175
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_metaculus_db.py
@@ -0,0 +1,124 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_db import (
+ LOCAL_DEFAULT_DSN,
+ MetaculusDbHarvester,
+ resolve_dsn,
+)
+
+
+def test_harvest_post_builds_records_with_provenance():
+ rows = [
+ {
+ "comment_id": 1,
+ "on_post_id": 42,
+ "text": "see https://a.test/x and https://b.test/y",
+ "username": "alpha",
+ "author_id": 7,
+ },
+ {
+ "comment_id": 2,
+ "on_post_id": 42,
+ "text": "https://a.test/x again",
+ "username": "beta",
+ "author_id": 8,
+ },
+ ]
+ seen = {}
+
+ def query(sql, params):
+ seen["sql"], seen["params"] = sql, params
+ return rows
+
+ records = MetaculusDbHarvester(query).harvest_post(42)
+
+ assert seen["params"] == (42,)
+ assert {r.url for r in records} == {"https://a.test/x", "https://b.test/y"}
+ r0 = next(r for r in records if r.url == "https://a.test/x")
+ assert r0.origin == "metaculus_comment"
+ assert r0.question_id == "42"
+ assert r0.question_url == "https://www.metaculus.com/questions/42/"
+ assert r0.bot in ("alpha", "beta")
+ # one record per (URL, comment): a.test/x is cited in both comments
+ assert sum(r.url == "https://a.test/x" for r in records) == 2
+
+
+def test_harvest_recent_passes_days_and_limit():
+    seen = {}  # captures what the harvester hands to the injected query callable
+
+    def query(sql, params):
+        seen["sql"], seen["params"] = sql, params
+        return []
+
+    MetaculusDbHarvester(query).harvest_recent(days=3, limit=50)
+    assert seen["params"] == (3, 50)
+    assert "limit %s" in seen["sql"].lower()  # case-insensitive, like the uncapped test
+
+
+def test_harvest_recent_uncapped_by_default():
+ seen = {}
+
+ def query(sql, params):
+ seen["sql"], seen["params"] = sql, params
+ return []
+
+ # A daily sweep wants every row from the latest day, not a 1000-row cap.
+ MetaculusDbHarvester(query).harvest_recent(days=1)
+ assert seen["params"] == (1,)
+ assert "limit" not in seen["sql"].lower()
+
+
+def test_includes_private_bot_comments_by_default():
+ seen = {}
+
+ def query(sql, params):
+ seen["sql"] = sql
+ return []
+
+ # The day-behind replica's value is the now-private bot reasoning, so the
+ # default read must NOT filter private rows out.
+ MetaculusDbHarvester(query).harvest_recent(days=1)
+ assert "is_private" not in seen["sql"]
+ assert "u.is_bot" in seen["sql"]
+
+
+def test_public_only_filters_private_comments():
+ seen = {}
+
+ def query(sql, params):
+ seen["sql"] = sql
+ return []
+
+ MetaculusDbHarvester(query).harvest_post(42, include_private=False)
+ assert "not c.is_private" in seen["sql"]
+
+
+def test_resolve_dsn_prefers_explicit_then_env_then_keychain():
+ # explicit flag wins over everything
+ assert (
+ resolve_dsn(
+ "postgresql://flag",
+ env={"METACULUS_DB_DSN": "postgresql://env"},
+ keychain_reader=lambda: "postgresql://kc",
+ )
+ == "postgresql://flag"
+ )
+ # then the env var
+ assert (
+ resolve_dsn(
+ None,
+ env={"METACULUS_DB_DSN": "postgresql://env"},
+ keychain_reader=lambda: "postgresql://kc",
+ )
+ == "postgresql://env"
+ )
+ # then the keychain (the private path)
+ assert (
+ resolve_dsn(None, env={}, keychain_reader=lambda: "postgresql://kc")
+ == "postgresql://kc"
+ )
+
+
+def test_resolve_dsn_falls_back_to_local_default():
+ # nothing configured and no keychain item -> local dev DB, not a crash
+ assert resolve_dsn(None, env={}, keychain_reader=lambda: None) == LOCAL_DEFAULT_DSN
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py
index 033d1689..fa87838d 100644
--- a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py
@@ -4,7 +4,10 @@
from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
-from forecasting_tools.agents_and_tools.source_archive.pipeline import CapturePipeline
+from forecasting_tools.agents_and_tools.source_archive.pipeline import (
+ CapturePipeline,
+ capture_urls_concurrent,
+)
from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
@@ -15,6 +18,114 @@ def _pipeline(tmp_path, fetcher) -> CapturePipeline:
return CapturePipeline(fetcher, store)
+def test_capture_urls_concurrent_captures_all(tmp_path, make_fetcher):
+ from contextlib import contextmanager
+
+ config = ArchiveConfig(s3_prefix="t", concurrency=4)
+ store = ContentStore(LocalBlobStore(tmp_path), config)
+ urls = [f"https://s{i}.test/p" for i in range(12)]
+
+ @contextmanager
+ def factory(_cfg):
+ f = make_fetcher()
+ for u in urls:
+ f.add(u)
+ yield f
+
+ summary = capture_urls_concurrent(urls, store, config, factory)
+
+ assert len(summary.outcomes) == 12
+ assert summary.count("stored") == 12
+ # every URL is resolvable afterwards (proves the shared store got all writes)
+ assert all(store.lookup(u) is not None for u in urls)
+
+
+def test_concurrent_supervisor_recovers_a_stuck_worker(tmp_path, make_fetcher):
+ import threading
+ from contextlib import contextmanager
+
+ config = ArchiveConfig(s3_prefix="t", concurrency=1)
+ store = ContentStore(LocalBlobStore(tmp_path), config)
+ urls = ["https://stuck.test/x"]
+ reaped = threading.Event()
+ builds = {"n": 0}
+
+ class _Wedges:
+ name = "wedge"
+
+ def fetch(self, url):
+ # Block until the supervisor's reaper "kills the browser", then surface
+ # the dead-browser error a killed Chromium would raise.
+ reaped.wait(5)
+ raise RuntimeError("Target page, context or browser has been closed")
+
+ @contextmanager
+ def factory(_cfg):
+ builds["n"] += 1
+ if builds["n"] == 1:
+ yield _Wedges() # first browser wedges
+ else:
+ fetcher = make_fetcher()
+ fetcher.add(urls[0])
+ yield fetcher # rebuilt browser works
+
+ # Inject a fake reaper so the test drives the supervisor without real Chromium.
+ summary = capture_urls_concurrent(
+ urls, store, config, factory, per_url_timeout=0.3, reaper=reaped.set
+ )
+
+ assert builds["n"] == 2 # stalled -> reaped -> death -> rebuild -> retry
+ assert summary.count("stored") == 1 # recovered and captured on a fresh browser
+
+
+def test_concurrent_restarts_browser_after_death(tmp_path, make_fetcher):
+ from contextlib import contextmanager
+
+ config = ArchiveConfig(s3_prefix="t", concurrency=1)
+ store = ContentStore(LocalBlobStore(tmp_path), config)
+ urls = ["https://a.test/x"]
+ builds = {"n": 0}
+
+ class _DeadBrowser:
+ name = "dead"
+
+ def fetch(self, url):
+ raise RuntimeError("Target page, context or browser has been closed")
+
+ @contextmanager
+ def factory(_cfg):
+ builds["n"] += 1
+ if builds["n"] == 1:
+ yield _DeadBrowser() # first browser is dead
+ else:
+ fetcher = make_fetcher()
+ fetcher.add(urls[0])
+ yield fetcher # rebuilt browser works
+
+ summary = capture_urls_concurrent(urls, store, config, factory)
+
+ assert builds["n"] == 2 # detected death, rebuilt once
+ assert summary.count("stored") == 1 # retry on the fresh browser succeeded
+
+
+class _BoomFetcher:
+ """Raises an unexpected (non-FetchError) exception, like a bad screenshot."""
+
+ name = "boom"
+
+ def fetch(self, url):
+ raise ValueError("kaboom")
+
+
+def test_pipeline_isolates_unexpected_fetcher_errors(tmp_path):
+ # One pathological URL must not abort the whole run.
+ pipe = _pipeline(tmp_path, _BoomFetcher())
+ summary = pipe.run(["https://a.test", "https://b.test"])
+ assert summary.count("error") == 2
+ assert len(summary.outcomes) == 2
+ assert all(o.reason.startswith("unexpected:") for o in summary.outcomes)
+
+
def test_manifest_roundtrip_and_unique_urls():
records = [
CitationRecord(url="https://a.test", run_id="r1", bot="b", tool_name="search"),
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reindex.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reindex.py
new file mode 100644
index 00000000..82e5f5b7
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reindex.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import json
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.reindex import (
+ analyze,
+ rebuild_content_index,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
+
+
+def _put_index(store, key: str, body: dict) -> None:
+ store.put(f"t/index/{key}.json", json.dumps(body).encode("utf-8"))
+
+
+def _canonical(url: str, content_hash: str) -> dict:
+ return {
+ "url": url,
+ "url_hash": f"hash_of_{url}",
+ "latest_content_hash": content_hash,
+ "captures": {
+ content_hash: {
+ "url": url,
+ "url_hash": f"hash_of_{url}",
+ "content_hash": content_hash,
+ "html_key": f"t/content/hash_of_{url}/{content_hash}.html",
+ }
+ },
+ }
+
+
+def _seed(tmp_path) -> tuple[LocalBlobStore, ArchiveConfig]:
+ store = LocalBlobStore(tmp_path)
+ config = ArchiveConfig(s3_prefix="t")
+ # Legacy rows stored under raw hashing: two URLs that now canonicalize equal.
+ _put_index(store, "h1", _canonical("https://x.test/p?utm_source=news", "c1"))
+ _put_index(store, "h2", _canonical("https://x.test/p", "c2"))
+ # Two distinct URLs with byte-identical content (same latest hash).
+ _put_index(store, "h3", _canonical("https://a.test/1", "cX"))
+ _put_index(store, "h4", _canonical("https://b.test/2", "cX"))
+ # Same host+path, meaningful query differs -> Phase D candidate.
+ _put_index(store, "h5", _canonical("https://q.test/item?id=1", "n1"))
+ _put_index(store, "h6", _canonical("https://q.test/item?id=2", "n2"))
+ # An alias (redirect) index -> counted but not a capture.
+ _put_index(store, "h7", {"url": "https://bit.ly/z", "alias_of": "hash_of_x"})
+ return store, config
+
+
+def test_analyze_reports_all_three_lenses(tmp_path):
+    store, config = _seed(tmp_path)
+    report = analyze(store, config)
+
+    assert report.total_url_indexes == 7
+    assert report.alias_indexes == 1
+    assert report.canonical_captures == 6
+
+    canon_keys = {c.key for c in report.canonicalization_clusters}
+    assert "https://x.test/p" in canon_keys
+
+    content_urls = {tuple(sorted(c.urls)) for c in report.content_clusters}  # member order is not the contract
+    assert ("https://a.test/1", "https://b.test/2") in content_urls
+
+    near_keys = {c.key for c in report.near_dup_clusters}
+    assert "https://q.test/item" in near_keys
+
+
+def test_analyze_ignores_reverse_content_index(tmp_path):
+ store, config = _seed(tmp_path)
+ # A by-content reverse index must not be mistaken for a URL index.
+ store.put(
+ "t/index/by-content/cX.json",
+ json.dumps({"content_hash": "cX", "canonical_url_hash": "x"}).encode("utf-8"),
+ )
+ report = analyze(store, config)
+ assert report.total_url_indexes == 7 # unchanged
+
+
+def test_rebuild_content_index_is_dry_by_default(tmp_path):
+ store, config = _seed(tmp_path)
+ groups = rebuild_content_index(store, config, apply=False)
+ assert groups >= 1
+ # Dry run wrote nothing under by-content/.
+ assert not list(store.list_keys("t/index/by-content/"))
+
+ rebuild_content_index(store, config, apply=True)
+ assert list(store.list_keys("t/index/by-content/"))
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reports.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reports.py
new file mode 100644
index 00000000..a3248fa8
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reports.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.pipeline import (
+ CaptureOutcome,
+ PipelineSummary,
+)
+from forecasting_tools.agents_and_tools.source_archive.reports import (
+ read_outcomes,
+ write_run_report,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
+
+
+def test_run_report_roundtrip_canonicalizes(tmp_path):
+ store = LocalBlobStore(tmp_path)
+ config = ArchiveConfig(s3_prefix="t")
+ summary = PipelineSummary(
+ outcomes=[
+ CaptureOutcome(url="https://a.test/p?utm_source=x", status="stored"),
+ CaptureOutcome(url="https://b.test/q", status="error", reason="cloudflare"),
+ ]
+ )
+ write_run_report(store, "r1", summary, config)
+
+ out = read_outcomes(store, config)
+ # keys are canonicalized (tracking param stripped)
+ assert out["https://a.test/p"] == "stored"
+ assert out["https://b.test/q"] == "error"
+
+
+def test_captured_status_wins_over_failure(tmp_path):
+    store = LocalBlobStore(tmp_path)
+    config = ArchiveConfig(s3_prefix="t")
+    write_run_report(
+        store,
+        "early",  # the successful capture is reported first ...
+        PipelineSummary(
+            outcomes=[CaptureOutcome(url="https://a.test", status="stored")]
+        ),
+        config,
+    )
+    write_run_report(
+        store,
+        "later",  # ... and the failure last, so plain "last report wins" would fail here
+        PipelineSummary(
+            outcomes=[CaptureOutcome(url="https://a.test", status="error")]
+        ),
+        config,
+    )
+    assert read_outcomes(store, config)["https://a.test"] == "stored"
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_screenshot_encoding.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_screenshot_encoding.py
new file mode 100644
index 00000000..d357982b
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_screenshot_encoding.py
@@ -0,0 +1,67 @@
+"""Tests for screenshot encoding + the height cap.
+
+Regression guard for a silent truncation bug: the height cap used to be applied
+via Playwright's ``clip`` *without* ``full_page``, which is bounded by the
+viewport and chopped tall pages down to a single screen. The cap is now enforced
+by cropping the full-page render in Pillow — these tests pin that behavior.
+"""
+
+from __future__ import annotations
+
+import io
+
+import pytest
+
+from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import (
+ _encode_screenshot,
+)
+
+Image = pytest.importorskip("PIL.Image")
+
+
+def _png(width: int, height: int) -> bytes:
+ out = io.BytesIO()
+ Image.new("RGB", (width, height), (255, 0, 0)).save(out, format="PNG")
+ return out.getvalue()
+
+
+def test_tall_page_cropped_to_max_height():
+ data, ct = _encode_screenshot(_png(1280, 12000), "webp", max_height=4000)
+ assert ct == "image/webp"
+ img = Image.open(io.BytesIO(data))
+ assert img.size == (1280, 4000) # cropped to the cap, full width preserved
+
+
+def test_short_page_not_cropped():
+ data, _ = _encode_screenshot(_png(1280, 3000), "webp", max_height=20000)
+ assert Image.open(io.BytesIO(data)).size == (1280, 3000) # untouched
+
+
+def test_webp_clamped_to_format_limit_even_without_cap():
+ # WebP cannot exceed 16383px; an over-tall page must crop, not crash.
+ data, _ = _encode_screenshot(_png(1280, 25000), "webp", max_height=0)
+ assert Image.open(io.BytesIO(data)).size == (1280, 16383)
+
+
+def test_webp_cap_above_format_limit_is_clamped():
+    # A configured cap above WebP's 16383px limit still degrades safely to 16383.
+    data, _ = _encode_screenshot(_png(1280, 18000), "webp", max_height=17000)
+    assert Image.open(io.BytesIO(data)).height == 16383
+
+
+def test_png_keeps_full_height_uncapped():
+ # PNG has no such limit, so max_height=0 preserves the whole render.
+ data, _ = _encode_screenshot(_png(1280, 20000), "png", max_height=0)
+ assert Image.open(io.BytesIO(data)).size == (1280, 20000)
+
+
+def test_webp_is_real_webp():
+ data, ct = _encode_screenshot(_png(800, 600), "webp")
+ assert ct == "image/webp"
+ assert data[:4] == b"RIFF" and data[8:12] == b"WEBP"
+
+
+def test_jpeg_format():
+ data, ct = _encode_screenshot(_png(800, 600), "jpeg")
+ assert ct == "image/jpeg"
+ assert Image.open(io.BytesIO(data)).format == "JPEG"
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_trace_extraction.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_trace_extraction.py
new file mode 100644
index 00000000..f3555523
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_trace_extraction.py
@@ -0,0 +1,215 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.trace_extraction import (
+ extract_records_from_events,
+ extract_records_from_question_dir,
+ extract_records_from_trace_file,
+ harvest_run,
+ trace_label,
+)
+
+
+def test_trace_label_strips_prefix_and_suffix():
+ assert trace_label("/x/traces_forecast_1_attempt_1.jsonl") == "forecast_1_attempt_1"
+ assert trace_label("traces_summarize.jsonl") == "summarize"
+
+
+def test_tool_call_carries_query_and_tool_args():
+ events = [
+ {
+ "type": "tool_call",
+ "call_id": "c1",
+ "name": "search_online",
+ "args": {"query": "uk election polls", "max_results": 5},
+ }
+ ]
+ records = extract_records_from_events(events, trace="forecast_1")
+ # No URL in the args -> nothing emitted from the tool_call itself.
+ assert records == []
+
+
+def test_tool_result_attributed_to_originating_call():
+ events = [
+ {
+ "type": "tool_call",
+ "call_id": "c1",
+ "name": "search_online",
+ "args": {"query": "uk election polls"},
+ },
+ {
+ "type": "tool_result",
+ "call_id": "c1",
+ "content": "Top hit: [poll](https://a.test/poll) and https://b.test/x",
+ "timestamp": "2026-05-12T12:00:00+00:00",
+ },
+ ]
+ records = extract_records_from_events(events, trace="forecast_1", bot="template")
+ assert [r.url for r in records] == ["https://a.test/poll", "https://b.test/x"]
+ rec = records[0]
+ assert rec.origin == "tool_result"
+ assert rec.tool_name == "search_online"
+ assert rec.query == "uk election polls"
+ assert rec.tool_args == {"query": "uk election polls"}
+ assert rec.trace == "forecast_1"
+ assert rec.bot == "template"
+ assert rec.first_seen == "2026-05-12T12:00:00+00:00"
+
+
+def test_query_from_list_args():
+ events = [
+ {
+ "type": "tool_call",
+ "call_id": "c1",
+ "name": "s",
+ "args": {"queries": ["a", "b"]},
+ },
+ {"type": "tool_result", "call_id": "c1", "content": "https://a.test/x"},
+ ]
+ records = extract_records_from_events(events, trace="t")
+ assert records[0].query == "a b"
+
+
+def test_url_directly_in_tool_call_args():
+ events = [
+ {
+ "type": "tool_call",
+ "call_id": "c1",
+ "name": "fetch_page",
+ "args": {"url": "https://a.test/page"},
+ }
+ ]
+ records = extract_records_from_events(events, trace="t")
+ assert len(records) == 1
+ assert records[0].url == "https://a.test/page"
+ assert records[0].origin == "tool_call"
+ assert records[0].tool_name == "fetch_page"
+ assert records[0].tool_args == {"url": "https://a.test/page"}
+
+
+def test_initial_prompt_only_scanned_when_enabled():
+ events = [
+ {"type": "initial_prompt", "prompt": "background: https://a.test/bg"},
+ ]
+ assert extract_records_from_events(events, trace="forecast_1") == []
+ records = extract_records_from_events(
+ events, trace="summarize", include_initial_prompt=True
+ )
+ assert [r.url for r in records] == ["https://a.test/bg"]
+ assert records[0].origin == "initial_prompt"
+ assert records[0].tool_name == ""
+
+
+def test_non_dict_events_skipped():
+ events = ["garbage", None, {"type": "tool_result", "content": "https://a.test/x"}]
+ records = extract_records_from_events(events, trace="t")
+ assert [r.url for r in records] == ["https://a.test/x"]
+
+
+def _write_jsonl(path: Path, events: list[dict]) -> None:
+ path.write_text("\n".join(json.dumps(e) for e in events), encoding="utf-8")
+
+
+def test_trace_file_uses_summarize_rule(tmp_path: Path):
+ f = tmp_path / "traces_summarize.jsonl"
+ _write_jsonl(f, [{"type": "initial_prompt", "prompt": "see https://a.test/r"}])
+ records = extract_records_from_trace_file(str(f), run_id="run1", bot="template")
+ assert [r.url for r in records] == ["https://a.test/r"]
+ assert records[0].trace == "summarize"
+ assert records[0].run_id == "run1"
+
+
+def test_trace_file_skips_blank_and_bad_lines(tmp_path: Path):
+ f = tmp_path / "traces_forecast_1.jsonl"
+ f.write_text(
+ '\n{"type": "tool_result", "content": "https://a.test/x"}\nnot json\n',
+ encoding="utf-8",
+ )
+ records = extract_records_from_trace_file(str(f))
+ assert [r.url for r in records] == ["https://a.test/x"]
+
+
+def test_question_dir_reads_metadata_and_builds_url(tmp_path: Path):
+ qdir = tmp_path / "q_123"
+ qdir.mkdir()
+ (qdir / "question.json").write_text(
+ json.dumps({"question_id": "metac_123", "metaculus_id": 123}),
+ encoding="utf-8",
+ )
+ _write_jsonl(
+ qdir / "traces_forecast_1.jsonl",
+ [{"type": "tool_result", "content": "https://a.test/x"}],
+ )
+ records = extract_records_from_question_dir(
+ str(qdir), run_id="run1", bot="template"
+ )
+ assert len(records) == 1
+ rec = records[0]
+ assert rec.question_id == "metac_123"
+ assert rec.metaculus_id == "123"
+ assert rec.question_url == "https://www.metaculus.com/questions/123/"
+
+
+def test_question_dir_without_metadata_still_emits(tmp_path: Path):
+ qdir = tmp_path / "q_x"
+ qdir.mkdir()
+ _write_jsonl(
+ qdir / "traces_forecast_1.jsonl",
+ [{"type": "tool_result", "content": "https://a.test/x"}],
+ )
+ records = extract_records_from_question_dir(str(qdir))
+ assert [r.url for r in records] == ["https://a.test/x"]
+ assert records[0].question_id is None
+ assert records[0].question_url is None
+
+
+def test_harvest_run_walks_bot_and_question_dirs(tmp_path: Path):
+ run = tmp_path / "run_demo"
+ qdir = run / "bot_template" / "q_1"
+ qdir.mkdir(parents=True)
+ (qdir / "question.json").write_text(
+ json.dumps({"metaculus_id": 1}), encoding="utf-8"
+ )
+ _write_jsonl(
+ qdir / "traces_forecast_1.jsonl",
+ [{"type": "tool_result", "content": "https://a.test/x"}],
+ )
+ records = harvest_run(str(run))
+ assert len(records) == 1
+ rec = records[0]
+ assert rec.run_id == "run_demo"
+ assert rec.bot == "template"
+ assert rec.metaculus_id == "1"
+
+
+def test_harvest_run_flat_layout_without_bot_dirs(tmp_path: Path):
+    # Butler-style: <run>/<question_dir>/traces_*.jsonl with no bot_* grouping.
+ run = tmp_path / "s3_backfill"
+ qdir = run / "2026-05-20_metac_43538"
+ qdir.mkdir(parents=True)
+ (qdir / "question.json").write_text(
+ json.dumps({"metaculus_id": 43538}), encoding="utf-8"
+ )
+ _write_jsonl(
+ qdir / "traces_forecast_1.jsonl",
+ [{"type": "tool_result", "content": "https://a.test/x"}],
+ )
+ records = harvest_run(str(run), bot="butler")
+ assert len(records) == 1
+ rec = records[0]
+ assert rec.bot == "butler" # the flat-layout bot override
+ assert rec.metaculus_id == "43538" # still read from question.json
+
+
+def test_harvest_run_flat_layout_defaults_bot_to_run_name(tmp_path: Path):
+ run = tmp_path / "myrun"
+ qdir = run / "q_only"
+ qdir.mkdir(parents=True)
+ _write_jsonl(
+ qdir / "traces_x.jsonl",
+ [{"type": "tool_result", "content": "https://a.test/y"}],
+ )
+ records = harvest_run(str(run)) # no bot= -> defaults to run dir name
+ assert [r.bot for r in records] == ["myrun"]
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py
index e018af77..443578bb 100644
--- a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py
@@ -38,6 +38,25 @@ def test_dedupes_preserving_order():
assert extract_urls(text) == ["https://a.test", "https://b.test"]
+def test_strips_trailing_backslash_escape_residue():
+ # Markdown often leaves a trailing backslash, e.g. "Zaporizhzhia\"
+ assert extract_urls("see https://a.test/search?q=Zaporizhzhia\\ ok") == [
+ "https://a.test/search?q=Zaporizhzhia"
+ ]
+
+
+def test_cuts_markdown_reference_tail_and_keeps_both_urls():
+ # The bare scan can glue ")[10](other)" onto a real URL; the tail is cut so
+ # the first URL is clean, and the genuinely-separate second URL (itself a
+ # valid markdown link) is still extracted. Order follows pattern precedence
+ # (markdown links before bare URLs), so compare as a set.
+ text = "https://a.test/story?id=123)[10](https://b.test/other)"
+ assert set(extract_urls(text)) == {
+ "https://a.test/story?id=123",
+ "https://b.test/other",
+ }
+
+
def test_ignores_non_http_and_empty():
assert extract_urls("ftp://a.test mailto:x@y.test nope") == []
assert extract_urls(None) == []
diff --git a/forecasting_tools/agents_and_tools/source_archive/README.md b/forecasting_tools/agents_and_tools/source_archive/README.md
index 4eb2d9ef..fbcb34cf 100644
--- a/forecasting_tools/agents_and_tools/source_archive/README.md
+++ b/forecasting_tools/agents_and_tools/source_archive/README.md
@@ -43,6 +43,10 @@ Configuration is read from the environment (see the project `.env.template`):
| `WEB_ARCHIVE_AWS_PROFILE` | Named AWS profile (e.g. an SSO profile). | default chain |
| `WEB_ARCHIVE_TTL_DAYS` | Days before a cached capture is refetched. | `14` |
| `FIRECRAWL_API_KEY` | Enables the Firecrawl fallback. | — (fallback off) |
+| `WEB_ARCHIVE_FIRECRAWL_PROXY` | Firecrawl proxy mode for hardened sites: `basic` (1 credit) / `auto` / `stealth` (5 credits). | `basic` |
+| `HYPERBROWSER_API_KEY` | Enables the Hyperbrowser managed fallback. | — (off) |
+| `WEB_ARCHIVE_CLOAKBROWSER_IMPORT` | Module exposing CloakBrowser's `launch()`. | `cloakbrowser` |
+| `WEB_ARCHIVE_PDF_MAX_PAGES` | Cap on PDF pages parsed per document. | `50` |
AWS credentials use the standard AWS resolution chain — environment variables, a
shared config file, or an SSO profile. Nothing secret is committed or baked into
@@ -87,13 +91,131 @@ source-archive capture run.jsonl --local ./archive
# Capture and upload to S3 (uses WEB_ARCHIVE_S3_BUCKET), plus the manifest itself
source-archive capture run.jsonl --upload-manifest --run-id 2026-06-01_demo
+# Skip the Hyperbrowser fallback this run; failures are written to a retry
+# manifest so you can come back to just those sites later (e.g. with it on).
+source-archive capture run.jsonl --no-hyperbrowser --run-id demo
+source-archive capture demo_needs_retry.jsonl --run-id demo # later, hyperbrowser on
+
# Build a manifest by harvesting the URLs bots cited on a Metaculus tournament
source-archive harvest 32506 --out run.jsonl
```
+Because a failed fetch leaves no cache entry while a success does, re-running the
+same manifest only re-attempts the failures — the retry manifest just makes that
+explicit and fast (it skips the already-captured majority).
+
`source-archive` is installed by the extra; the equivalent module form is
`python -m forecasting_tools.agents_and_tools.source_archive.cli`.
+## Backup backends & the bake-off
+
+A self-hosted browser is the primary backend and captures ~70% of URLs at near-zero cost,
+but two kinds of URL fall through it: **anti-bot/Cloudflare** pages (it detects
+the block but can't get past it) and **PDFs** (Chromium downloads them instead of
+rendering, so nothing is captured). The package ships these backups, ordered by
+marginal cost so the cheap tiers absorb most of the tail:
+
+| Backend | Cost (2026) | Closes | Notes |
+| --- | --- | --- | --- |
+| `CloakBrowserFetcher` | ~$0/page (self-host) | Cloudflare | **The primary browser tier when installed** (`pip install cloakbrowser`): patched Chromium that beat vanilla Playwright on Cloudflare in 2026 benchmarks. Only one browser runs — cloak *replaces* vanilla Playwright (two `sync_playwright` instances conflict in one process), falling back to vanilla when cloak isn't installed. |
+| `PdfFetcher` | $0 local; ~$0.0008/pg OCR | PDFs | PyMuPDF4LLM locally, falls back to Firecrawl OCR on scanned PDFs. |
+| `FirecrawlFetcher` | $0.0008 basic / $0.0042 stealth | Cloudflare + PDFs | Native PDF parser; `WEB_ARCHIVE_FIRECRAWL_PROXY=stealth` for hardened sites. |
+| `HyperbrowserFetcher` | $0.001 basic / $0.01 proxy | Cloudflare | Consolidates spend onto a vendor already used elsewhere. No PDF support. |
+
+Selenium was evaluated and **rejected**: it drives the same Chromium as
+Playwright, so it bypasses nothing Playwright can't, and its stealth ecosystem
+(`undetected-chromedriver`) is now legacy. CloakBrowser/Patchright/nodriver are
+the credible self-hosted upgrades.
+
+To decide which backup(s) to wire in, run the bake-off — it runs each selected
+backend independently over the same URLs (not tiered) and reports reliability,
+latency, and estimated cost per backend, broken down by category:
+
+```bash
+python -m forecasting_tools.agents_and_tools.source_archive.benchmark \
+ --manifest forecasting_tools/agents_and_tools/source_archive/benchmarks/sample_urls.jsonl \
+ --backends playwright,cloakbrowser,firecrawl,firecrawl-stealth,hyperbrowser,pdf \
+ --out bench.csv
+```
+
+Backends whose API key or dependency is missing are skipped cleanly. Cost
+figures are model estimates (see `PRICING` in `benchmark.py`); tune the credit
+rates with `--firecrawl-credit-usd` / `--hyperbrowser-credit-usd` to match your
+plan. Swap the sample manifest for a JSONL of your own cited URLs (one
+`{"url", "category"}` per line; categories `normal`/`cloudflare`/`pdf`) for a
+representative run.
+
+## Browse what you captured
+
+A Streamlit viewer reads the manifests + index back out of the store and shows
+each captured URL's **screenshot, markdown, and HTML** side by side, filterable
+by bot and question:
+
+```bash
+AWS_PROFILE=default WEB_ARCHIVE_S3_BUCKET=metaculus-web-archive \
+ streamlit run forecasting_tools/agents_and_tools/source_archive/viewer.py
+```
+
+It uses the same `ArchiveConfig.from_env()` settings as capture, so it points at
+whatever bucket/prefix you captured to (no extra configuration).
+
+To browse a **local** capture (no S3/AWS), set `WEB_ARCHIVE_LOCAL_DIR` to the
+directory you captured into with `--local`:
+
+```bash
+WEB_ARCHIVE_LOCAL_DIR=./archive \
+ streamlit run forecasting_tools/agents_and_tools/source_archive/viewer.py
+```
+
+## The catalog: a browsable, coworker-legible view
+
+The viewer is interactive (good for us); the **catalog** is a set of static
+HTML/CSV pages written into the bucket so a non-technical coworker can browse the
+sources without any tooling. It is **question-primary** — the encyclopedia of
+every web source used for a question — plus `by-bot/` and `by-domain/`
+cross-views, built by joining the manifests with the index:
+
+```bash
+# write catalog/ into the bucket (uses WEB_ARCHIVE_S3_BUCKET)
+source-archive catalog
+# or against a local capture dir
+source-archive catalog --local ./archive
+```
+
+Start at `catalog/index.html` (or `catalog/READ_ME_FIRST.html` for the plain
+explainer). Each source shows its screenshot, who used it (bot + tool), and
+whether it was captured; each question also has a CSV. Data/API calls (a bot's
+`run_code` pulling a CSV, etc.) are **excluded** from the catalog — it lists web
+pages a bot read, not data endpoints — though they remain in the raw manifests.
+
+## Coverage: what fraction did we archive?
+
+The catalog shows what we *have*; the **coverage report** shows what we're
+*missing*. It's two separate reports, by ingestion path — different denominators,
+different ground truth:
+
+```bash
+source-archive coverage # both reports
+source-archive coverage --mode trace # just the complex/template bot
+source-archive coverage --csv ./cov  # also write cov_<mode>.csv (+ cov_<mode>_missing.txt)
+```
+
+- **trace** — the complex/template bot's instrumented runs (metac-ai-sdk). Traces
+ hold *every* URL the bot touched, so this is a true archival success-rate.
+- **comments** — every bot (Metaculus's own + outsiders) harvested from public
+ comments. Comments are truncated, so this denominator under-counts — coverage
+ here means "of the links visible in comments, how many we archived."
+
+The report is oriented to one question: **are there sources bots are using that
+we are not yet archiving?** It leads with that gap, then breaks it down by
+question, bot, tool, and the biggest-gap sites, plus the list of sources to
+collect. Non-source URLs — search-engine results, `run_code`-style tool/API
+calls, and malformed extractor junk — are excluded (same as the catalog).
+
+If capture runs have persisted their outcomes (`reports/<run_id>.json`, written
+automatically by `capture`), the gap is split into **never fetched** (the real
+collection gap) vs **fetched but failed** (a capture problem).
+
## The manifest: what to feed it
A run produces a **citation manifest** — a JSONL file with one record per cited
@@ -107,33 +229,49 @@ The pipeline dedupes URLs within the manifest before fetching.
## Where the manifest comes from
-You can write a manifest yourself, or generate one from a bot's published
-reasoning. Both first-party and third-party bots post their reasoning — with the
-source links they used — as comments on Metaculus, so the public, no-auth
-Metaculus API is the one ingestion path that works across *every* bot:
+You can write a manifest yourself, or generate one from a forecasting bot's
+reasoning — the source links a bot used are recorded in the comment it posts and,
+more completely, in its run traces.
-```python
-from forecasting_tools.agents_and_tools.source_archive.ingest import (
- MetaculusCommentHarvester,
-)
-from forecasting_tools.agents_and_tools.source_archive import manifest
+**From the database (operator path).** `harvest-db` reads the URLs a bot cited
+straight from the platform's Postgres database and emits a manifest. Point it at
+a database (a `postgresql://…` URL works — e.g. a Neon connection string):
-harvester = MetaculusCommentHarvester() # uses METACULUS_API_BASE_URL
-records = harvester.harvest_project(32506) # a tournament / project id
-manifest.write_file("run.jsonl", records) # -> feed to `capture`
+```bash
+# one post, or the latest day of activity
+source-archive harvest-db --post 29495 --dedupe --out run.jsonl
+source-archive harvest-db --days 1 --dedupe --upload --run-id "$(date -u +%F)"
```
-Or in one line from the CLI: `source-archive harvest 32506 --out run.jsonl`.
+It reads `comments_comment ⋈ users_user (is_bot)` and emits a citation manifest in the JSONL format described above.
+`--days` is uncapped by default; `--limit N` caps the row count for spot checks.
+`--public-only` restricts to public comments (all comments are read by default).
-The lower-level `extract_urls(text)` / `extract_citation_records(...)` helpers in
-`ingest.url_extraction` pull URLs out of any markdown/text (markdown links,
-autolinks, and bare URLs), if you are ingesting from somewhere other than
-comments.
+**DSN resolution (keep the credential off disk).** The DSN is resolved in this
+order: `--dsn` flag → `$METACULUS_DB_DSN` → macOS Keychain item
+`metaculus-db-dsn` → local default `dbname=metaculus`. The DSN is a real secret
+(it grants database read access), so prefer the **Keychain** over `.env` / a
+shell export — those land in files and shell history that any editor or coding
+agent can read. Store it once (you'll be prompted to paste it, so it never
+appears in your shell history):
-Caveat: comments are length-truncated when posted, so a comment-harvested URL
-list can be incomplete versus a bot's full research. For bots you control, an
-instrumented trace gives a fuller list; comment harvesting is the universal
-baseline.
+```bash
+security add-generic-password -U -a "$USER" -s metaculus-db-dsn -w
+# paste the full postgresql://USER:PASS@HOST/dbname?sslmode=require string, return
+```
+
+For the strongest guard, open **Keychain Access.app → login → `metaculus-db-dsn`
+→ Access Control → "Confirm before allowing access"** and clear the always-allow
+list. Every read then raises a GUI confirm: a human running the harvest clicks
+*Allow* (not *Always Allow*), but an automated agent driving a shell can't. With
+that set, the harvester works with no DSN in any file — `source-archive
+harvest-db --days 1` just prompts you once per run.
+
+**From text or traces.** The lower-level `extract_urls(text)` /
+`extract_citation_records(...)` helpers in `ingest.url_extraction` pull URLs out
+of any markdown/text (markdown links, autolinks, and bare URLs). For bots you
+control, an instrumented trace (`ingest-traces`) gives the fullest URL list; a
+comment gives a shallower one, since it is length-truncated when posted.
## How it's organized
@@ -142,7 +280,8 @@ baseline.
| `config.py` | Environment-driven `ArchiveConfig` |
| `models.py` | `CaptureResult`, `StoredCapture`, `CitationRecord` |
| `ingest/` | Build a manifest: URL extraction + Metaculus comment harvester |
-| `fetchers/` | Playwright (primary), Firecrawl (fallback), tiered orchestrator |
+| `fetchers/` | Playwright (primary) + CloakBrowser / Hyperbrowser / Firecrawl / PDF backups, tiered orchestrator |
+| `benchmark.py` | Backend bake-off: reliability + cost per backend over a manifest |
| `quality.py` | Reject 404s, block pages, and thin content before archiving |
| `storage/` | `BlobStore` interface with S3 and local backends |
| `content_store.py` | `url + content-hash` store with the TTL cache and dedup |
@@ -150,12 +289,22 @@ baseline.
| `pipeline.py` | `lookup → fetch → quality gate → store` |
| `cli.py` | `source-archive` command |
+## Roadmap
+
+Planned and shipped improvements — smarter dedup (URL canonicalization +
+redirect/content aliasing), the coworker-legible catalog, and coverage reports —
+are written up in [ROADMAP.md](ROADMAP.md).
+
## What lands in storage
```
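-<prefix>/index/<url_hash>.json per-URL capture history
+<prefix>/index/<url_hash>.json per-URL capture history (+ aliases)
+<prefix>/index/by-content/<content_hash>.json reverse index for content dedup
<prefix>/content/<url_hash>/<content_hash>.html
<prefix>/content/<url_hash>/<content_hash>.webp (screenshot)
<prefix>/content/<url_hash>/<content_hash>.md
<prefix>/manifests/<run_id>.jsonl the run's citation manifest
+<prefix>/reports/<run_id>.json per-URL capture outcomes (for coverage)
+<prefix>/catalog/index.html browsable catalog (by question/bot/site)
+<prefix>/catalog/by-question/<question_id>.{html,csv}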
```
diff --git a/forecasting_tools/agents_and_tools/source_archive/__init__.py b/forecasting_tools/agents_and_tools/source_archive/__init__.py
index 795f4b66..5ede914d 100644
--- a/forecasting_tools/agents_and_tools/source_archive/__init__.py
+++ b/forecasting_tools/agents_and_tools/source_archive/__init__.py
@@ -29,10 +29,7 @@
from forecasting_tools.agents_and_tools.source_archive.fetchers import (
build_default_fetcher,
)
-from forecasting_tools.agents_and_tools.source_archive.ingest import (
- MetaculusCommentHarvester,
- extract_urls,
-)
+from forecasting_tools.agents_and_tools.source_archive.ingest import extract_urls
from forecasting_tools.agents_and_tools.source_archive.models import (
CaptureResult,
CitationRecord,
@@ -51,7 +48,6 @@
"CapturePipeline",
"CitationRecord",
"ContentStore",
- "MetaculusCommentHarvester",
"PipelineSummary",
"StoreResult",
"StoredCapture",
diff --git a/forecasting_tools/agents_and_tools/source_archive/benchmark.py b/forecasting_tools/agents_and_tools/source_archive/benchmark.py
new file mode 100644
index 00000000..76d79083
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/benchmark.py
@@ -0,0 +1,459 @@
+"""Backend bake-off: run each capture backend independently over the same URLs.
+
+This is the harness for deciding *which* backup to put behind Playwright. Unlike
+the production :class:`TieredFetcher` (which stops at the first backend that
+passes the quality gate), the benchmark runs **every** selected backend over
+**every** URL, so you get an apples-to-apples table of reliability, latency, and
+estimated cost per backend — broken down by URL category (normal / cloudflare /
+pdf).
+
+Run it::
+
+ python -m forecasting_tools.agents_and_tools.source_archive.benchmark \\
+ --manifest sample_urls.jsonl \\
+ --backends playwright,cloakbrowser,firecrawl,firecrawl-stealth,hyperbrowser,pdf \\
+ --out bench.csv
+
+A backend whose dependency or API key is missing is skipped with a note rather
+than failing the whole run, so you can benchmark whatever you have configured.
+
+Cost figures are ESTIMATES from a documented pricing model (see ``Pricing``,
+sourced 2026-06); they are not billed amounts. Override the credit rates via
+CLI flags to match your plan.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import io
+import json
+import logging
+import statistics
+import sys
+import time
+from contextlib import nullcontext
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Callable
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import (
+ Fetcher,
+ FetchError,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.cloakbrowser_fetcher import (
+ CloakBrowserFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.firecrawl_fetcher import (
+ FirecrawlFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.hyperbrowser_fetcher import (
+ HyperbrowserFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.pdf_fetcher import (
+ PdfFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import (
+ PlaywrightFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+from forecasting_tools.agents_and_tools.source_archive.quality import evaluate
+
+logger = logging.getLogger(__name__)
+
+GB = 1_000_000_000
+
+# --- Pricing model -----------------------------------------------------------
+# $/unit as of 2026-06, from each vendor's public pricing + this repo's prior
+# cost experiment. These are the knobs to adjust for your plan.
+#
+# - Self-hosted compute (Playwright / CloakBrowser): ~$0.00001/page rendered
+# (measured in bot-sources probe). Marginal service fee is effectively $0.
+# - Firecrawl: 1 credit basic, 5 credits stealth/"enhanced" proxy. Standard
+# plan ≈ $0.00083/credit.
+# - Hyperbrowser: 1 credit ($0.001) basic, 10 credits ($0.01) with proxy,
+# plus $10/GB proxy bandwidth. 1 credit = $0.001.
+# - PDF: PyMuPDF4LLM local = $0; Firecrawl OCR fallback = ~1 credit/PDF page.
+
+
+@dataclass
+class Pricing:
+ self_host_per_page: float = 0.00001
+ firecrawl_credit_usd: float = 0.00083
+ firecrawl_basic_credits: int = 1
+ firecrawl_stealth_credits: int = 5
+ hyperbrowser_credit_usd: float = 0.001
+ hyperbrowser_basic_credits: int = 1
+ hyperbrowser_proxy_credits: int = 10
+ hyperbrowser_bandwidth_usd_per_gb: float = 10.0
+
+
+def estimate_cost(
+ backend: str, result: CaptureResult, response_bytes: int, pricing: Pricing
+) -> float:
+ """Estimated $ for one successful capture by ``backend``."""
+ meta = result.metadata or {}
+ if backend in ("playwright", "cloakbrowser"):
+ return pricing.self_host_per_page
+ if backend.startswith("firecrawl"):
+ proxy = str(meta.get("firecrawl_proxy", "basic")).lower()
+ credits = (
+ pricing.firecrawl_basic_credits
+ if proxy in ("", "basic")
+ else pricing.firecrawl_stealth_credits
+ )
+ return credits * pricing.firecrawl_credit_usd
+ if backend == "hyperbrowser":
+ credits = (
+ pricing.hyperbrowser_proxy_credits
+ if meta.get("used_proxy")
+ else pricing.hyperbrowser_basic_credits
+ )
+ bandwidth = (response_bytes / GB) * pricing.hyperbrowser_bandwidth_usd_per_gb
+ return credits * pricing.hyperbrowser_credit_usd + bandwidth
+ if backend == "pdf":
+ if meta.get("pdf_engine") == "firecrawl":
+ pages = int(meta.get("pdf_pages") or 1)
+ return pages * pricing.firecrawl_credit_usd
+ return 0.0 # local PyMuPDF4LLM
+ return 0.0
+
+
+# --- Backend registry --------------------------------------------------------
+# Factories so a missing dependency / API key only skips that backend. The
+# ``context`` flag marks browser backends that must be entered as a context
+# manager (the browser launches once and is reused across URLs).
+
+
+@dataclass
+class BackendSpec:
+ name: str
+ factory: Callable[[ArchiveConfig], Fetcher]
+ context: bool = False
+ # Optional pre-flight: return a reason string if the backend can't run
+ # (missing key/dep) so the bake-off reports a clean SKIP instead of N/N
+ # fetch_errors. ``None`` means "looks runnable".
+ precheck: Callable[[ArchiveConfig], str | None] | None = None
+
+
+def _need_firecrawl_key(config: ArchiveConfig) -> str | None:
+ if not config.firecrawl_api_key:
+ return "FIRECRAWL_API_KEY not set"
+ return None
+
+
+def _need_hyperbrowser_key(config: ArchiveConfig) -> str | None:
+ if not config.hyperbrowser_api_key:
+ return "HYPERBROWSER_API_KEY not set"
+ return None
+
+
+def _firecrawl_stealth(config: ArchiveConfig) -> FirecrawlFetcher:
+ # Force the proxy/stealth path so this row measures the Cloudflare-grade
+ # (5-credit) cost, even if the operator left the default at "basic".
+ proxy = config.firecrawl_proxy
+ if proxy in ("", "basic"):
+ proxy = "auto"
+ f = FirecrawlFetcher(config.model_copy(update={"firecrawl_proxy": proxy}))
+ f.name = "firecrawl-stealth"
+ return f
+
+
+BACKENDS: dict[str, BackendSpec] = {
+ "playwright": BackendSpec("playwright", PlaywrightFetcher, context=True),
+ "cloakbrowser": BackendSpec("cloakbrowser", CloakBrowserFetcher, context=True),
+ "firecrawl": BackendSpec(
+ "firecrawl", FirecrawlFetcher, precheck=_need_firecrawl_key
+ ),
+ "firecrawl-stealth": BackendSpec(
+ "firecrawl-stealth", _firecrawl_stealth, precheck=_need_firecrawl_key
+ ),
+ "hyperbrowser": BackendSpec(
+ "hyperbrowser", HyperbrowserFetcher, precheck=_need_hyperbrowser_key
+ ),
+ "pdf": BackendSpec("pdf", PdfFetcher),
+}
+
+
+# --- Sample manifest ---------------------------------------------------------
+# A curated starter set spanning the three categories the backup must handle.
+# Replace/extend with your own real cited URLs for a representative run.
+SAMPLE_MANIFEST: list[dict] = [
+ {"url": "https://example.com", "category": "normal"},
+ {"url": "https://en.wikipedia.org/wiki/Forecasting", "category": "normal"},
+ {"url": "https://www.federalregister.gov/", "category": "normal"},
+ # Sites commonly fronted by Cloudflare / anti-bot:
+ {"url": "https://www.g2.com/", "category": "cloudflare"},
+ {"url": "https://www.indeed.com/", "category": "cloudflare"},
+ {"url": "https://www.zillow.com/", "category": "cloudflare"},
+ # PDFs (the gap Playwright can't render):
+ {"url": "https://arxiv.org/pdf/1706.03762", "category": "pdf"},
+ {"url": "https://bitcoin.org/bitcoin.pdf", "category": "pdf"},
+]
+
+
+@dataclass
+class Row:
+ backend: str
+ url: str
+ category: str
+ passed: bool
+ reason: str
+ seconds: float
+ html_bytes: int
+ md_bytes: int
+ screenshot_bytes: int
+ cost_usd: float
+ error: str = ""
+
+
+@dataclass
+class BackendRun:
+ name: str
+ rows: list[Row] = field(default_factory=list)
+ skipped: str = ""
+
+
+def _sizes(result: CaptureResult) -> tuple[int, int, int]:
+ html = len(result.html.encode()) if result.html else 0
+ md = len(result.markdown.encode()) if result.markdown else 0
+ shot = len(result.screenshot) if result.screenshot else 0
+ return html, md, shot
+
+
+def run_backend(
+ spec: BackendSpec,
+ manifest: list[dict],
+ config: ArchiveConfig,
+ pricing: Pricing,
+) -> BackendRun:
+ run = BackendRun(name=spec.name)
+ if spec.precheck is not None:
+ reason = spec.precheck(config)
+ if reason:
+ run.skipped = reason
+ logger.warning("%s skipped: %s", spec.name, reason)
+ return run
+ try:
+ fetcher = spec.factory(config)
+ except Exception as e: # construction (e.g. missing key) — skip cleanly
+ run.skipped = f"could not construct {spec.name}: {e}"
+ logger.warning(run.skipped)
+ return run
+
+ cm = fetcher if spec.context else nullcontext(fetcher)
+ try:
+ with cm as live:
+ for record in manifest:
+ run.rows.append(_capture_one(spec.name, live, record, pricing))
+ except FetchError as e:
+ # A browser backend can fail to even start (e.g. cloakbrowser not
+ # installed). Record it as a skip rather than crashing the bake-off.
+ if not run.rows:
+ run.skipped = f"{spec.name} unavailable: {e}"
+ logger.warning(run.skipped)
+ else:
+ raise
+ return run
+
+
+def _capture_one(backend: str, fetcher: Fetcher, record: dict, pricing: Pricing) -> Row:
+ url = record["url"]
+ category = record.get("category", "normal")
+ start = time.monotonic()
+ try:
+ result = fetcher.fetch(url)
+ except FetchError as e:
+ return Row(
+ backend,
+ url,
+ category,
+ False,
+ "fetch_error",
+ round(time.monotonic() - start, 2),
+ 0,
+ 0,
+ 0,
+ 0.0,
+ error=str(e)[:300],
+ )
+ except Exception as e: # backend bug / unexpected SDK error
+ return Row(
+ backend,
+ url,
+ category,
+ False,
+ "exception",
+ round(time.monotonic() - start, 2),
+ 0,
+ 0,
+ 0,
+ 0.0,
+ error=str(e)[:300],
+ )
+
+ seconds = round(time.monotonic() - start, 2)
+ verdict = evaluate(result)
+ html_b, md_b, shot_b = _sizes(result)
+ response_bytes = html_b + shot_b
+ cost = (
+ estimate_cost(backend, result, response_bytes, pricing)
+ if verdict.passed
+ else 0.0
+ )
+ return Row(
+ backend,
+ url,
+ category,
+ verdict.passed,
+ verdict.reason or "ok",
+ seconds,
+ html_b,
+ md_b,
+ shot_b,
+ round(cost, 6),
+ )
+
+
+# --- Reporting ---------------------------------------------------------------
+def write_csv(path: str, runs: list[BackendRun]) -> None:
+ buf = io.StringIO()
+ w = csv.writer(buf)
+ w.writerow(
+ [
+ "backend",
+ "url",
+ "category",
+ "passed",
+ "reason",
+ "seconds",
+ "html_bytes",
+ "md_bytes",
+ "screenshot_bytes",
+ "cost_usd",
+ "error",
+ ]
+ )
+ for run in runs:
+ for r in run.rows:
+ w.writerow(
+ [
+ r.backend,
+ r.url,
+ r.category,
+ r.passed,
+ r.reason,
+ r.seconds,
+ r.html_bytes,
+ r.md_bytes,
+ r.screenshot_bytes,
+ r.cost_usd,
+ r.error,
+ ]
+ )
+ Path(path).write_text(buf.getvalue(), encoding="utf-8")
+
+
+def summarize(runs: list[BackendRun], urls_per_question: int, tail_share: float) -> str:
+ cats = ["normal", "cloudflare", "pdf"]
+ lines = []
+ header = (
+ f"{'backend':<18}{'overall':>9}"
+ + "".join(f"{c:>11}" for c in cats)
+ + f"{'med s':>8}{'$/page':>10}{'proj $/q':>10}"
+ )
+ lines.append(header)
+ lines.append("-" * len(header))
+ for run in runs:
+ if run.skipped:
+ lines.append(f"{run.name:<18} SKIPPED: {run.skipped[:80]}")
+ continue
+ total = len(run.rows)
+ passed = [r for r in run.rows if r.passed]
+ overall = f"{len(passed)}/{total}"
+
+ def cat_rate(cat: str) -> str:
+ rows = [r for r in run.rows if r.category == cat]
+ if not rows:
+ return "-"
+ ok = sum(1 for r in rows if r.passed)
+ return f"{ok}/{len(rows)}"
+
+ med = statistics.median([r.seconds for r in run.rows]) if run.rows else 0
+ cost_per = statistics.mean([r.cost_usd for r in passed]) if passed else 0.0
+ # Illustrative: if THIS backend alone handled the whole post-Playwright
+ # tail of a question. (tail_share × urls × $/successful page.)
+ proj = tail_share * urls_per_question * cost_per
+ lines.append(
+ f"{run.name:<18}{overall:>9}"
+ + "".join(f"{cat_rate(c):>11}" for c in cats)
+ + f"{med:>8.1f}{cost_per:>10.5f}{proj:>10.3f}"
+ )
+ note = (
+ f"\nproj $/q assumes one backend covers a {tail_share:.0%} tail of "
+ f"{urls_per_question} URLs/question, BEFORE the TTL cache (which makes "
+ f"re-runs nearly free). Costs are model estimates, not billed amounts."
+ )
+ return "\n".join(lines) + "\n" + note
+
+
+def load_manifest(path: str | None) -> list[dict]:
+ if not path:
+ return SAMPLE_MANIFEST
+ records = []
+ for line in Path(path).read_text(encoding="utf-8").splitlines():
+ line = line.strip()
+ if line:
+ records.append(json.loads(line))
+ return records
+
+
+def main(argv: list[str] | None = None) -> int:
+ logging.basicConfig(level=logging.WARNING, format="%(levelname)s %(message)s")
+ p = argparse.ArgumentParser(description="Capture-backend bake-off.")
+ p.add_argument(
+ "--manifest", help="JSONL of {url, category}. Omit for the built-in sample."
+ )
+ p.add_argument(
+ "--backends",
+ default="playwright,cloakbrowser,firecrawl,firecrawl-stealth,hyperbrowser,pdf",
+ help="Comma-separated subset of: " + ", ".join(BACKENDS),
+ )
+ p.add_argument("--out", default="benchmark.csv", help="CSV output path.")
+ p.add_argument("--urls-per-question", type=int, default=450)
+ p.add_argument(
+ "--tail-share",
+ type=float,
+ default=0.30,
+ help="Fraction of URLs that fall through Playwright.",
+ )
+ p.add_argument("--firecrawl-credit-usd", type=float, default=0.00083)
+ p.add_argument("--hyperbrowser-credit-usd", type=float, default=0.001)
+ args = p.parse_args(argv)
+
+ config = ArchiveConfig.from_env()
+ pricing = Pricing(
+ firecrawl_credit_usd=args.firecrawl_credit_usd,
+ hyperbrowser_credit_usd=args.hyperbrowser_credit_usd,
+ )
+ manifest = load_manifest(args.manifest)
+
+ selected = [b.strip() for b in args.backends.split(",") if b.strip()]
+ unknown = [b for b in selected if b not in BACKENDS]
+ if unknown:
+ p.error(f"unknown backends: {unknown}. Choose from {list(BACKENDS)}")
+
+ runs: list[BackendRun] = []
+ for name in selected:
+ print(f"running {name} over {len(manifest)} URLs...", file=sys.stderr)
+ runs.append(run_backend(BACKENDS[name], manifest, config, pricing))
+
+ write_csv(args.out, runs)
+ print("\n" + summarize(runs, args.urls_per_question, args.tail_share))
+ print(f"\nper-URL detail written to {args.out}")
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/forecasting_tools/agents_and_tools/source_archive/canonicalize.py b/forecasting_tools/agents_and_tools/source_archive/canonicalize.py
new file mode 100644
index 00000000..b791a47e
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/canonicalize.py
@@ -0,0 +1,115 @@
+"""Canonicalize URLs so trivially-different links collapse to one dedup key.
+
+Every capture of a page is grouped under ``url_hash`` (see :mod:`models`).
+Historically that hashed the *raw* URL string, so ``…/x``, ``…/x/``,
+``…/x?utm_source=…`` and ``…/x#frag`` were four different "sources" — inflating
+both storage and any "how many sources have we covered" count.
+
+This module normalizes away differences that do **not** change *which page* you
+get, so the dedup key is stable across those variants:
+
+ - lowercase scheme + host, strip a default port (``:80`` / ``:443``)
+ - drop the fragment (``#…``)
+ - drop known analytics / click-tracking query params, then sort the rest
+ - normalize a trailing slash (``…/x/`` -> ``…/x``; root collapses to no path)
+
+It is deliberately conservative. It does **not** upgrade ``http`` -> ``https`` or
+strip ``www.``: those can resolve to genuinely different pages on some hosts, so
+collapsing them belongs to a later, opt-in phase (see ``ROADMAP.md``).
+"""
+
+from __future__ import annotations
+
+from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
+
+# Query params that are analytics/click tracking and never select the page.
+# Matched case-insensitively; any key starting with a prefix below is also
+# dropped. Bare ``ref`` / ``source`` are intentionally left alone — they are too
+# often load-bearing (API refs, content selectors) to drop blindly.
+_TRACKING_PARAMS = frozenset(
+ {
+ "gclid",
+ "gclsrc",
+ "dclid",
+ "gbraid",
+ "wbraid",
+ "fbclid",
+ "msclkid",
+ "yclid",
+ "twclid",
+ "mc_eid",
+ "mc_cid",
+ "_hsenc",
+ "_hsmi",
+ "igshid",
+ "igsh",
+ "vero_id",
+ "vero_conv",
+ "oly_anon_id",
+ "oly_enc_id",
+ "spm",
+ "scm",
+ "ref_src",
+ "ref_url",
+ }
+)
+_TRACKING_PREFIXES = ("utm_",)
+
+_DEFAULT_PORTS = {"http": "80", "https": "443"}
+
+
+def _is_tracking(key: str) -> bool:
+ k = key.lower()
+ return k in _TRACKING_PARAMS or any(k.startswith(p) for p in _TRACKING_PREFIXES)
+
+
+def canonicalize_url(url: str) -> str:
+ """Return a normalized form of ``url`` to use as a dedup key.
+
+ Idempotent — ``canonicalize_url(canonicalize_url(u)) == canonicalize_url(u)``.
+ Non-http(s) or unparsable input (``mailto:``, relative paths) is returned stripped
+ but as-is. NOTE(review): an invalid port still raises ``ValueError`` (``parts.port``).
+ """
+ if not url:
+ return url
+ raw = url.strip()
+ try:
+ parts = urlsplit(raw)
+ except ValueError:
+ return raw
+ if parts.scheme not in ("http", "https") or not parts.netloc:
+ return raw
+
+ scheme = parts.scheme.lower()
+
+ # netloc: lowercase host (bracket IPv6), keep userinfo, strip default port.
+ host = (parts.hostname or "").lower()
+ if ":" in host: # IPv6 literal
+ host = f"[{host}]"
+ netloc = host
+ if parts.username is not None:
+ auth = parts.username
+ if parts.password is not None:
+ auth += f":{parts.password}"
+ netloc = f"{auth}@{netloc}"
+ if parts.port is not None and str(parts.port) != _DEFAULT_PORTS.get(scheme):
+ netloc += f":{parts.port}"
+
+ # path: collapse the bare root to empty; drop a trailing slash otherwise.
+ path = parts.path
+ if path in ("", "/"):
+ path = ""
+ elif path.endswith("/"):
+ path = path.rstrip("/")
+
+ # query: drop tracking params, then sort so order doesn't matter.
+ kept = [
+ (k, v)
+ for k, v in parse_qsl(parts.query, keep_blank_values=True)
+ if not _is_tracking(k)
+ ]
+ kept.sort()
+ query = urlencode(kept)
+
+ # fragment: always dropped.
+ return urlunsplit((scheme, netloc, path, query, ""))
diff --git a/forecasting_tools/agents_and_tools/source_archive/catalog.py b/forecasting_tools/agents_and_tools/source_archive/catalog.py
new file mode 100644
index 00000000..a9ec97f4
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/catalog.py
@@ -0,0 +1,562 @@
+"""Generate a coworker-legible catalog over the hash-addressed store.
+
+The content store is keyed by URL/content hash — great for dedup, opaque to a
+human browsing the bucket. This builds a browsable ``catalog/`` layer on top by
+joining the citation manifests (who cited what, on which question, with which
+tool) with the per-URL index (what actually got captured). Blobs are never moved
+or duplicated; the catalog only writes small HTML/CSV pointer pages.
+
+Views (question-primary, with two cross-views):
+
+ catalog/READ_ME_FIRST.html plain-language explainer for coworkers
+ catalog/index.html landing page + headline counts
+ catalog/by-question/<question_id>.html ★ the encyclopedia for one question:
+ catalog/by-question/<question_id>.csv every source, deduped, tagged with the
+ bots/tools/queries that used it
+ catalog/by-bot/<bot>.html one bot's sources across questions
+ catalog/by-domain/<domain>.html sources grouped by site
+
+The question view is the default because that's how post-mortems and
+non-technical coworkers think ("what did we know about question X?"); ``by-bot``
+covers profiling/"what is the top bot-maker doing", always next to how other
+bots handled the same question.
+"""
+
+from __future__ import annotations
+
+import csv
+import html
+import io
+from collections import defaultdict
+from urllib.parse import urlsplit
+
+from pydantic import BaseModel
+
+from forecasting_tools.agents_and_tools.source_archive import manifest as manifest_io
+from forecasting_tools.agents_and_tools.source_archive.canonicalize import (
+ canonicalize_url,
+)
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
+from forecasting_tools.agents_and_tools.source_archive.models import (
+ CitationRecord,
+ url_hash,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
+ BlobStore,
+)
+
+_UNKNOWN_Q = "unknown-question"
+
+# Tools that fetch data/API endpoints, not human-readable web pages. A URL only
+# ever touched by one of these is a data call (e.g. a bot's run_code pulling a
+# CSV), so it is kept out of the page-oriented catalog (it stays in the raw
+# manifests). A URL also seen via search/page-fetch is treated as a real page.
+_NON_PAGE_TOOLS = {
+ "run_code",
+ "code",
+ "python",
+ "run_python",
+ "code_interpreter",
+ "execute_code",
+ "bash",
+ "shell",
+}
+
+
+def tool_call_only(citations: list) -> bool:
+ """True if a URL was touched *only* by code-execution tools (a data/API call,
+ not a page a bot read)."""
+ tools = {(c.tool_name or "").lower() for c in citations}
+ code_tools = tools & _NON_PAGE_TOOLS
+ other_tools = tools - _NON_PAGE_TOOLS - {""}
+ return bool(code_tools) and not other_tools
+
+
+def _is_tool_call_only(source: "Source") -> bool:
+ return tool_call_only(source.citations)
+
+
+# Search-engine result pages are navigation, not sources — a bot citing a
+# google/duckduckgo search URL hasn't handed us a page worth archiving.
+_SEARCH_HOSTS = {
+ "duckduckgo.com",
+ "bing.com",
+ "search.brave.com",
+ "search.yahoo.com",
+ "ecosia.org",
+ "startpage.com",
+ "baidu.com",
+ "ask.com",
+ "qwant.com",
+ "search.marginalia.nu",
+ "kagi.com",
+}
+# Percent-encoded junk that means the extractor swallowed markdown / a second URL
+# / control chars into the URL (legacy captures from before extraction hardening).
+_MALFORMED_MARKERS = ("%5b", "%5d", "%5c", "%0a", "%0d", "%28http", "%29%5b")
+
+
+def is_search_url(url: str) -> bool:
+ host = urlsplit(url).netloc.lower()
+ host = host[4:] if host.startswith("www.") else host
+ return host in _SEARCH_HOSTS or host == "google.com" or host.startswith("google.")
+
+
+def is_malformed_url(url: str) -> bool:
+ low = url.lower()
+ return url.count("://") > 1 or any(m in low for m in _MALFORMED_MARKERS)
+
+
+def exclusion_reason(url: str, citations: list) -> str | None:
+ """Why a cited URL is kept out of the page catalog / coverage, or ``None`` to
+ keep it. ``malformed`` (extractor junk), ``search`` (search-engine results),
+ ``tool_call`` (data/API endpoint touched only by code tools)."""
+ if is_malformed_url(url):
+ return "malformed"
+ if is_search_url(url):
+ return "search"
+ if tool_call_only(citations):
+ return "tool_call"
+ return None
+
+
+class Citation(BaseModel):
+ bot: str | None = None
+ question_id: str | None = None
+ question_url: str | None = None
+ run_id: str | None = None
+ tool_name: str | None = None
+ origin: str | None = None
+ query: str | None = None
+ cited_url: str = "" # the original URL as cited (pre-canonicalization)
+
+
+class Source(BaseModel):
+ canonical_url: str
+ domain: str
+ captured: bool = False
+ content_hash: str | None = None
+ html_key: str | None = None # store-relative (no prefix)
+ screenshot_key: str | None = None
+ markdown_key: str | None = None
+ citations: list[Citation] = []
+
+ @property
+ def bots(self) -> list[str]:
+ return sorted({c.bot for c in self.citations if c.bot})
+
+ @property
+ def question_ids(self) -> list[str]:
+ return sorted({c.question_id for c in self.citations if c.question_id})
+
+
+class CatalogData(BaseModel):
+ sources: list[Source] = []
+ excluded: dict[str, int] = {} # exclusion reason -> count of URLs dropped
+
+ @property
+ def hidden_total(self) -> int:
+ return sum(self.excluded.values())
+
+ def by_question(self) -> dict[str, list[Source]]:
+ out: dict[str, list[Source]] = defaultdict(list)
+ for s in self.sources:
+ qids = s.question_ids or [_UNKNOWN_Q]
+ for qid in qids:
+ out[qid].append(s)
+ return out
+
+ def by_bot(self) -> dict[str, list[Source]]:
+ out: dict[str, list[Source]] = defaultdict(list)
+ for s in self.sources:
+ for bot in s.bots or ["(no bot)"]:
+ out[bot].append(s)
+ return out
+
+ def by_domain(self) -> dict[str, list[Source]]:
+ out: dict[str, list[Source]] = defaultdict(list)
+ for s in self.sources:
+ out[s.domain].append(s)
+ return out
+
+ def question_url(self, qid: str) -> str | None:
+ for s in self.sources:
+ for c in s.citations:
+ if c.question_id == qid and c.question_url:
+ return c.question_url
+ return None
+
+
+# --------------------------------------------------------------------------- #
+# Build (join manifests + index)
+# --------------------------------------------------------------------------- #
+def _domain(url: str) -> str:
+ host = urlsplit(url).netloc.lower()
+ return host[4:] if host.startswith("www.") else host
+
+
+def _strip_prefix(key: str | None, prefix: str) -> str | None:
+ if not key:
+ return None
+ p = prefix.rstrip("/") + "/"
+ return key[len(p) :] if key.startswith(p) else key
+
+
+def _latest_capture(store: ContentStore, canonical_url: str) -> dict | None:
+ """Return the latest stored capture dict for a URL (ignoring TTL), following
+ a redirect alias if present. ``None`` if nothing was ever captured."""
+ index = store._read_index(url_hash(canonical_url))
+ if not index:
+ return None
+ if index.get("alias_of"):
+ index = store._read_index(index["alias_of"])
+ if not index:
+ return None
+ ch = index.get("latest_content_hash")
+ return (index.get("captures") or {}).get(ch)
+
+
+ def _load_all_records(store: BlobStore, prefix: str) -> list[CitationRecord]:
+     """Load every citation record from the ``.jsonl`` manifests under ``prefix``.
+
+     Keys listed under ``<prefix>/manifests/`` that do not end in ``.jsonl``
+     are ignored. A manifest that fails to decode as UTF-8, or whose parse
+     raises ``ValueError``, is skipped silently (best effort).
+     """
+     manifests_root = f"{prefix.rstrip('/')}/manifests/"
+     loaded: list[CitationRecord] = []
+     for key in store.list_keys(manifests_root):
+         if key.endswith(".jsonl"):
+             try:
+                 text = store.get(key).decode("utf-8")
+                 loaded.extend(manifest_io.loads(text))
+             except (UnicodeDecodeError, ValueError):
+                 # Unreadable manifest: leave it out rather than abort the build.
+                 pass
+     return loaded
+
+
+ def build_sources(store: BlobStore, config: ArchiveConfig) -> list[Source]:
+     """Join every manifest with the index into one ``Source`` per canonical URL.
+
+     Unfiltered (includes tool/API-call URLs) so other tools — e.g. the coverage
+     report — can classify them. The catalog itself filters these out.
+
+     Records without a URL are dropped. The result is ordered by canonical URL,
+     and blob keys are made relative to the configured prefix.
+     """
+     prefix = config.s3_prefix.rstrip("/")
+     content_store = ContentStore(store, config)
+
+     # Bucket the raw records by canonical URL so each URL yields one Source.
+     by_canonical: dict[str, list[CitationRecord]] = defaultdict(list)
+     for record in _load_all_records(store, prefix):
+         if record.url:
+             by_canonical[canonicalize_url(record.url)].append(record)
+
+     built: list[Source] = []
+     for canonical in sorted(by_canonical):
+         capture = _latest_capture(content_store, canonical)
+         fields = capture or {}  # no capture -> every blob field resolves to None
+         citations = [
+             Citation(
+                 bot=record.bot,
+                 question_id=record.question_id or record.metaculus_id,
+                 question_url=record.question_url,
+                 run_id=record.run_id,
+                 tool_name=record.tool_name,
+                 origin=record.origin,
+                 query=record.query,
+                 cited_url=record.url,
+             )
+             for record in by_canonical[canonical]
+         ]
+         built.append(
+             Source(
+                 canonical_url=canonical,
+                 domain=_domain(canonical) or "(unknown)",
+                 captured=capture is not None,
+                 content_hash=fields.get("content_hash"),
+                 html_key=_strip_prefix(fields.get("html_key"), prefix),
+                 screenshot_key=_strip_prefix(fields.get("screenshot_key"), prefix),
+                 markdown_key=_strip_prefix(fields.get("markdown_key"), prefix),
+                 citations=citations,
+             )
+         )
+     return built
+
+
+ def build_catalog(store: BlobStore, config: ArchiveConfig) -> CatalogData:
+     """Build the catalog: every source that ``exclusion_reason`` does not reject.
+
+     Rejected sources are not listed; they are only counted per reason in the
+     returned ``CatalogData.excluded`` mapping.
+     """
+     kept: list[Source] = []
+     tally: dict[str, int] = {}
+     for source in build_sources(store, config):
+         reason = exclusion_reason(source.canonical_url, source.citations)
+         if not reason:
+             kept.append(source)
+             continue
+         tally[reason] = tally.get(reason, 0) + 1
+     return CatalogData(sources=kept, excluded=tally)
+
+
+# --------------------------------------------------------------------------- #
+# Render
+# --------------------------------------------------------------------------- #
+_CSS = """
+body{font:14px/1.5 -apple-system,Segoe UI,Roboto,sans-serif;margin:0;color:#1a1a1a;background:#fafafa}
+header{background:#1f2937;color:#fff;padding:16px 24px}
+header a{color:#cbd5e1}
+h1{font-size:20px;margin:0 0 4px}
+.wrap{padding:24px;max-width:1100px;margin:0 auto}
+.muted{color:#6b7280}
+.badge{display:inline-block;font-size:11px;padding:1px 7px;border-radius:10px}
+.ok{background:#dcfce7;color:#166534}.no{background:#fee2e2;color:#991b1b}
+.card{background:#fff;border:1px solid #e5e7eb;border-radius:8px;padding:12px;margin:12px 0;display:flex;gap:12px}
+.card img{width:160px;height:110px;object-fit:cover;object-position:top;border:1px solid #e5e7eb;border-radius:4px;background:#f3f4f6}
+.card .meta{flex:1;min-width:0}
+.card .u{font-weight:600;word-break:break-all}
+.tags{margin-top:6px}
+.tag{display:inline-block;background:#eef2ff;color:#3730a3;font-size:11px;padding:1px 7px;border-radius:10px;margin:2px 4px 2px 0}
+.links a{margin-right:10px;font-size:12px}
+table{border-collapse:collapse;width:100%;background:#fff}
+td,th{border:1px solid #e5e7eb;padding:6px 8px;text-align:left;font-size:13px}
+th{background:#f3f4f6}
+a.grid{display:inline-block;margin:4px 12px 4px 0}
+"""
+
+
+ def _esc(s) -> str:
+     """HTML-escape ``str(s)``; ``None`` becomes the empty string."""
+     if s is None:
+         return ""
+     return html.escape(str(s))
+
+
+def _page(title: str, body: str, rel_root: str) -> str:
+ return (
+ ""
+ f"{_esc(title)}"
+ f""
+ f"{body}
"
+ )
+
+
+ class Linker:
+     """Builds an openable link for a blob key relative to the archive prefix.
+
+     For an ``S3BlobStore`` the link is an absolute ``https://`` URL on the
+     bucket's S3 host; for any other store it is a path relative to the page.
+     """
+
+     def __init__(self, store: BlobStore, config: ArchiveConfig):
+         # S3BlobStore is imported here, inside the constructor.
+         from forecasting_tools.agents_and_tools.source_archive.storage import (
+             S3BlobStore,
+         )
+
+         self.prefix = config.s3_prefix.rstrip("/")
+         self.region = config.aws_region
+         self.bucket = config.s3_bucket
+         self.is_s3 = isinstance(store, S3BlobStore)
+
+     def url(self, rel_key: str | None, rel_root: str) -> str | None:
+         """Return a link for ``rel_key``, or ``None`` when the key is empty.
+
+         ``rel_root`` is the relative path from the linking page back to the
+         prefix directory; it is only used for non-S3 stores.
+         """
+         if not rel_key:
+             return None
+         if not self.is_s3:
+             return f"{rel_root}{rel_key}"  # local: relative within the prefix dir
+         if self.region:
+             host = f"{self.bucket}.s3.{self.region}.amazonaws.com"
+         else:
+             host = f"{self.bucket}.s3.amazonaws.com"
+         return f"https://{host}/{self.prefix}/{rel_key}"
+
+
+def _source_card(s: Source, linker: Linker, rel_root: str) -> str:
+ shot = linker.url(s.screenshot_key, rel_root)
+ html_link = linker.url(s.html_key, rel_root)
+ md_link = linker.url(s.markdown_key, rel_root)
+ badge = (
+ "captured"
+ if s.captured
+ else "not captured"
+ )
+ img = (
+ f"
"
+ if shot
+ else ""
+ )
+ tools = sorted({c.tool_name for c in s.citations if c.tool_name})
+ tags = "".join(f"{_esc(b)}" for b in s.bots)
+ tool_tags = "".join(f"{_esc(t)}" for t in tools)
+ links = []
+ if html_link:
+ links.append(f"HTML")
+ if md_link:
+ links.append(f"markdown")
+ if shot:
+ links.append(f"screenshot")
+ links.append(f"live ↗")
+ return (
+ f""
+ )
+
+
+ def _question_csv(sources: list[Source]) -> str:
+     """Render ``sources`` as CSV text: a header row, then one row per source.
+
+     Bots and tool names are joined with ``"; "``; tool names are de-duplicated
+     and sorted, and citations without a tool name are ignored.
+     """
+     rows: list[list[str]] = [
+         ["url", "domain", "captured", "bots", "tools", "screenshot_key"]
+     ]
+     for source in sources:
+         tool_names = sorted(
+             {cite.tool_name for cite in source.citations if cite.tool_name}
+         )
+         captured_flag = "yes" if source.captured else "no"
+         rows.append(
+             [
+                 source.canonical_url,
+                 source.domain,
+                 captured_flag,
+                 "; ".join(source.bots),
+                 "; ".join(tool_names),
+                 source.screenshot_key or "",
+             ]
+         )
+     buffer = io.StringIO()
+     csv.writer(buffer).writerows(rows)
+     return buffer.getvalue()
+
+
+# --------------------------------------------------------------------------- #
+# Write
+# --------------------------------------------------------------------------- #
+ class CatalogSummary(BaseModel):
+     """Counts describing one catalog build; ``str()`` gives a one-line report."""
+
+     sources: int = 0
+     captured: int = 0
+     questions: int = 0
+     bots: int = 0
+     domains: int = 0
+     # Number of excluded URLs per exclusion reason.
+     excluded: dict[str, int] = {}
+
+     def __str__(self) -> str:
+         """Format the counts, appending a per-reason breakdown when any exist."""
+         hidden = sum(self.excluded.values())
+         breakdown = ""
+         if self.excluded:
+             parts = ", ".join(
+                 f"{reason}={count}" for reason, count in sorted(self.excluded.items())
+             )
+             breakdown = " (" + parts + ")"
+         return (
+             f"Catalog: {self.sources} page sources ({self.captured} captured) across "
+             f"{self.questions} questions, {self.bots} bots, {self.domains} domains "
+             f"— {hidden} non-page URLs excluded{breakdown}"
+         )
+
+
+ def _slug(value: str) -> str:
+     """Turn ``value`` into a file-name-safe slug of at most 80 characters.
+
+     Alphanumerics and ``-``, ``_``, ``.`` are kept (dots keep domains readable,
+     e.g. ``a.test.html``); every other character becomes ``-``. Leading and
+     trailing ``-``/``.`` are trimmed and ``..`` is replaced by ``.`` in one
+     pass. An empty result falls back to ``"x"``.
+     """
+     cleaned = "".join(
+         ch if (ch.isalnum() or ch in "-_.") else "-" for ch in value
+     )
+     cleaned = cleaned.strip("-.").replace("..", ".")
+     return cleaned[:80] or "x"
+
+
+def write_catalog(
+ store: BlobStore,
+ config: ArchiveConfig,
+ out_store: BlobStore | None = None,
+) -> CatalogSummary:
+ """Build the catalog from ``store`` and write it to ``out_store`` (default:
+ ``store``). Pass a separate ``out_store`` to preview a live bucket's catalog
+ into a local directory without mutating the bucket."""
+ prefix = config.s3_prefix.rstrip("/")
+ data = build_catalog(store, config)
+ out = out_store or store
+ linker = Linker(out, config)
+
+ def put(rel: str, body: str, ctype: str) -> None:
+ out.put(f"{prefix}/catalog/{rel}", body.encode("utf-8"), content_type=ctype)
+
+ by_q = data.by_question()
+ by_b = data.by_bot()
+ by_d = data.by_domain()
+
+ # Per-question pages (the encyclopedia) + CSVs. rel_root: catalog/<view>/ -> ../../
+ rr2 = "../../"
+ for qid, sources in sorted(by_q.items()):
+ sources = sorted(sources, key=lambda s: s.canonical_url)
+ qurl = data.question_url(qid)
+ head = f"Question {_esc(qid)}
"
+ if qurl:
+ head += f"{_esc(qurl)} ↗
"
+ head += (
+ f"{len(sources)} source(s); "
+ f"{sum(s.captured for s in sources)} captured · "
+ f"download CSV
"
+ )
+ cards = "".join(_source_card(s, linker, rr2) for s in sources)
+ put(
+ f"by-question/{_slug(qid)}.html",
+ _page(f"Question {qid}", head + cards, rr2),
+ "text/html",
+ )
+ put(f"by-question/{_slug(qid)}.csv", _question_csv(sources), "text/csv")
+
+ # Per-bot and per-domain cross-views.
+ for bot, sources in sorted(by_b.items()):
+ sources = sorted(sources, key=lambda s: s.canonical_url)
+ body = f"Bot: {_esc(bot)}
{len(sources)} source(s)
"
+ body += "".join(_source_card(s, linker, rr2) for s in sources)
+ put(f"by-bot/{_slug(bot)}.html", _page(f"Bot {bot}", body, rr2), "text/html")
+
+ for domain, sources in sorted(by_d.items()):
+ sources = sorted(sources, key=lambda s: s.canonical_url)
+ body = f"Site: {_esc(domain)}
{len(sources)} source(s)
"
+ body += "".join(_source_card(s, linker, rr2) for s in sources)
+ put(
+ f"by-domain/{_slug(domain)}.html",
+ _page(f"Site {domain}", body, rr2),
+ "text/html",
+ )
+
+ # Landing + readme. rel_root: catalog/ -> ../
+ rr1 = "../"
+ index_body = _index_body(data, by_q, by_b, by_d)
+ put("index.html", _page("Catalog", index_body, rr1), "text/html")
+ put("READ_ME_FIRST.html", _page("Read me first", _readme_body(), rr1), "text/html")
+
+ return CatalogSummary(
+ sources=len(data.sources),
+ captured=sum(s.captured for s in data.sources),
+ questions=len(by_q),
+ bots=len(by_b),
+ domains=len(by_d),
+ excluded=data.excluded,
+ )
+
+
+def _index_body(data, by_q, by_b, by_d) -> str:
+ captured = sum(s.captured for s in data.sources)
+
+ def links(items: dict, view: str) -> str:
+ rows = []
+ for key, sources in sorted(items.items(), key=lambda kv: (-len(kv[1]), kv[0])):
+ rows.append(
+ f""
+ f"{_esc(key)} ({len(sources)})"
+ )
+ return "".join(rows)
+
+ hidden_note = (
+ f" · {data.hidden_total} non-page URLs hidden "
+ f"({', '.join(f'{k} {v}' for k, v in sorted(data.excluded.items()))})"
+ if data.hidden_total
+ else ""
+ )
+ return (
+ f"What is this? →
"
+ f"{len(data.sources)} page sources ({captured} captured) · "
+ f"{len(by_q)} questions · {len(by_b)} bots · {len(by_d)} sites{hidden_note}
"
+ f"By question
The encyclopedia of sources per "
+ f"question — start here.
{links(by_q, 'by-question')}"
+ f"By bot
{links(by_b, 'by-bot')}"
+ f"By site
{links(by_d, 'by-domain')}"
+ )
+
+
+def _readme_body() -> str:
+ return (
+ "What is this bucket?
"
+ "This is a source archive: for every web page a forecasting bot "
+ "cited, we save a snapshot — the page's HTML, a full-page "
+ "screenshot, and a clean markdown copy — so a forecast can be "
+ "audited later even if the original page changes or disappears.
"
+ "How to browse it
"
+ ""
+ "- Open index.html (the catalog home).
"
+ "- By question is the main view: pick a question to see every "
+ "source used for it, who used it, and a screenshot of each.
"
+ "- By bot shows one bot's sources across questions; By site "
+ "groups sources by website.
"
+ "- Each question also has a CSV you can open in a spreadsheet.
"
+ "
"
+ "The folders with long hash names (content/, index/) are "
+ "the machine-readable store — you don't need to open those.
"
+ )
diff --git a/forecasting_tools/agents_and_tools/source_archive/cli.py b/forecasting_tools/agents_and_tools/source_archive/cli.py
index c2eed8db..d5ec2545 100644
--- a/forecasting_tools/agents_and_tools/source_archive/cli.py
+++ b/forecasting_tools/agents_and_tools/source_archive/cli.py
@@ -24,7 +24,6 @@
from forecasting_tools.agents_and_tools.source_archive.fetchers import (
build_default_fetcher,
)
-from forecasting_tools.agents_and_tools.source_archive.pipeline import CapturePipeline
def _load_dotenv() -> None:
@@ -69,6 +68,11 @@ def _cmd_check(config: ArchiveConfig) -> int:
print(f" AWS profile : {config.aws_profile or '(default chain)'}")
print(f" AWS region : {config.aws_region or '(default)'}")
print(f" Firecrawl API key : {_mask(config.firecrawl_api_key)}")
+ print(f" Firecrawl proxy mode : {config.firecrawl_proxy}")
+ print(f" Hyperbrowser API key : {_mask(config.hyperbrowser_api_key)}")
+ print(f" Hyperbrowser proxy : {config.hyperbrowser_use_proxy}")
+ print(f" CloakBrowser module : {config.cloakbrowser_import}")
+ print(f" PDF max pages : {config.pdf_max_pages}")
print(f" TTL (days) : {config.ttl_days}")
print(f" Screenshot format : {config.screenshot_format}")
print(f" Screenshot max height: {config.screenshot_max_height}")
@@ -76,19 +80,64 @@ def _cmd_check(config: ArchiveConfig) -> int:
def _cmd_capture(args, config: ArchiveConfig) -> int:
+ from forecasting_tools.agents_and_tools.source_archive.manifest import unique_urls
+ from forecasting_tools.agents_and_tools.source_archive.pipeline import (
+ capture_urls_concurrent,
+ )
+
records = manifest_io.read_file(args.manifest)
+
+ overrides = {}
+ if getattr(args, "no_hyperbrowser", False):
+ overrides["hyperbrowser_api_key"] = None
+ if getattr(args, "concurrency", None):
+ overrides["concurrency"] = args.concurrency
+ if overrides:
+ config = config.model_copy(update=overrides)
+ if "hyperbrowser_api_key" in overrides:
+ print("Hyperbrowser fallback DISABLED for this run.")
+
store = ContentStore(_make_blob_store(config, args.local, args.bucket), config)
+ urls = list(unique_urls(records))
+ if args.limit:
+ urls = urls[: args.limit]
target = args.local or f"s3://{args.bucket or config.s3_bucket}/{config.s3_prefix}"
- print(f"Capturing {len(records)} citation record(s) -> {target}")
+ print(
+ f"Capturing {len(urls)} URL(s) at concurrency {config.concurrency} -> {target}"
+ )
- with build_default_fetcher(config) as fetcher:
- pipeline = CapturePipeline(fetcher, store)
- summary = pipeline.run_manifest(records)
+ summary = capture_urls_concurrent(urls, store, config, build_default_fetcher)
print(summary)
+ run_id = args.run_id or (records[0].run_id if records else None)
+ if run_id:
+ from forecasting_tools.agents_and_tools.source_archive import reports
+
+ reports.write_run_report(store.blobs, run_id, summary, config)
+ print(f"Wrote run outcomes -> {config.s3_prefix}/reports/{run_id}.json")
+
+ # Failures leave no cache entry, so re-running retries exactly them. Write a
+ # retry manifest (with provenance) so coming back — e.g. with hyperbrowser
+ # re-enabled — is one command over only the sites that still need it.
+ failed = {
+ o.url for o in summary.outcomes if o.status in ("quality_failed", "error")
+ }
+ if failed:
+ from forecasting_tools.agents_and_tools.source_archive.ingest import (
+ dedupe_records,
+ )
+
+ retry_records = dedupe_records(r for r in records if r.url in failed)
+ retry_path = args.retry_out or f"{run_id or 'run'}_needs_retry.jsonl"
+ manifest_io.write_file(retry_path, retry_records)
+ print(
+ f"{len(failed)} URL(s) failed -> retry manifest {retry_path}\n"
+ f" come back later with: source-archive capture {retry_path} "
+ f"--run-id {run_id or ''} (hyperbrowser on by default)"
+ )
+
if args.upload_manifest:
- run_id = args.run_id or (records[0].run_id if records else None)
if not run_id:
sys.exit("--upload-manifest needs --run-id (no run_id found in records)")
manifest_io.write_blob(store.blobs, run_id, records, config)
@@ -96,19 +145,71 @@ def _cmd_capture(args, config: ArchiveConfig) -> int:
return 0
-def _cmd_harvest(args, config: ArchiveConfig) -> int:
+def _cmd_ingest_traces(args, config: ArchiveConfig) -> int:
from forecasting_tools.agents_and_tools.source_archive.ingest import (
- MetaculusCommentHarvester,
+ dedupe_records,
+ harvest_run,
)
- run_id = args.run_id or f"metaculus-comments-{args.project_id}"
- harvester = MetaculusCommentHarvester()
- records = harvester.harvest_project(args.project_id, run_id=run_id)
- print(
- f"Harvested {len(records)} citation record(s) from project "
- f"{args.project_id}"
+ run_id = args.run_id # None -> derived from the run dir name
+ records = harvest_run(args.run_dir, run_id=run_id, bot=args.bot)
+ if args.dedupe:
+ records = dedupe_records(records)
+ run_id = run_id or (records[0].run_id if records else None)
+ print(f"Ingested {len(records)} citation record(s) from traces in {args.run_dir}")
+
+ out_path = args.out or f"{run_id or 'traces'}.jsonl"
+ if not args.upload or args.out:
+ manifest_io.write_file(out_path, records)
+ print(f"Wrote manifest -> {out_path}")
+ if args.upload:
+ if not run_id:
+ sys.exit("--upload needs a run id (pass --run-id; none found in records)")
+ store = _make_blob_store(config, None, args.bucket)
+ manifest_io.write_blob(store, run_id, records, config)
+ print(f"Uploaded manifest -> {config.s3_prefix}/manifests/{run_id}.jsonl")
+ return 0
+
+
+ def _cmd_catalog(args, config: ArchiveConfig) -> int:
+     """Handle the ``catalog`` subcommand: build and write the catalog.
+
+     Uses the blob store chosen by ``--local`` / ``--bucket``, prints the build
+     summary and where to open the result, and returns exit code 0.
+     """
+     from forecasting_tools.agents_and_tools.source_archive.catalog import write_catalog
+
+     store = _make_blob_store(config, args.local, args.bucket)
+     if args.local:
+         target = args.local
+     else:
+         target = f"s3://{args.bucket or config.s3_bucket}/{config.s3_prefix}"
+     print(f"Building catalog from manifests + index -> {target}/catalog/")
+     print(write_catalog(store, config))
+     print(f"Open {config.s3_prefix}/catalog/index.html")
+     return 0
+
+
+def _cmd_harvest_db(args, config: ArchiveConfig) -> int:
+ from forecasting_tools.agents_and_tools.source_archive.ingest import (
+ MetaculusDbHarvester,
+ dedupe_records,
+ resolve_dsn,
)
+ dsn = resolve_dsn(args.dsn)
+ include_private = not args.public_only
+ harvester = MetaculusDbHarvester.from_dsn(dsn)
+ if args.post:
+ records = harvester.harvest_post(
+ args.post, run_id=args.run_id, include_private=include_private
+ )
+ run_id = args.run_id or f"metaculus-db-post-{args.post}"
+ else:
+ records = harvester.harvest_recent(
+ days=args.days,
+ limit=args.limit,
+ run_id=args.run_id,
+ include_private=include_private,
+ )
+ run_id = args.run_id or f"metaculus-db-recent-{args.days}d"
+ if args.dedupe:
+ records = dedupe_records(records)
+ print(f"Harvested {len(records)} citation record(s) from the Metaculus DB")
+
out_path = args.out or f"{run_id}.jsonl"
if not args.upload or args.out:
manifest_io.write_file(out_path, records)
@@ -120,6 +221,35 @@ def _cmd_harvest(args, config: ArchiveConfig) -> int:
return 0
+ def _cmd_coverage(args, config: ArchiveConfig) -> int:
+     """Handle the ``coverage`` subcommand: print coverage report(s).
+
+     Manifests and the index are read once, then one report is produced per
+     requested mode (every entry of ``MODES`` when ``--mode both``). With
+     ``--csv PREFIX`` each report is also written to ``PREFIX_<mode>.csv``,
+     plus ``PREFIX_<mode>_missing.txt`` when the report lists missing URLs.
+     Returns exit code 0.
+     """
+     from pathlib import Path
+
+     from forecasting_tools.agents_and_tools.source_archive import reports
+     from forecasting_tools.agents_and_tools.source_archive.catalog import build_sources
+     from forecasting_tools.agents_and_tools.source_archive.coverage import (
+         MODES,
+         coverage_from_sources,
+     )
+
+     store = _make_blob_store(config, args.local, args.bucket)
+     sources = build_sources(store, config)  # read manifests + index once
+     outcomes = reports.read_outcomes(store, config) or None
+     modes = MODES if args.mode == "both" else (args.mode,)
+     for mode in modes:
+         report = coverage_from_sources(sources, mode, outcomes)
+         print(report)
+         print()
+         if args.csv:
+             # Pin UTF-8: the locale default codec can fail or garble non-ASCII
+             # URLs, and the output should not vary by machine.
+             Path(f"{args.csv}_{mode}.csv").write_text(
+                 report.to_csv(), encoding="utf-8"
+             )
+             print(f"Wrote {args.csv}_{mode}.csv")
+             if report.missing_urls:
+                 Path(f"{args.csv}_{mode}_missing.txt").write_text(
+                     "\n".join(report.missing_urls), encoding="utf-8"
+                 )
+                 print(f"Wrote {args.csv}_{mode}_missing.txt")
+     return 0
+
+
def main(argv: list[str] | None = None) -> int:
_load_dotenv()
parser = argparse.ArgumentParser(
@@ -145,22 +275,110 @@ def main(argv: list[str] | None = None) -> int:
help="also upload the manifest itself to manifests/.jsonl",
)
cap.add_argument("--run-id", help="run id for the uploaded manifest")
+ cap.add_argument(
+ "--no-hyperbrowser",
+ action="store_true",
+ help="disable the Hyperbrowser fallback for this run (others still run)",
+ )
+ cap.add_argument(
+ "--retry-out",
+ metavar="FILE",
+ help="where to write the failed-URL retry manifest "
+ "(default: _needs_retry.jsonl)",
+ )
+ cap.add_argument(
+ "--concurrency",
+ type=int,
+ metavar="N",
+ help="parallel browser workers for this run (overrides WEB_ARCHIVE_CONCURRENCY)",
+ )
+ cap.add_argument(
+ "--limit",
+ type=int,
+ metavar="N",
+ help="capture only the first N URLs (chunk a big manifest; resume via cache)",
+ )
- harv = sub.add_parser(
- "harvest",
- help="harvest cited URLs from bot comments on a Metaculus project",
+ ing = sub.add_parser(
+ "ingest-traces",
+ help="build a manifest from a traced bot run directory (bot_*/q_*/traces_*.jsonl)",
)
- harv.add_argument("project_id", help="Metaculus project / tournament id")
- harv.add_argument(
+ ing.add_argument("run_dir", help="path to a traced run directory")
+ ing.add_argument(
"--out", metavar="FILE", help="write the manifest to this .jsonl file"
)
- harv.add_argument(
- "--run-id", help="run id (default: metaculus-comments-)"
+ ing.add_argument("--run-id", help="run id (default: the run dir's name)")
+ ing.add_argument(
+ "--bot",
+ help="bot name for a flat (no bot_*/) layout (default: the run dir's name)",
+ )
+ ing.add_argument(
+ "--dedupe", action="store_true", help="keep one record per URL (first seen)"
)
- harv.add_argument(
+ ing.add_argument(
"--upload", action="store_true", help="upload the manifest to S3 manifests/"
)
- harv.add_argument("--bucket", help="override the S3 bucket")
+ ing.add_argument("--bucket", help="override the S3 bucket")
+
+ cat = sub.add_parser(
+ "catalog",
+ help="generate a coworker-legible HTML/CSV catalog (by question/bot/site)",
+ )
+ cat.add_argument(
+ "--local", metavar="DIR", help="read/write the catalog in this directory"
+ )
+ cat.add_argument("--bucket", help="override the S3 bucket")
+
+ hdb = sub.add_parser(
+ "harvest-db",
+ help="read a bot's cited URLs from the platform Postgres database (operator)",
+ )
+ grp = hdb.add_mutually_exclusive_group(required=True)
+ grp.add_argument("--post", help="harvest one post id")
+ grp.add_argument("--days", type=int, help="harvest the most recent N days")
+ hdb.add_argument(
+ "--limit",
+ type=int,
+ default=None,
+ help="cap rows when using --days (default: uncapped — a daily sweep wants all)",
+ )
+ hdb.add_argument(
+ "--public-only",
+ action="store_true",
+ help="read only public comments (default: read all of a bot's comments)",
+ )
+ hdb.add_argument(
+ "--dsn",
+ help="libpq DSN or postgresql:// URL. Default resolution: --dsn > "
+ "$METACULUS_DB_DSN > macOS Keychain item 'metaculus-db-dsn' > "
+ "dbname=metaculus. Prefer the Keychain for the real secret "
+ "(a --dsn value lands in shell history).",
+ )
+ hdb.add_argument("--out", metavar="FILE", help="write the manifest to this .jsonl")
+ hdb.add_argument("--run-id", help="run id (default derived from the slice)")
+ hdb.add_argument(
+ "--dedupe", action="store_true", help="keep one record per URL (first seen)"
+ )
+ hdb.add_argument(
+ "--upload", action="store_true", help="upload the manifest to S3 manifests/"
+ )
+ hdb.add_argument("--bucket", help="override the S3 bucket")
+
+ cov = sub.add_parser(
+ "coverage",
+ help="report what %% of cited sources were archived (trace vs comments)",
+ )
+ cov.add_argument(
+ "--mode",
+ choices=["trace", "comments", "both"],
+ default="both",
+ help="which report(s) to print (default: both)",
+ )
+ cov.add_argument(
+ "--csv", metavar="PREFIX", help="write PREFIX_.csv (+ _missing.txt)"
+ )
+ cov.add_argument("--local", metavar="DIR", help="read from this directory")
+ cov.add_argument("--bucket", help="override the S3 bucket")
args = parser.parse_args(argv)
config = ArchiveConfig.from_env()
@@ -169,8 +387,14 @@ def main(argv: list[str] | None = None) -> int:
return _cmd_check(config)
if args.command == "capture":
return _cmd_capture(args, config)
- if args.command == "harvest":
- return _cmd_harvest(args, config)
+ if args.command == "ingest-traces":
+ return _cmd_ingest_traces(args, config)
+ if args.command == "harvest-db":
+ return _cmd_harvest_db(args, config)
+ if args.command == "catalog":
+ return _cmd_catalog(args, config)
+ if args.command == "coverage":
+ return _cmd_coverage(args, config)
return 1
diff --git a/forecasting_tools/agents_and_tools/source_archive/config.py b/forecasting_tools/agents_and_tools/source_archive/config.py
index 2572ffc4..cfb643ef 100644
--- a/forecasting_tools/agents_and_tools/source_archive/config.py
+++ b/forecasting_tools/agents_and_tools/source_archive/config.py
@@ -19,17 +19,41 @@ def _get_int(name: str, default: int) -> int:
return int(raw)
+ def _get_bool(name: str, default: bool) -> bool:
+     """Read environment variable ``name`` as a boolean.
+
+     Unset or empty returns ``default``. Otherwise the value is true only when,
+     after trimming and lower-casing, it is one of ``1``, ``true``, ``yes``,
+     ``on``; anything else is false.
+     """
+     raw = os.environ.get(name)
+     if not raw:
+         return default
+     truthy = ("1", "true", "yes", "on")
+     return raw.strip().lower() in truthy
+
+
class ArchiveConfig(BaseModel):
"""Runtime configuration. Construct directly in tests, or ``from_env()``."""
s3_bucket: str | None = None
s3_prefix: str = "source-archive"
+ # Local archive directory. When set, the viewer reads captures from here
+ # instead of S3 — handy for inspecting a local capture run with no AWS.
+ local_dir: str | None = None
aws_profile: str | None = None
aws_region: str | None = None
firecrawl_api_key: str | None = None
+ # Firecrawl proxy mode for the anti-bot path: "basic" (1 credit) | "auto"
+ # (1 credit, escalates to 5 on fallback) | "stealth"/"enhanced" (5 credits).
+ # Only the fallback Firecrawl tier pays this; basic is the default.
+ firecrawl_proxy: str = "basic"
+ hyperbrowser_api_key: str | None = None
+ # Hyperbrowser session knobs for the anti-bot path. Proxy turns a 1-credit
+ # scrape into a 10-credit one, so leave it on only for the Cloudflare tier.
+ hyperbrowser_use_proxy: bool = True
+ hyperbrowser_use_stealth: bool = True
+ hyperbrowser_solve_captchas: bool = True
+ # CloakBrowser exposes ``cloakbrowser.launch() -> Browser``; the module name
+ # is overridable in case the package is renamed.
+ cloakbrowser_import: str = "cloakbrowser"
+ pdf_max_pages: int = 50 # cap PDF parsing so a huge report can't blow latency/cost
ttl_days: int = 14
screenshot_format: str = "webp" # webp | jpeg | png
- screenshot_max_height: int = 4000 # px; cap full-page captures
+ screenshot_max_height: int = 16_000 # px; safety cap (under WebP's 16383 limit)
nav_timeout_ms: int = 30_000
concurrency: int = 5
@@ -38,13 +62,27 @@ def from_env(cls) -> "ArchiveConfig":
return cls(
s3_bucket=os.environ.get("WEB_ARCHIVE_S3_BUCKET"),
s3_prefix=os.environ.get("WEB_ARCHIVE_S3_PREFIX", "source-archive"),
+ local_dir=os.environ.get("WEB_ARCHIVE_LOCAL_DIR"),
aws_profile=os.environ.get("WEB_ARCHIVE_AWS_PROFILE"),
aws_region=os.environ.get("AWS_REGION")
or os.environ.get("AWS_DEFAULT_REGION"),
firecrawl_api_key=os.environ.get("FIRECRAWL_API_KEY"),
+ firecrawl_proxy=os.environ.get("WEB_ARCHIVE_FIRECRAWL_PROXY", "basic"),
+ hyperbrowser_api_key=os.environ.get("HYPERBROWSER_API_KEY"),
+ hyperbrowser_use_proxy=_get_bool("WEB_ARCHIVE_HYPERBROWSER_PROXY", True),
+ hyperbrowser_use_stealth=_get_bool(
+ "WEB_ARCHIVE_HYPERBROWSER_STEALTH", True
+ ),
+ hyperbrowser_solve_captchas=_get_bool(
+ "WEB_ARCHIVE_HYPERBROWSER_CAPTCHA", True
+ ),
+ cloakbrowser_import=os.environ.get(
+ "WEB_ARCHIVE_CLOAKBROWSER_IMPORT", "cloakbrowser"
+ ),
+ pdf_max_pages=_get_int("WEB_ARCHIVE_PDF_MAX_PAGES", 50),
ttl_days=_get_int("WEB_ARCHIVE_TTL_DAYS", 14),
screenshot_format=os.environ.get("WEB_ARCHIVE_SCREENSHOT_FORMAT", "webp"),
- screenshot_max_height=_get_int("WEB_ARCHIVE_SCREENSHOT_MAX_HEIGHT", 4000),
+ screenshot_max_height=_get_int("WEB_ARCHIVE_SCREENSHOT_MAX_HEIGHT", 16_000),
nav_timeout_ms=_get_int("WEB_ARCHIVE_NAV_TIMEOUT_MS", 30_000),
concurrency=_get_int("WEB_ARCHIVE_CONCURRENCY", 5),
)
diff --git a/forecasting_tools/agents_and_tools/source_archive/content_store.py b/forecasting_tools/agents_and_tools/source_archive/content_store.py
index 7481ab93..800ead70 100644
--- a/forecasting_tools/agents_and_tools/source_archive/content_store.py
+++ b/forecasting_tools/agents_and_tools/source_archive/content_store.py
@@ -12,17 +12,33 @@
already stored, skip the write (dedup identical re-fetches) and just refresh
timestamps.
+**Redirect aliasing.** A capture is keyed by its *final* URL (after redirects),
+so a link shortener (``bit.ly/x``) and the page it resolves to collapse onto one
+capture instead of two. The original cited URL gets a tiny **alias index** that
+points at the final URL's index, and the final URL's index lists its aliases for
+provenance. So ``lookup(bit.ly/x)`` and ``lookup(final)`` both hit the same
+stored page, and we never store the destination twice.
+
+**Cross-URL content dedup.** Different URLs that return byte-identical content
+share the blobs rather than storing them three times each. The first URL to
+store a given content owns the blobs; later URLs get a capture whose blob keys
+point back at them and whose ``content_alias_of`` names the owner. A reverse
+ ``index/by-content/<content_hash>.json`` tracks the owner and every member URL.
+
Object layout (under ``config.s3_prefix``)::
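- index/<url_hash>.json per-URL index + capture history
- content/<url_hash>/<content_hash>.html
- content/<url_hash>/<content_hash>.<img_ext>
- content/<url_hash>/<content_hash>.md
+ index/<final_url_hash>.json canonical: capture history + "aliases"
+ index/<cited_url_hash>.json alias: {"alias_of": <final_url_hash>}
+ index/by-content/<content_hash>.json reverse: owner + member urls
+ content/<url_hash>/<content_hash>.html
+ content/<url_hash>/<content_hash>.<img_ext>
+ content/<url_hash>/<content_hash>.md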
"""
from __future__ import annotations
import json
+import threading
from datetime import datetime, timedelta, timezone
from pydantic import BaseModel
@@ -58,6 +74,11 @@ def __init__(self, blob_store: BlobStore, config: ArchiveConfig | None = None):
self.blobs = blob_store
self.config = config or ArchiveConfig()
self.prefix = self.config.s3_prefix.rstrip("/")
+ # Serializes the shared by-content reverse index across capture threads
+ # (concurrent runs). Per-URL index files are written by a single thread
+ # each, so they don't need it; the by-content index can be contended when
+ # different URLs return identical content.
+ self._content_lock = threading.Lock()
# --- key helpers -------------------------------------------------------
def _index_key(self, uh: str) -> str:
@@ -66,6 +87,9 @@ def _index_key(self, uh: str) -> str:
def _content_key(self, uh: str, ch: str, ext: str) -> str:
return f"{self.prefix}/content/{uh}/{ch}.{ext}"
+ def _content_index_key(self, ch: str) -> str:
+     """Return the blob key of the by-content reverse index for content hash ``ch``."""
+     return f"{self.prefix}/index/by-content/{ch}.json"
+
# --- index io ----------------------------------------------------------
def _read_index(self, uh: str) -> dict | None:
key = self._index_key(uh)
@@ -77,16 +101,61 @@ def _write_index(self, uh: str, index: dict) -> None:
data = json.dumps(index, indent=2, sort_keys=True).encode("utf-8")
self.blobs.put(self._index_key(uh), data, content_type="application/json")
+ def _read_content_index(self, ch: str) -> dict | None:
+     """Load the by-content reverse index for content hash ``ch``.
+
+     Returns ``None`` when the index blob does not exist or cannot be parsed
+     as UTF-8 JSON.
+     """
+     key = self._content_index_key(ch)
+     if self.blobs.exists(key):
+         try:
+             raw = self.blobs.get(key)
+             return json.loads(raw.decode("utf-8"))
+         except (json.JSONDecodeError, UnicodeDecodeError):
+             # A concurrent writer may have left a partial local file mid-write;
+             # treat as absent rather than crash. The locked write path in
+             # _register_content is authoritative.
+             return None
+     return None
+
+ def _register_content(
+     self, ch: str, uh: str, url: str, blob_keys: dict | None
+ ) -> None:
+     """Record that ``uh`` produced content ``ch`` in the reverse index.
+
+     The first URL to store a given content becomes its ``canonical_url_hash``
+     and owns the blob keys; later URLs with identical content are added as
+     ``members`` and reuse those blobs (see :meth:`store`). Locked so concurrent
+     capture threads with identical content don't clobber each other's members.
+
+     ``blob_keys`` is only written when the reverse index is first created; for
+     an existing index the stored ``blob_keys`` are left as they are.
+     """
+     # The read, the membership update and the write all happen under the lock.
+     with self._content_lock:
+         reverse = self._read_content_index(ch)
+         if reverse is None:
+             # No readable reverse index yet: this URL becomes the owner.
+             reverse = {
+                 "content_hash": ch,
+                 "canonical_url_hash": uh,
+                 "blob_keys": blob_keys or {},
+                 "members": [],
+             }
+         members = reverse.setdefault("members", [])
+         # Add each url_hash at most once, so re-registering is a no-op.
+         if not any(m.get("url_hash") == uh for m in members):
+             members.append({"url_hash": uh, "url": url})
+         data = json.dumps(reverse, indent=2, sort_keys=True).encode("utf-8")
+         self.blobs.put(
+             self._content_index_key(ch), data, content_type="application/json"
+         )
+
# --- public api --------------------------------------------------------
def lookup(self, url: str) -> StoredCapture | None:
"""Return the latest stored capture if within the TTL, else ``None``.
- A non-``None`` return means callers can skip fetching this URL.
+ A non-``None`` return means callers can skip fetching this URL. If ``url``
+ is an alias of a previously-redirected target, the alias is followed to
+ the canonical capture.
"""
uh = url_hash(url)
index = self._read_index(uh)
if not index:
return None
+ alias_of = index.get("alias_of")
+ if alias_of: # follow the alias to the canonical (final-URL) index
+ index = self._read_index(alias_of)
+ if not index:
+ return None
latest_ch = index.get("latest_content_hash")
captures = index.get("captures", {})
latest = captures.get(latest_ch)
@@ -100,13 +169,20 @@ def lookup(self, url: str) -> StoredCapture | None:
return StoredCapture.model_validate(latest)
def store(self, result: CaptureResult) -> StoreResult:
- """Persist a capture, deduping by content hash. Always updates the index."""
- uh = url_hash(result.url)
+ """Persist a capture, deduping by content hash. Always updates the index.
+
+ The capture is keyed by the *final* URL (after redirects). If the cited
+ URL differs from the final one, an alias index is written so the cited
+ URL still resolves here, and the cited URL is recorded under the
+ canonical index's ``aliases``.
+ """
+ final_url = result.final_url or result.url
+ uh = url_hash(final_url)
ch = result.content_hash
now = utcnow_iso()
index = self._read_index(uh) or {
- "url": result.url,
+ "url": final_url,
"url_hash": uh,
"first_seen": now,
"captures": {},
@@ -116,33 +192,48 @@ def store(self, result: CaptureResult) -> StoreResult:
created = existing is None
if existing is not None:
- # Identical content already stored — skip blob writes, refresh time.
+ # Identical content already stored for THIS url — skip writes, touch.
existing["last_seen"] = now
stored = StoredCapture.model_validate(existing)
else:
- html_key = screenshot_key = markdown_key = None
- if result.html is not None:
- html_key = self._content_key(uh, ch, "html")
- self.blobs.put(
- html_key, result.html.encode("utf-8"), content_type="text/html"
- )
- if result.markdown is not None:
- markdown_key = self._content_key(uh, ch, "md")
- self.blobs.put(
- markdown_key,
- result.markdown.encode("utf-8"),
- content_type="text/markdown",
- )
- if result.screenshot is not None:
- ext = _IMG_EXT.get(result.screenshot_content_type or "", "png")
- screenshot_key = self._content_key(uh, ch, ext)
- self.blobs.put(
- screenshot_key,
- result.screenshot,
- content_type=result.screenshot_content_type,
- )
+ reverse = self._read_content_index(ch)
+ reuse = bool(
+ reverse and reverse.get("canonical_url_hash") not in (None, uh)
+ )
+ if reuse:
+ # Byte-identical content already stored under a DIFFERENT url —
+ # point at its blobs instead of writing three more (cross-URL
+ # content dedup); each url still keeps its own index history.
+ bk = reverse.get("blob_keys", {})
+ html_key = bk.get("html")
+ markdown_key = bk.get("markdown")
+ screenshot_key = bk.get("screenshot")
+ content_alias_of = reverse["canonical_url_hash"]
+ else:
+ html_key = screenshot_key = markdown_key = None
+ if result.html is not None:
+ html_key = self._content_key(uh, ch, "html")
+ self.blobs.put(
+ html_key, result.html.encode("utf-8"), content_type="text/html"
+ )
+ if result.markdown is not None:
+ markdown_key = self._content_key(uh, ch, "md")
+ self.blobs.put(
+ markdown_key,
+ result.markdown.encode("utf-8"),
+ content_type="text/markdown",
+ )
+ if result.screenshot is not None:
+ ext = _IMG_EXT.get(result.screenshot_content_type or "", "png")
+ screenshot_key = self._content_key(uh, ch, ext)
+ self.blobs.put(
+ screenshot_key,
+ result.screenshot,
+ content_type=result.screenshot_content_type,
+ )
+ content_alias_of = None
stored = StoredCapture(
- url=result.url,
+ url=final_url,
url_hash=uh,
content_hash=ch,
status_code=result.status_code,
@@ -151,12 +242,62 @@ def store(self, result: CaptureResult) -> StoreResult:
html_key=html_key,
screenshot_key=screenshot_key,
markdown_key=markdown_key,
+ content_alias_of=content_alias_of,
first_seen=now,
last_seen=now,
)
captures[ch] = stored.model_dump()
+ self._register_content(
+ ch,
+ uh,
+ final_url,
+ blob_keys=(
+ None
+ if reuse
+ else {
+ "html": html_key,
+ "markdown": markdown_key,
+ "screenshot": screenshot_key,
+ }
+ ),
+ )
index["latest_content_hash"] = ch
index["last_checked"] = now
+
+ # If the cited URL redirected to a different final URL, record the alias.
+ orig_uh = url_hash(result.url)
+ if orig_uh != uh:
+ aliases = index.setdefault("aliases", [])
+ if result.url not in aliases:
+ aliases.append(result.url)
+
self._write_index(uh, index)
+
+ if orig_uh != uh:
+ self._write_alias(orig_uh, result.url, uh, now)
+
return StoreResult(capture=stored, created=created)
+
+ def _write_alias(
+ self, orig_uh: str, orig_url: str, final_uh: str, now: str
+ ) -> None:
+ """Write/refresh a pointer from a cited URL's hash to its final capture.
+
+ Never clobbers a canonical index (one that already holds captures), so a URL
+ fetched directly keeps its history; a refreshed alias keeps its ``first_seen``.
+ """
+ existing = self._read_index(orig_uh)
+ if existing and existing.get("captures"):
+ return  # canonical index with its own captures: leave untouched
+ first_seen = existing.get("first_seen", now) if existing else now
+ self._write_index(
+ orig_uh,
+ {
+ "url": orig_url,
+ "url_hash": orig_uh,
+ "alias_of": final_uh,
+ "first_seen": first_seen,
+ "last_checked": now,
+ },
+ )
diff --git a/forecasting_tools/agents_and_tools/source_archive/coverage.py b/forecasting_tools/agents_and_tools/source_archive/coverage.py
new file mode 100644
index 00000000..9b812e9b
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/coverage.py
@@ -0,0 +1,237 @@
+"""Coverage reports: what fraction of cited sources did we actually archive?
+
+Two **separate** reports, split by ingestion path — they have different
+denominators and different notions of ground truth, so they must not be blurred:
+
+- ``trace`` — the complex/template bot's own instrumented runs (metac-ai-sdk).
+ Traces record *every* URL the bot touched, so this is a true archival
+ success-rate against ground truth.
+- ``comments`` — every bot (Metaculus's own + outside bots) harvested from public
+ Metaculus comments. Comments are length-truncated, so the denominator is itself
+ incomplete: coverage here means "of the links we could *see* in comments, how
+ many we archived" — a weaker claim than the trace report.
+
+For each mode: denominator = distinct canonical **page** sources cited (tool/API
+calls excluded, same rule as the catalog); numerator = those with a successful
+capture in the index. Misses are bucketed by site; when per-run outcomes are
+saved (see ``reports``) they are also split into never-fetched vs. fetched-but-
+failed. The finer per-URL failure *reason* is not broken out in this report.
+"""
+
+from __future__ import annotations
+
+import csv
+import io
+from collections import defaultdict
+
+from pydantic import BaseModel
+
+from forecasting_tools.agents_and_tools.source_archive.catalog import (
+ Source,
+ build_sources,
+ exclusion_reason,
+)
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
+ BlobStore,
+)
+
+MODES = ("trace", "comments")
+_COMMENT_ORIGINS = {"metaculus_comment"}
+
+
+def citation_mode(citation) -> str:
+ return "comments" if (citation.origin or "") in _COMMENT_ORIGINS else "trace"  # any other (or missing) origin counts as trace
+
+
+class CoverageRow(BaseModel):
+ label: str
+ cited: int = 0
+ captured: int = 0
+
+ @property
+ def pct(self) -> float:
+ return round(100 * self.captured / self.cited, 1) if self.cited else 0.0
+
+
+class CoverageReport(BaseModel):
+ mode: str
+ cited: int = 0
+ captured: int = 0
+ excluded: dict[str, int] = {} # non-source reason -> count
+ by_question: list[CoverageRow] = []
+ by_bot: list[CoverageRow] = []
+ by_tool: list[CoverageRow] = []
+ missed_by_domain: list[CoverageRow] = []
+ missing_urls: list[str] = []
+ # Populated only when per-run outcomes (reports/) exist:
+ has_outcomes: bool = False
+ missing_never_fetched: int = 0 # the real collection gap
+ missing_fetch_failed: int = 0 # tried, failed (Cloudflare/PDF/…)
+
+ @property
+ def pct(self) -> float:
+ return round(100 * self.captured / self.cited, 1) if self.cited else 0.0
+
+ @property
+ def missing(self) -> int:
+ return self.cited - self.captured
+
+ def __str__(self) -> str:
+ title = {
+ "trace": "Trace coverage — complex/template bot (ground truth)",
+ "comments": "Comment coverage — all bots (truncated denominator)",
+ }.get(self.mode, self.mode)
+ excl = (
+ " (excluded as non-sources: "
+ + ", ".join(f"{k} {v}" for k, v in sorted(self.excluded.items()))
+ + ")"
+ if self.excluded
+ else ""
+ )
+ lines = [
+ title,
+ "=" * len(title),
+ # Lead with the collection gap: this report exists to tell us whether
+ # there are sources bots are using that we are NOT yet archiving.
+ f"{self.missing} of {self.cited} cited page sources are NOT yet in the "
+ f"archive — candidates to collect ({self.captured} archived, "
+ f"{self.pct}%).",
+ excl,
+ ]
+ if self.has_outcomes:
+ lines.append(
+ f" of those {self.missing}: {self.missing_never_fetched} were "
+ f"never fetched (collection gap), {self.missing_fetch_failed} "
+ f"were fetched but failed."
+ )
+ if self.mode == "comments":
+ lines.append(
+ " note: comments are length-truncated, so even this denominator "
+ "under-counts what bots actually read — the true gap is larger."
+ )
+
+ def table(header: str, rows: list[CoverageRow], n: int = 8) -> None:
+ if not rows:
+ return
+ lines.append("")
+ lines.append(f"--- {header} ---")
+ for r in rows[:n]:
+ lines.append(f" {r.captured:>4}/{r.cited:<4} {r.pct:>5}% {r.label}")
+ if len(rows) > n:
+ lines.append(f" … +{len(rows) - n} more")
+
+ table("by question (most-cited first)", self.by_question)
+ table("by bot", self.by_bot)
+ if self.mode == "trace":
+ table("by tool", self.by_tool)
+ table("biggest collection gaps by site (captured/cited)", self.missed_by_domain)
+ if self.missing_urls:
+ lines.append("")
+ lines.append(
+ f"--- {len(self.missing_urls)} source(s) to collect (first 10) ---"
+ )
+ for u in self.missing_urls[:10]:
+ lines.append(f" {u}")
+ return "\n".join(lines)
+
+ def to_csv(self) -> str:
+ buf = io.StringIO()
+ w = csv.writer(buf)
+ w.writerow(["group", "label", "cited", "captured", "pct"])
+ w.writerow(["overall", self.mode, self.cited, self.captured, self.pct])
+ for group, rows in (
+ ("question", self.by_question),
+ ("bot", self.by_bot),
+ ("tool", self.by_tool),
+ ("missed_domain", self.missed_by_domain),
+ ):
+ for r in rows:
+ w.writerow([group, r.label, r.cited, r.captured, r.pct])
+ return buf.getvalue()
+
+
+def _grouped(scoped: list[tuple[Source, list]], key_of) -> list[CoverageRow]:
+ agg: dict[str, list[int]] = defaultdict(lambda: [0, 0])  # key -> [cited, captured]
+ for source, cits in scoped:
+ keys = {k for k in (key_of(c) for c in cits) if k} or {"(none)"}  # distinct truthy keys; a source with none falls under "(none)"
+ for k in keys:
+ agg[k][0] += 1  # a source counts once per distinct key, however many citations share it
+ if source.captured:
+ agg[k][1] += 1
+ rows = [CoverageRow(label=k, cited=v[0], captured=v[1]) for k, v in agg.items()]
+ return sorted(rows, key=lambda r: (-r.cited, r.label))  # most-cited first, ties broken by label
+
+
+def coverage_from_sources(
+ sources: list[Source], mode: str, outcomes: dict[str, str] | None = None
+) -> CoverageReport:
+ scoped: list[tuple[Source, list]] = []  # (source, its citations in this mode)
+ excluded: dict[str, int] = defaultdict(int)
+ for s in sources:
+ cits = [c for c in s.citations if citation_mode(c) == mode]
+ if not cits:
+ continue  # source has no citation in this mode
+ reason = exclusion_reason(s.canonical_url, cits)
+ if reason:
+ excluded[reason] += 1  # tallied per reason and kept out of the denominator
+ continue
+ scoped.append((s, cits))
+
+ captured = sum(1 for s, _ in scoped if s.captured)
+
+ never_fetched = failed = 0
+ if outcomes is not None:  # split the misses only when outcomes were supplied
+ from forecasting_tools.agents_and_tools.source_archive.reports import (
+ FAILED_STATUSES,
+ )
+
+ for s, _ in scoped:
+ if s.captured:
+ continue
+ status = outcomes.get(s.canonical_url)  # outcomes are looked up by canonical URL
+ if status is None:
+ never_fetched += 1
+ elif status in FAILED_STATUSES:
+ failed += 1
+ else:  # NOTE(review): same increment as the branch above — confirm a non-failed status without a capture should count as failed
+ failed += 1
+
+ domain_agg: dict[str, list[int]] = defaultdict(lambda: [0, 0])  # domain -> [cited, captured]
+ for s, _ in scoped:
+ domain_agg[s.domain][0] += 1
+ if s.captured:
+ domain_agg[s.domain][1] += 1
+ missed_by_domain = sorted(
+ (
+ CoverageRow(label=d, cited=c, captured=cap)
+ for d, (c, cap) in domain_agg.items()
+ if cap < c  # keep only domains with at least one miss
+ ),
+ key=lambda r: (-(r.cited - r.captured), r.label),  # largest gap first, ties broken by label
+ )
+
+ return CoverageReport(
+ mode=mode,
+ cited=len(scoped),
+ captured=captured,
+ excluded=dict(excluded),
+ by_question=_grouped(scoped, lambda c: c.question_id),
+ by_bot=_grouped(scoped, lambda c: c.bot),
+ by_tool=_grouped(scoped, lambda c: c.tool_name),
+ missed_by_domain=missed_by_domain,
+ missing_urls=sorted(s.canonical_url for s, _ in scoped if not s.captured),
+ has_outcomes=outcomes is not None,
+ missing_never_fetched=never_fetched,
+ missing_fetch_failed=failed,
+ )
+
+
+def build_coverage(
+ store: BlobStore, config: ArchiveConfig, mode: str
+) -> CoverageReport:
+ from forecasting_tools.agents_and_tools.source_archive.reports import read_outcomes
+
+ sources = build_sources(store, config)
+ outcomes = read_outcomes(store, config) or None  # a falsy (e.g. empty) result becomes None, so the report omits the never-fetched/failed split
+ return coverage_from_sources(sources, mode, outcomes)
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py
index 758aa87e..b136ea66 100644
--- a/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py
@@ -1,19 +1,32 @@
"""Fetchers turn a URL into a CaptureResult (HTML + screenshot + markdown).
Most callers want :func:`build_default_fetcher`, which wires the recommended
-tiered setup: self-hosted Playwright primary, Firecrawl fallback.
+cost-ordered tiered setup: one self-hosted browser as primary (CloakBrowser when
+installed, else Playwright), then PDF, Hyperbrowser, and Firecrawl backups.
"""
from __future__ import annotations
+import logging
+
from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
from forecasting_tools.agents_and_tools.source_archive.fetchers.base import (
Fetcher,
FetchError,
)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.cloakbrowser_fetcher import (
+ CloakBrowserFetcher,
+)
from forecasting_tools.agents_and_tools.source_archive.fetchers.firecrawl_fetcher import (
FirecrawlFetcher,
)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.hyperbrowser_fetcher import (
+ HyperbrowserFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.pdf_fetcher import (
+ PdfFetcher,
+ looks_like_pdf,
+)
from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import (
PlaywrightFetcher,
)
@@ -21,12 +34,18 @@
TieredFetcher,
)
+logger = logging.getLogger(__name__)
+
__all__ = [
"Fetcher",
"FetchError",
+ "CloakBrowserFetcher",
"FirecrawlFetcher",
+ "HyperbrowserFetcher",
+ "PdfFetcher",
"PlaywrightFetcher",
"TieredFetcher",
+ "looks_like_pdf",
"build_default_fetcher",
]
@@ -39,29 +58,64 @@ def build_default_fetcher(config: ArchiveConfig | None = None) -> PlaywrightFetc
with build_default_fetcher(config) as fetcher:
fetcher.fetch(url)
- Playwright runs first; if a page fails to render or trips the quality gate
- and a Firecrawl API key is configured, Firecrawl is tried as a fallback.
-
- The returned object is a :class:`PlaywrightFetcher` so the browser lifecycle
- is managed by ``with``. On ``__enter__`` it transparently composes itself
- with Firecrawl (when available) behind a :class:`TieredFetcher`.
+ Backends are tried in **cost order** — the first capture that passes the
+ quality gate wins, so the cheap tiers absorb most of the tail and the paid
+ ones only fire on what survives them:
+
+ 1. **Self-hosted browser** (~free) — the primary; ~70% of URLs. Uses
+ **CloakBrowser** (patched Chromium; matches-or-beats vanilla Playwright on
+ anti-bot) when installed, else falls back to vanilla **Playwright**. Only
+ one browser is used: two live ``sync_playwright`` instances conflict in a
+ single process, so cloak *replaces* vanilla rather than stacking with it.
+ 2. **PdfFetcher** (local, free; Firecrawl OCR fallback) — captures PDFs,
+ which the browsers can't render.
+ 3. **Hyperbrowser** (managed) — consolidated anti-bot fallback. Added when
+ ``HYPERBROWSER_API_KEY`` is set.
+ 4. **Firecrawl** (managed) — cheapest stealth + native-PDF safety net. Added
+ when ``FIRECRAWL_API_KEY`` is set.
+
+ The returned object is a :class:`PlaywrightFetcher` subclass so the single
+ browser's lifecycle is managed by ``with``.
"""
config = config or ArchiveConfig()
return _ManagedTieredFetcher(config)
class _ManagedTieredFetcher(PlaywrightFetcher):
- """PlaywrightFetcher whose ``fetch`` is delegated to a tiered pipeline.
-
- Subclassing PlaywrightFetcher keeps the browser context-manager lifecycle
- while letting us add the Firecrawl fallback once the browser is live.
+ """PlaywrightFetcher whose ``fetch`` is delegated to a cost-ordered tiered
+ pipeline. The single self-hosted browser is CloakBrowser when available
+ (overriding ``_launch_browser``), else vanilla Playwright; the extra backends
+ are composed once it is live.
"""
+ _primary_name = "playwright"
+
+ def _launch_browser(self):
+ # Prefer CloakBrowser (patched Chromium) as the one self-hosted browser;
+ # two live sync_playwright instances in a process conflict, so it REPLACES
+ # vanilla rather than stacking. Any launch failure falls back to vanilla.
+ try:
+ browser = CloakBrowserFetcher(self.config)._launch_browser()
+ self._primary_name = "cloakbrowser"
+ return browser
+ except Exception as e:  # FetchError (not installed / no launch()) or an error raised by launch() itself
+ logger.info("cloakbrowser unavailable, using vanilla Playwright: %s", e)
+ self._primary_name = "playwright"
+ return super()._launch_browser()
+
def __enter__(self) -> "_ManagedTieredFetcher":
- super().__enter__()
- backends: list[Fetcher] = [_PlaywrightOnly(self)]
+ super().__enter__() # launches the chosen browser via _launch_browser
+ backends: list[Fetcher] = [_PrimaryBrowser(self, self._primary_name)]
+
+ # PDFs: free local parse (Firecrawl OCR fallback wired internally when a
+ # key is present). Cheap to keep in the chain unconditionally.
+ backends.append(PdfFetcher(self.config))
+
+ if self.config.hyperbrowser_api_key:
+ backends.append(HyperbrowserFetcher(self.config))
if self.config.firecrawl_api_key:
backends.append(FirecrawlFetcher(self.config))
+
self._tiered = TieredFetcher(*backends)
return self
@@ -69,14 +123,16 @@ def fetch(self, url: str): # type: ignore[override]
return self._tiered.fetch(url)
-class _PlaywrightOnly:
- """Adapts a live PlaywrightFetcher to the Fetcher protocol for tiering,
- calling the un-overridden ``fetch`` so we don't recurse."""
-
- name = "playwright"
+class _PrimaryBrowser:
+ """Adapts the live primary browser to the Fetcher protocol for tiering,
+ calling the un-overridden ``fetch`` so we don't recurse, and labelling the
+ capture with the actual browser used (cloakbrowser/playwright)."""
- def __init__(self, owner: PlaywrightFetcher):
+ def __init__(self, owner: PlaywrightFetcher, name: str):
self._owner = owner
+ self.name = name
def fetch(self, url: str):
- return PlaywrightFetcher.fetch(self._owner, url)
+ result = PlaywrightFetcher.fetch(self._owner, url)
+ result.fetcher = self.name
+ return result
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/cloakbrowser_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/cloakbrowser_fetcher.py
new file mode 100644
index 00000000..d4164e70
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/cloakbrowser_fetcher.py
@@ -0,0 +1,62 @@
+"""CloakBrowser fetcher — a self-hosted anti-bot upgrade to Playwright.
+
+CloakBrowser (``CloakHQ/CloakBrowser``) is an open-source, patched-Chromium fork
+whose ``cloakbrowser.launch()`` returns a standard Playwright ``Browser`` — so
+this fetcher reuses *all* of ``PlaywrightFetcher``'s capture logic (settle,
+autoscroll, full-page screenshot, trafilatura→markdown) and only overrides how
+the browser is launched. The fork applies source-level fingerprint patches that
+get past Cloudflare Turnstile and similar challenges that plain headless Chromium
+trips; in the one rigorous 2026 anti-detect benchmark it cleared more Cloudflare
+targets than vanilla Playwright.
+
+It runs on your own compute, so the marginal service cost is ~$0/page. The
+patched Chromium binary (~200MB) is downloaded automatically on first launch.
+
+Install separately (it is not in the ``source-archive`` extra because of the
+binary): ``pip install cloakbrowser``. The package module is configurable via
+``config.cloakbrowser_import`` (default ``cloakbrowser``) in case it is renamed.
+"""
+
+from __future__ import annotations
+
+import importlib
+import logging
+
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError
+from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import (
+ PlaywrightFetcher,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class CloakBrowserFetcher(PlaywrightFetcher):
+ name = "cloakbrowser"
+
+ def _launch_browser(self):
+ module = self._import_module()
+ launch = getattr(module, "launch", None)
+ if launch is None:
+ raise FetchError(
+ f"{module.__name__} has no launch(); the CloakBrowser API may "
+ "have changed. Expected `cloakbrowser.launch() -> Browser`."
+ )
+ # stealth_args=True applies the fingerprint patches; the returned object
+ # is a Playwright Browser, so the inherited fetch() drives it unchanged.
+ # No separate playwright handle to stop — CloakBrowser owns its driver.
+ browser = launch(headless=True, stealth_args=True)  # NOTE(review): errors raised by launch() propagate as-is, not as FetchError
+ return None, browser
+
+ def _import_module(self):
+ candidates = [self.config.cloakbrowser_import, "cloakbrowser"]  # configured name first, default name as fallback
+ tried: list[str] = []
+ for mod_name in dict.fromkeys(c for c in candidates if c):  # drop empty names, de-duplicate, keep order
+ try:
+ return importlib.import_module(mod_name)
+ except ImportError:
+ tried.append(mod_name)
+ raise FetchError(
+ "cloakbrowser is not installed. Install it with "
+ "`pip install cloakbrowser` (or set WEB_ARCHIVE_CLOAKBROWSER_IMPORT "
+ f"to the right module). Tried: {', '.join(tried)}."
+ )
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py
index 22aa1a55..622d51ff 100644
--- a/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py
@@ -1,8 +1,14 @@
-"""Firecrawl fetcher — the FALLBACK backend.
+"""Firecrawl fetcher — a managed FALLBACK backend.
-Reserved for sites that block headless Chromium. It costs ~1 credit/page even
-with a screenshot, so it only runs when the primary backend fails or its capture
-fails the quality gate.
+Reserved for sites that block headless Chromium. A basic scrape costs 1 credit
+per page even with a screenshot, so it only runs when the earlier backends fail
+or their captures fail the quality gate.
+
+For hardened anti-bot sites, set ``config.firecrawl_proxy`` to ``"auto"`` or
+``"stealth"`` (a.k.a. "enhanced") — this routes through residential proxies and
+is billed at ~5 credits/page, so it is opt-in and reserved for the Cloudflare
+tier. Firecrawl also natively parses PDFs to markdown (1 credit per PDF page),
+which is why it is the fallback for the tiered ``PdfFetcher``.
The Firecrawl SDK is optional and imported lazily. The screenshot comes back as
a hosted URL, which we download to bytes.
@@ -50,10 +56,24 @@ def _get_client(self):
self._client = Firecrawl(api_key=self.config.firecrawl_api_key)
return self._client
+ def _scrape_kwargs(self, formats: list[str]) -> dict:
+ kwargs: dict = {"formats": formats}
+ # Firecrawl 4.x renamed "stealth" to the "enhanced" proxy mode but still
+ # accepts the legacy spelling; pass whatever the operator configured and
+ # let the SDK normalize. "basic" is the implicit default, so only send
+ # the param when something stronger is requested (keeps the call 1-credit
+ # unless the operator explicitly opts into the 5-credit proxy).
+ proxy = (self.config.firecrawl_proxy or "basic").strip().lower()
+ if proxy and proxy != "basic":
+ kwargs["proxy"] = proxy
+ return kwargs
+
def fetch(self, url: str) -> CaptureResult:
client = self._get_client()
try:
- doc = client.scrape(url, formats=["markdown", "html", "screenshot"])
+ doc = client.scrape(
+ url, **self._scrape_kwargs(["markdown", "html", "screenshot"])
+ )
except Exception as e:
raise FetchError(f"firecrawl scrape failed for {url}: {e}") from e
@@ -75,9 +95,23 @@ def fetch(self, url: str) -> CaptureResult:
screenshot=screenshot,
screenshot_content_type=content_type,
fetcher=self.name,
- metadata={"title": _attr(metadata, "title")},
+ metadata={
+ "title": _attr(metadata, "title"),
+ "firecrawl_proxy": self.config.firecrawl_proxy,
+ },
)
+ def fetch_pdf_markdown(self, url: str) -> str | None:
+ """Scrape just the markdown for a PDF URL via Firecrawl's native PDF
+ parser. Used as the fallback inside :class:`PdfFetcher` when local
+ extraction yields thin text (e.g. a scanned PDF needing OCR)."""
+ client = self._get_client()
+ try:
+ doc = client.scrape(url, **self._scrape_kwargs(["markdown"]))
+ except Exception as e:
+ raise FetchError(f"firecrawl pdf scrape failed for {url}: {e}") from e
+ return _attr(doc, "markdown")
+
@staticmethod
def _download(src_url: str) -> tuple[bytes | None, str | None]:
try:
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/hyperbrowser_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/hyperbrowser_fetcher.py
new file mode 100644
index 00000000..ce728abd
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/hyperbrowser_fetcher.py
@@ -0,0 +1,149 @@
+"""Hyperbrowser fetcher — a managed FALLBACK backend.
+
+Hyperbrowser exposes a Firecrawl-style ``scrape`` endpoint that returns
+markdown + HTML + a screenshot in one call, with optional stealth, residential
+proxy, and CAPTCHA-solving session options for getting past Cloudflare and other
+anti-bot filters.
+
+Why it's here even though Firecrawl already is: forecasting-tools already uses
+Hyperbrowser elsewhere (``research/computer_use.py``), so routing the anti-bot
+tail through it consolidates spend onto one vendor/bill.
+
+Cost note: a basic scrape is 1 credit ($0.001); enabling ``use_proxy`` makes it
+10 credits ($0.01) plus proxy bandwidth ($10/GB). So the proxy/stealth session
+is opt-in and meant for the small hardened-Cloudflare residual, not every URL.
+Hyperbrowser has no documented PDF→markdown path, so PDFs go to the dedicated
+``PdfFetcher`` instead of here.
+
+The SDK is optional and imported lazily; a screenshot may come back as a hosted
+URL (downloaded to bytes) or inline base64.
+"""
+
+from __future__ import annotations
+
+import base64
+import binascii
+import logging
+import urllib.request
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+
+logger = logging.getLogger(__name__)
+
+
+def _attr(obj, key, default=None):
+ if obj is None:
+ return default
+ if isinstance(obj, dict):
+ return obj.get(key, default)
+ return getattr(obj, key, default)
+
+
+class HyperbrowserFetcher:
+ name = "hyperbrowser"
+
+ def __init__(self, config: ArchiveConfig | None = None, client=None):
+ self.config = config or ArchiveConfig()
+ self._client = client
+
+ def _get_client(self):
+ if self._client is not None:
+ return self._client
+ if not self.config.hyperbrowser_api_key:
+ raise FetchError("HYPERBROWSER_API_KEY is not set")
+ try:
+ from hyperbrowser import Hyperbrowser
+ except ImportError as e:
+ raise FetchError(
+ "hyperbrowser is not installed. Install it with "
+ "`pip install forecasting-tools[source-archive]`."
+ ) from e
+ self._client = Hyperbrowser(api_key=self.config.hyperbrowser_api_key)
+ return self._client
+
+ def _params(self, url: str):
+ """Build the SDK request objects. Imported here (not at module top) so
+ importing this module never requires the SDK."""
+ from hyperbrowser.models import (
+ CreateSessionParams,
+ ScrapeOptions,
+ StartScrapeJobParams,
+ )
+
+ return StartScrapeJobParams(
+ url=url,
+ scrape_options=ScrapeOptions(
+ formats=["markdown", "html", "screenshot"],
+ only_main_content=False,
+ ),
+ session_options=CreateSessionParams(
+ use_proxy=self.config.hyperbrowser_use_proxy,
+ use_stealth=self.config.hyperbrowser_use_stealth,
+ solve_captchas=self.config.hyperbrowser_solve_captchas,
+ ),
+ )
+
+ def fetch(self, url: str) -> CaptureResult:
+ client = self._get_client()
+ try:
+ resp = client.scrape.start_and_wait(self._params(url))
+ except Exception as e:
+ raise FetchError(f"hyperbrowser scrape failed for {url}: {e}") from e
+
+ # The job wrapper carries status/error; the payload is on ``.data``.
+ if _attr(resp, "status") == "failed":
+ raise FetchError(
+ f"hyperbrowser scrape failed for {url}: {_attr(resp, 'error')}"
+ )
+ data = _attr(resp, "data", resp)
+
+ metadata = _attr(data, "metadata", {}) or {}
+ status = _attr(metadata, "statusCode") or _attr(metadata, "status_code")
+ final_url = _attr(metadata, "sourceURL") or _attr(metadata, "url") or url
+
+ screenshot, content_type = self._coerce_screenshot(_attr(data, "screenshot"))
+
+ return CaptureResult(
+ url=url,
+ final_url=final_url,
+ status_code=int(status) if status is not None else None,
+ html=_attr(data, "html"),
+ markdown=_attr(data, "markdown"),
+ screenshot=screenshot,
+ screenshot_content_type=content_type,
+ fetcher=self.name,
+ metadata={
+ "title": _attr(metadata, "title"),
+ "used_proxy": self.config.hyperbrowser_use_proxy,
+ },
+ )
+
+ @classmethod
+ def _coerce_screenshot(cls, value) -> tuple[bytes | None, str | None]:
+ """A screenshot may arrive as a hosted URL, a data: URI, or raw base64."""
+ if not value or not isinstance(value, str):
+ return None, None
+ if value.startswith("http://") or value.startswith("https://"):
+ return cls._download(value)
+ if value.startswith("data:"):
+ try:
+ header, b64 = value.split(",", 1)
+ ctype = header[5:].split(";", 1)[0] or "image/png"
+ return base64.b64decode(b64), ctype
+ except (ValueError, binascii.Error):
+ return None, None
+ try:
+ return base64.b64decode(value, validate=True), "image/png"
+ except (binascii.Error, ValueError):
+ return None, None
+
+ @staticmethod
+ def _download(src_url: str) -> tuple[bytes | None, str | None]:
+ try:
+ with urllib.request.urlopen(src_url, timeout=30) as resp:
+ return resp.read(), resp.headers.get("Content-Type", "image/png")
+ except Exception as e:
+ logger.warning("failed to download hyperbrowser screenshot: %s", e)
+ return None, None
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/pdf_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/pdf_fetcher.py
new file mode 100644
index 00000000..0977605c
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/pdf_fetcher.py
@@ -0,0 +1,146 @@
+"""PDF fetcher — closes the gap Playwright can't.
+
+Headless Chromium *downloads* a PDF instead of rendering it (``page.goto`` raises
+"Download is starting"), and trafilatura doesn't parse PDFs, so a cited ``.pdf``
+URL produces nothing today. This fetcher fills that hole with a two-tier strategy:
+
+ 1. Download the PDF bytes and parse locally with **PyMuPDF4LLM** — free, fast,
+ CPU-only, and excellent on text-layer PDFs (most government/legal reports).
+ The first page is rendered to an image so the viewer still has a screenshot.
+ 2. If local extraction yields thin text (a scanned PDF that needs OCR), fall
+ back to **Firecrawl's** native PDF parser (~1 credit/page, OCR included).
+
+Both parsers are optional and imported lazily. Use :func:`looks_like_pdf` /
+:meth:`PdfFetcher.is_pdf` to decide whether a URL should be routed here.
+"""
+
+from __future__ import annotations
+
+import logging
+import urllib.request
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError
+from forecasting_tools.agents_and_tools.source_archive.fetchers.firecrawl_fetcher import (
+ FirecrawlFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+from forecasting_tools.agents_and_tools.source_archive.quality import MIN_TEXT_LEN
+
+logger = logging.getLogger(__name__)
+
+_PDF_MAGIC = b"%PDF-"
+
+
+def looks_like_pdf(url: str) -> bool:
+    """Return True when ``url`` ends in ``.pdf`` once its query string and
+    fragment are removed (case-insensitive).
+
+    This is only a URL-shape heuristic; it never looks at the content.
+    """
+    without_query = url.partition("?")[0]
+    bare = without_query.partition("#")[0]
+    return bare.lower().endswith(".pdf")
+
+
+class PdfFetcher:
+    """Capture a PDF URL as a :class:`CaptureResult`.
+
+    The bytes are downloaded, checked for the ``%PDF-`` magic, and parsed
+    locally with PyMuPDF4LLM. When the local markdown is shorter than
+    ``MIN_TEXT_LEN`` and a Firecrawl client is available, Firecrawl's PDF
+    parser is tried and its markdown is used if it clears the same threshold.
+    """
+
+    name = "pdf"
+
+    def __init__(
+        self,
+        config: ArchiveConfig | None = None,
+        *,
+        firecrawl: FirecrawlFetcher | None = None,
+        downloader=None,
+    ):
+        """Build a fetcher.
+
+        Args:
+            config: Archive settings; a default ``ArchiveConfig()`` when omitted.
+            firecrawl: Explicit Firecrawl client for the OCR fallback. When
+                omitted, one is built from ``config`` if it carries a Firecrawl
+                API key; otherwise the fallback is disabled.
+            downloader: Callable ``(url, timeout_ms) -> (bytes, final_url,
+                status)``; defaults to the module's ``_download_bytes``.
+        """
+        self.config = config or ArchiveConfig()
+        # Reuse the configured Firecrawl client for the OCR fallback when a key
+        # is present; otherwise the fallback is simply skipped.
+        if firecrawl is not None:
+            self._firecrawl = firecrawl
+        elif self.config.firecrawl_api_key:
+            self._firecrawl = FirecrawlFetcher(self.config)
+        else:
+            self._firecrawl = None
+        self._download = downloader or _download_bytes
+
+    def is_pdf(self, url: str, data: bytes | None = None) -> bool:
+        """Decide whether this is a PDF: sniff ``data`` for the ``%PDF-`` magic
+        when bytes are supplied, otherwise fall back to the URL-shape check."""
+        if data is not None:
+            return data[:5] == _PDF_MAGIC
+        return looks_like_pdf(url)
+
+    def fetch(self, url: str) -> CaptureResult:
+        """Download and parse the PDF at ``url``.
+
+        Raises:
+            FetchError: if the downloaded bytes do not start with ``%PDF-``, or
+                the local parser cannot open the document. Whatever the
+                downloader raises also propagates.
+        """
+        data, final_url, status = self._download(url, self.config.nav_timeout_ms)
+        if not data or data[:5] != _PDF_MAGIC:
+            raise FetchError(f"not a PDF (no %PDF- magic) for {url}")
+
+        markdown, screenshot, ctype, pages, engine = self._parse_local(data)
+
+        # "Thin" local text (missing or below MIN_TEXT_LEN) triggers the
+        # Firecrawl attempt; its output replaces the local markdown only when it
+        # clears the same threshold.
+        thin = not markdown or len(markdown.strip()) < MIN_TEXT_LEN
+        if thin and self._firecrawl is not None:
+            logger.info("local PDF parse thin for %s; trying Firecrawl OCR", url)
+            try:
+                fc_md = self._firecrawl.fetch_pdf_markdown(url)
+            except FetchError as e:
+                logger.info("firecrawl PDF fallback failed for %s: %s", url, e)
+            else:
+                if fc_md and len(fc_md.strip()) >= MIN_TEXT_LEN:
+                    markdown, engine = fc_md, "firecrawl"
+
+        return CaptureResult(
+            url=url,
+            final_url=final_url or url,
+            status_code=status,
+            html=None,
+            markdown=markdown,
+            screenshot=screenshot,
+            screenshot_content_type=ctype,
+            fetcher=self.name,
+            metadata={"pdf_engine": engine, "pdf_pages": pages},
+        )
+
+    def _parse_local(
+        self, data: bytes
+    ) -> tuple[str | None, bytes | None, str | None, int, str]:
+        """Return (markdown, screenshot_png, content_type, pages, engine).
+
+        ``pages`` is the document's total page count, not the number of pages
+        converted. When the parsers are not installed the result is
+        ``(None, None, None, 0, "none")``.
+
+        Raises:
+            FetchError: if PyMuPDF cannot open the bytes as a PDF.
+        """
+        try:
+            import pymupdf  # PyMuPDF (a.k.a. fitz)
+            import pymupdf4llm
+        except ImportError:
+            logger.warning(
+                "pymupdf4llm not installed; local PDF parsing unavailable. "
+                "Install with `pip install forecasting-tools[source-archive]`."
+            )
+            return None, None, None, 0, "none"
+
+        try:
+            doc = pymupdf.open(stream=data, filetype="pdf")
+        except Exception as e:
+            raise FetchError(f"could not open PDF: {e}") from e
+
+        try:
+            total = doc.page_count
+            cap = self.config.pdf_max_pages
+            # A non-positive (or unset) cap means "no cap". A negative value must
+            # not reach range(), where it would select zero pages.
+            limit = min(total, cap) if cap and cap > 0 else total
+            try:
+                markdown = pymupdf4llm.to_markdown(doc, pages=list(range(limit)))
+            except Exception as e:
+                # Treat an extraction crash like the "parser not installed"
+                # case: report no text so the caller's thin-text fallback can
+                # still run, instead of leaking a non-FetchError exception.
+                logger.warning("local PDF text extraction failed: %s", e)
+                markdown = None
+            screenshot, ctype = self._render_first_page(doc)
+            return markdown or None, screenshot, ctype, total, "pymupdf4llm"
+        finally:
+            doc.close()
+
+    @staticmethod
+    def _render_first_page(doc) -> tuple[bytes | None, str | None]:
+        """Render page 0 to PNG at 110 dpi; ``(None, None)`` if that fails."""
+        try:
+            pix = doc[0].get_pixmap(dpi=110)
+            return pix.tobytes("png"), "image/png"
+        except Exception as e:
+            logger.info("could not render PDF first page: %s", e)
+            return None, None
+
+
+def _download_bytes(
+    url: str, timeout_ms: int
+) -> tuple[bytes | None, str | None, int | None]:
+    """Download ``url`` and return ``(body, final_url, status)``.
+
+    ``final_url`` is the URL after redirects. ``status`` is the response's
+    ``status`` attribute, or 200 when the response object has none. The timeout
+    is ``timeout_ms`` converted to seconds, with a floor of one second.
+
+    Raises:
+        FetchError: if ``url`` is not an http(s) URL, or the request fails.
+    """
+    # urllib's default opener also serves file:// and ftp:// URLs. The URLs
+    # handled here are ones a bot cited, so only web schemes are fetched.
+    if not url.strip().lower().startswith(("http://", "https://")):
+        raise FetchError(f"unsupported URL scheme for PDF download: {url}")
+    # A browser-ish UA avoids the cheapest 403s; the content store needs the
+    # bytes, not a render, so plain HTTP is fine and free.
+    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
+    try:
+        with urllib.request.urlopen(req, timeout=max(timeout_ms / 1000, 1)) as resp:
+            return resp.read(), resp.geturl(), getattr(resp, "status", 200)
+    except Exception as e:
+        raise FetchError(f"could not download PDF for {url}: {e}") from e
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py
index ee9900b7..efba5575 100644
--- a/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py
@@ -27,6 +27,13 @@
logger = logging.getLogger(__name__)
+# WebP's hard per-side pixel limit; taller captures must be cropped before encode.
+_WEBP_MAX_DIM = 16383
+# Above this total pixel count, skip the screenshot rather than decode it: a
+# pathological full-page render (very tall × wide) costs minutes of CPU in Pillow
+# for a screenshot that's nice-to-have, not essential.
+_MAX_SCREENSHOT_PIXELS = 200_000_000
+
def _to_markdown(html: str, url: str) -> str | None:
try:
@@ -39,21 +46,57 @@ def _to_markdown(html: str, url: str) -> str | None:
)
-def _encode_screenshot(png_bytes: bytes, fmt: str) -> tuple[bytes, str]:
- """Re-encode a PNG screenshot to the requested format using Pillow.
+# Scroll the document top-to-bottom (triggering lazy-loaded content) then back
+# up, so a subsequent full-page screenshot captures the fully-rendered page.
+# The walk is capped at MAX_STEPS (250 x 1000px, about 10s at 40ms per step):
+# on an infinite-scroll feed scrollHeight keeps growing as we scroll, and
+# page.evaluate() has no timeout of its own, so an uncapped loop might never
+# resolve. documentElement stands in when the document has no body.
+_AUTOSCROLL_JS = """
+async () => {
+  const MAX_STEPS = 250;
+  const root = document.body || document.documentElement;
+  await new Promise((resolve) => {
+    let y = 0;
+    let steps = 0;
+    const step = () => {
+      window.scrollTo(0, y);
+      y += 1000;
+      steps += 1;
+      if (steps < MAX_STEPS && y < root.scrollHeight) setTimeout(step, 40);
+      else resolve();
+    };
+    step();
+  });
+  window.scrollTo(0, 0);
+}
+"""
+
+
+def _encode_screenshot(
+ png_bytes: bytes, fmt: str, max_height: int = 0
+) -> tuple[bytes, str]:
+ """Crop (to ``max_height``) and re-encode a PNG screenshot using Pillow.
Pillow is already a forecasting-tools dependency, so true WebP is available
- here (Playwright itself only emits PNG/JPEG).
+ here (Playwright itself only emits PNG/JPEG). The height cap is enforced by
+ cropping the *full-page* render to its top ``max_height`` pixels — never via
+ Playwright's ``clip`` (which, without ``full_page``, is bounded by the
+ viewport and silently truncates tall pages to a single screen).
"""
fmt = fmt.lower()
- if fmt == "png":
- return png_bytes, "image/png"
try:
from PIL import Image
except ImportError:
+ # No Pillow: can't crop or transcode; hand back the raw full-page PNG.
return png_bytes, "image/png"
- image = Image.open(io.BytesIO(png_bytes))
+ image = Image.open(io.BytesIO(png_bytes)) # lazy: reads size, doesn't decode
+ if image.width * image.height > _MAX_SCREENSHOT_PIXELS:
+ raise ValueError(
+ f"screenshot too large to encode ({image.width}x{image.height}px)"
+ )
+ # WebP cannot encode beyond 16383px on a side. Clamp the effective cap for
+ # webp so an over-tall page degrades to a top-crop instead of crashing the
+ # encoder mid-run (which would propagate out of fetch() and abort the URL).
+ limit = max_height or 0
+ if fmt == "webp":
+ limit = min(limit or _WEBP_MAX_DIM, _WEBP_MAX_DIM)
+ if limit and image.height > limit:
+ image = image.crop((0, 0, image.width, limit))
+
out = io.BytesIO()
if fmt == "webp":
image.save(out, format="WEBP", quality=80, method=6)
@@ -61,7 +104,8 @@ def _encode_screenshot(png_bytes: bytes, fmt: str) -> tuple[bytes, str]:
if fmt in ("jpeg", "jpg"):
image.convert("RGB").save(out, format="JPEG", quality=80, optimize=True)
return out.getvalue(), "image/jpeg"
- return png_bytes, "image/png"
+ image.save(out, format="PNG", optimize=True)
+ return out.getvalue(), "image/png"
class PlaywrightFetcher:
@@ -82,7 +126,12 @@ def __init__(self, config: ArchiveConfig | None = None):
self._playwright = None
self._browser = None
- def __enter__(self) -> "PlaywrightFetcher":
+ def _launch_browser(self):
+ """Start the browser. Returns ``(playwright_or_none, browser)`` where
+ ``browser`` is a Playwright ``Browser``. Subclasses override this to swap
+ in a different stealth browser (see ``CloakBrowserFetcher``) while reusing
+ all of the capture logic. A backend that manages its own driver returns
+ ``None`` for the first element."""
try:
from playwright.sync_api import sync_playwright
except ImportError as e:
@@ -91,8 +140,12 @@ def __enter__(self) -> "PlaywrightFetcher":
"`pip install forecasting-tools[source-archive]` and then run "
"`playwright install chromium`."
) from e
- self._playwright = sync_playwright().start()
- self._browser = self._playwright.chromium.launch(headless=True)
+ playwright = sync_playwright().start()
+ browser = playwright.chromium.launch(headless=True)
+ return playwright, browser
+
+ def __enter__(self) -> "PlaywrightFetcher":
+ self._playwright, self._browser = self._launch_browser()
return self
def __exit__(self, *exc) -> None:
@@ -103,6 +156,32 @@ def __exit__(self, *exc) -> None:
self._playwright.stop()
self._playwright = None
+    def _settle(self, page) -> None:
+        """Best-effort: give the page a chance to finish rendering.
+
+        Three steps run in order: wait for the ``load`` state (up to the
+        navigation timeout), wait for ``networkidle`` (up to the smaller of the
+        navigation timeout and 10 seconds), then run the auto-scroll script and
+        pause 500 ms. A failure or timeout in one step is swallowed and the next
+        step still runs, because these are rendering aids that must never fail
+        the capture.
+        """
+
+        def wait_for_load() -> None:
+            page.wait_for_load_state("load", timeout=self.config.nav_timeout_ms)
+
+        def wait_for_idle() -> None:
+            page.wait_for_load_state(
+                "networkidle", timeout=min(self.config.nav_timeout_ms, 10_000)
+            )
+
+        def scroll_through() -> None:
+            page.evaluate(_AUTOSCROLL_JS)
+            page.wait_for_timeout(500)
+
+        for step in (wait_for_load, wait_for_idle, scroll_through):
+            try:
+                step()
+            except Exception:
+                pass
+
def fetch(self, url: str) -> CaptureResult:
if self._browser is None:
raise FetchError("PlaywrightFetcher must be used as a context manager")
@@ -119,26 +198,33 @@ def fetch(self, url: str) -> CaptureResult:
except Exception as e:
raise FetchError(f"navigation failed for {url}: {e}") from e
+ self._settle(page)
+
status = response.status if response is not None else None
html = page.content()
- shot_kwargs: dict = {"type": "png"}
- cap = self.config.screenshot_max_height
- dims = page.evaluate(
- "() => ({w: document.documentElement.scrollWidth,"
- " h: document.documentElement.scrollHeight})"
- )
- width = max(int(dims.get("w") or 0), 1)
- height = int(dims.get("h") or 0)
- if cap and height > cap:
- shot_kwargs["clip"] = {"x": 0, "y": 0, "width": width, "height": cap}
- else:
- shot_kwargs["full_page"] = True
-
- png = page.screenshot(**shot_kwargs)
- screenshot, content_type = _encode_screenshot(
- png, self.config.screenshot_format
- )
+ # Always capture the entire scrollable page in one shot — Playwright
+ # stitches it internally. The height cap is applied afterward by
+ # cropping in Pillow (see ``_encode_screenshot``). Fall back to a
+ # viewport capture only if a full-page shot fails (e.g. a page taller
+ # than Chromium's screenshot limit).
+ try:
+ png = page.screenshot(full_page=True)
+ except Exception as e:
+ logger.info("full-page screenshot failed for %s: %s", url, e)
+ png = page.screenshot()
+ # Encoding can fail on pathological pages (e.g. a 400M-pixel full-page
+ # render trips Pillow's decompression-bomb guard). A screenshot is
+ # nice-to-have — never lose the whole capture over it.
+ try:
+ screenshot, content_type = _encode_screenshot(
+ png,
+ self.config.screenshot_format,
+ self.config.screenshot_max_height,
+ )
+ except Exception as e:
+ logger.info("screenshot encode failed for %s: %s", url, e)
+ screenshot, content_type = None, None
return CaptureResult(
url=url,
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py b/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py
index 26b54831..8b689781 100644
--- a/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py
+++ b/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py
@@ -4,11 +4,19 @@
from a bot's published reasoning:
- :mod:`url_extraction` — pull URLs out of free text / markdown.
- - :mod:`metaculus_comments` — harvest bot comments via the public Metaculus API.
+ - :mod:`metaculus_db` — read a bot's cited URLs from the platform database.
+ - :mod:`trace_extraction` — build a manifest from a traced bot run (fullest path).
"""
-from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_comments import (
- MetaculusCommentHarvester,
+from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_db import (
+ MetaculusDbHarvester,
+ resolve_dsn,
+)
+from forecasting_tools.agents_and_tools.source_archive.ingest.trace_extraction import (
+ extract_records_from_events,
+ extract_records_from_question_dir,
+ extract_records_from_trace_file,
+ harvest_run,
)
from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import (
dedupe_records,
@@ -17,8 +25,13 @@
)
__all__ = [
- "MetaculusCommentHarvester",
+ "MetaculusDbHarvester",
"dedupe_records",
"extract_citation_records",
+ "extract_records_from_events",
+ "extract_records_from_question_dir",
+ "extract_records_from_trace_file",
"extract_urls",
+ "harvest_run",
+ "resolve_dsn",
]
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py
deleted file mode 100644
index 0aff84a9..00000000
--- a/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py
+++ /dev/null
@@ -1,180 +0,0 @@
-"""Harvest the URLs bots cite, from their public Metaculus comments.
-
-Both first-party and third-party bots publish their reasoning — with the source
-links they used — as comments on the questions they forecast. The public,
-no-auth Metaculus API is therefore the one mechanism that works across *every*
-bot on the platform, which is why this is the general ingestion path.
-
-Flow:
-
- 1. Enumerate the bots participating in a project (tournament) leaderboard.
- 2. Page through each bot's comments.
- 3. Extract the URLs from each comment and emit CitationRecords.
-
-The result is a citation manifest you can feed straight to the capture pipeline.
-
-Caveat: comments are length-truncated when posted, so a comment-harvested URL
-list can be incomplete versus the bot's full research. For bots you control, an
-instrumented trace gives a fuller list; this path is the universal baseline.
-"""
-
-from __future__ import annotations
-
-import logging
-import os
-from collections.abc import Iterator
-from typing import Any, Callable
-
-from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import (
- extract_citation_records,
-)
-from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
-
-logger = logging.getLogger(__name__)
-
-DEFAULT_BASE_URL = "https://www.metaculus.com/api"
-PAGE_LIMIT = 100
-
-
-def _first(d: dict, *keys, default=None):
- for k in keys:
- if k in d and d[k] is not None:
- return d[k]
- return default
-
-
-class MetaculusCommentHarvester:
- """Reads bot comments via the public Metaculus API.
-
- HTTP is injectable for testing: pass ``fetch_json=callable(path, params) ->
- dict`` to avoid real network calls.
- """
-
- def __init__(
- self,
- base_url: str | None = None,
- *,
- session: Any = None,
- timeout: int = 30,
- fetch_json: Callable[[str, dict], dict] | None = None,
- ):
- self.base_url = (
- base_url or os.environ.get("METACULUS_API_BASE_URL") or DEFAULT_BASE_URL
- ).rstrip("/")
- self.web_base = (
- self.base_url[:-4] if self.base_url.endswith("/api") else self.base_url
- )
- self.timeout = timeout
- self._session = session
- self._fetch_json = fetch_json
-
- # --- http --------------------------------------------------------------
- def _get(self, path: str, params: dict) -> dict:
- if self._fetch_json is not None:
- return self._fetch_json(path, params)
- try:
- import requests
- except ImportError as e: # pragma: no cover - requests is a core dep
- raise ImportError("requests is required for comment harvesting") from e
- if self._session is None:
- self._session = requests.Session()
- resp = self._session.get(
- f"{self.base_url}{path}", params=params, timeout=self.timeout
- )
- resp.raise_for_status()
- return resp.json()
-
- # --- bots --------------------------------------------------------------
- def enumerate_bots(self, project_id: int | str) -> list[dict]:
- """Return the bot ``user`` records on a project's leaderboard."""
- data = self._get(
- f"/leaderboards/project/{project_id}/", {"with_entries": "true"}
- )
- entries = _first(data, "leaderboard_entries", "entries", "results", default=[])
- bots: list[dict] = []
- seen: set[Any] = set()
- for entry in entries:
- user = entry.get("user") if isinstance(entry, dict) else None
- if not user or not user.get("is_bot"):
- continue
- uid = user.get("id")
- if uid in seen:
- continue
- seen.add(uid)
- bots.append(user)
- return bots
-
- # --- comments ----------------------------------------------------------
- def iter_comments(
- self, author_id: int | str, post_id: int | str | None = None
- ) -> Iterator[dict]:
- """Yield every comment authored by ``author_id`` (optionally on one post)."""
- offset = 0
- while True:
- params = {"author": author_id, "limit": PAGE_LIMIT, "offset": offset}
- if post_id is not None:
- params["post"] = post_id
- data = self._get("/comments/", params)
- results = (
- _first(data, "results", default=[]) if isinstance(data, dict) else data
- )
- if not results:
- break
- yield from results
- if len(results) < PAGE_LIMIT:
- break
- offset += PAGE_LIMIT
-
- # --- harvesting --------------------------------------------------------
- def _records_from_comment(
- self, comment: dict, *, run_id: str | None, bot: str | None
- ) -> list[CitationRecord]:
- post_id = _first(comment, "on_post", "post", "post_id")
- post_id_str = str(post_id) if post_id is not None else None
- question_url = (
- f"{self.web_base}/questions/{post_id}/" if post_id is not None else None
- )
- comment_id = comment.get("id")
- return extract_citation_records(
- comment.get("text"),
- run_id=run_id,
- bot=bot,
- question_id=post_id_str,
- metaculus_id=post_id_str,
- question_url=question_url,
- trace=f"comment:{comment_id}" if comment_id is not None else None,
- origin="metaculus_comment",
- )
-
- def harvest_author(
- self,
- author_id: int | str,
- *,
- run_id: str | None = None,
- bot: str | None = None,
- post_id: int | str | None = None,
- ) -> list[CitationRecord]:
- """All citation records from one bot's comments."""
- records: list[CitationRecord] = []
- for comment in self.iter_comments(author_id, post_id=post_id):
- records.extend(self._records_from_comment(comment, run_id=run_id, bot=bot))
- return records
-
- def harvest_project(
- self, project_id: int | str, *, run_id: str | None = None
- ) -> list[CitationRecord]:
- """All citation records from every bot on a project's leaderboard.
-
- Records are kept per-citation (duplicates across bots are preserved as
- distinct provenance); the capture pipeline dedupes URLs before fetching.
- """
- run_id = run_id or f"metaculus-comments-{project_id}"
- records: list[CitationRecord] = []
- bots = self.enumerate_bots(project_id)
- logger.info("project %s: %d bot(s) on leaderboard", project_id, len(bots))
- for user in bots:
- bot_name = user.get("username") or str(user.get("id"))
- bot_records = self.harvest_author(user["id"], run_id=run_id, bot=bot_name)
- logger.info(" bot %s: %d cited URL(s)", bot_name, len(bot_records))
- records.extend(bot_records)
- return records
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_db.py b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_db.py
new file mode 100644
index 00000000..c0221bbf
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_db.py
@@ -0,0 +1,215 @@
+"""Read a bot's cited URLs from the platform Postgres database (operator tooling).
+
+For operators with database access, this reads the URLs a forecasting bot cited
+straight from Postgres (``comments_comment`` joined to ``users_user.is_bot``) and
+emits the same :class:`CitationRecord`s as every other ingestion path, so the
+catalog / coverage / capture stages downstream are unchanged. By default it reads
+all of a bot's comments (``include_private=True``); pass ``include_private=False``
+for the public ones only. Only ``u.is_bot`` accounts are ever read.
+
+The DB call is **injected** (``query``) so the core is driver-agnostic and unit
+testable; :meth:`from_dsn` wires a psycopg2 connection for real use (a libpq DSN
+or a ``postgresql://…`` URL — e.g. a Neon connection string). Reads only; no
+secrets are stored — the DSN comes from the caller / environment.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Callable, Mapping, Sequence
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import (
+ extract_citation_records,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
+
+QueryFn = Callable[[str, Sequence[Any]], list[dict]]
+
+# Keychain service name the DSN is stored under (see resolve_dsn / README).
+KEYCHAIN_SERVICE = "metaculus-db-dsn"
+LOCAL_DEFAULT_DSN = "dbname=metaculus"
+
+_WEB = "https://www.metaculus.com"
+
+# The windowed/post-scoped comment set is computed in a MATERIALIZED CTE so
+# Postgres evaluates it FIRST, then joins users_user by primary key. Without the
+# CTE the planner's stale stats misjudge the date window at ~300k rows (it is
+# really ~2k/day) and pick a join order that times out on the remote pooler.
+_OUTER = (
+ "select r.id as comment_id, r.on_post_id, r.text, "
+ "u.username, r.author_id "
+ "from recent r join users_user u on u.id = r.author_id where u.is_bot"
+)
+
+
+def _recent_cte(scope: str, include_private: bool) -> str:
+    """Build the MATERIALIZED ``recent`` CTE prefix (with a trailing space).
+
+    The CTE selects comments that are not soft-deleted, contain the substring
+    ``http``, and satisfy ``scope``, which is the caller's row-narrowing
+    predicate (a post id or a created_at window). ``not c.is_private`` is added
+    only when ``include_private`` is False.
+
+    The ``strpos`` test is a cheap pre-filter rather than a URL match: a regex
+    `~` scan times out on the pooler, and ``like`` would need ``%%`` escaping
+    under psycopg2. Over-matching just costs a few empty rows, since the real
+    URL parsing happens in extract_citation_records.
+    """
+    privacy = () if include_private else ("not c.is_private",)
+    predicates = (
+        "not c.is_soft_deleted",
+        "strpos(c.text, 'http') > 0",
+        scope,
+        *privacy,
+    )
+    condition = " and ".join(predicates)
+    columns = "c.id, c.on_post_id, c.text, c.author_id, c.created_at"
+    return (
+        f"with recent as materialized (select {columns} "
+        f"from comments_comment c where {condition}) "
+    )
+
+
+def _dsn_from_keychain(service: str = KEYCHAIN_SERVICE) -> str | None:
+    """Look the DSN up in the macOS login Keychain; ``None`` if unavailable.
+
+    Shells out to ``security find-generic-password -s SERVICE -w`` so the
+    credential stays in the Keychain and out of ``.env``, shell rc files and
+    shell history. If the item's ACL is set to confirm on access, the call
+    raises a GUI prompt that a human can approve and an automated agent cannot.
+
+    ``None`` is returned when the ``security`` binary is not on PATH (not
+    macOS), the command cannot be run or times out (30 s), it exits non-zero
+    (item absent or access denied), or it prints nothing. Callers then fall
+    through to their next source.
+    """
+    import shutil
+    import subprocess
+
+    if not shutil.which("security"):  # not macOS
+        return None
+
+    command = ["security", "find-generic-password", "-s", service, "-w"]
+    try:
+        completed = subprocess.run(
+            command, capture_output=True, text=True, timeout=30
+        )
+    except (OSError, subprocess.SubprocessError):
+        return None
+
+    secret = completed.stdout.strip() if completed.returncode == 0 else ""
+    return secret or None
+
+
+def resolve_dsn(
+    explicit: str | None = None,
+    *,
+    env: Mapping[str, str] | None = None,
+    keychain_reader: Callable[[], str | None] | None = None,
+) -> str:
+    """Pick the DB DSN from the first source that provides one.
+
+    Sources, in priority order:
+      1. ``explicit`` (e.g. a ``--dsn`` flag; convenient, but it lands in shell
+         history, so prefer the Keychain for the real secret),
+      2. ``$METACULUS_DB_DSN``,
+      3. the macOS Keychain item ``metaculus-db-dsn``,
+      4. the local default ``dbname=metaculus``.
+
+    Later sources are not consulted once an earlier one yields a value, so the
+    Keychain is only read when the first two are empty. ``env`` and
+    ``keychain_reader`` exist so tests can inject both; nothing is written to
+    disk.
+    """
+    if explicit:
+        return explicit
+
+    environ = os.environ if env is None else env
+    read_keychain = keychain_reader or _dsn_from_keychain
+    # Each source is a zero-argument callable so it is only evaluated when
+    # every source before it came up empty.
+    sources = (lambda: environ.get("METACULUS_DB_DSN"), read_keychain)
+    for source in sources:
+        found = source()
+        if found:
+            return found
+    return LOCAL_DEFAULT_DSN
+
+
+class MetaculusDbHarvester:
+    """Reads bot comments from Postgres. ``query(sql, params) -> list[dict]``.
+
+    The query callable is injected, so this class itself never imports a
+    database driver; :meth:`from_dsn` supplies a psycopg2-backed one. Each row
+    is expected to carry the columns selected by the module's outer query
+    (``comment_id``, ``on_post_id``, ``text``, ``username``, ``author_id``).
+    """
+
+    def __init__(self, query: QueryFn):
+        """Store the ``query(sql, params)`` callable used for every read."""
+        self._query = query
+
+    @classmethod
+    def from_dsn(cls, dsn: str = LOCAL_DEFAULT_DSN) -> "MetaculusDbHarvester":
+        """Build a harvester backed by a psycopg2 connection to ``dsn``.
+
+        Raises:
+            ImportError: if psycopg2 is not installed.
+        """
+        try:
+            import psycopg2
+            import psycopg2.extras
+        except ImportError as e:  # pragma: no cover - optional operator dep
+            raise ImportError(
+                "psycopg2 is required for DB harvesting "
+                "(`pip install psycopg2-binary`)."
+            ) from e
+        conn = psycopg2.connect(dsn)
+        # This harvester only reads. Without autocommit psycopg2 opens a
+        # transaction on the first execute and keeps it open until commit or
+        # rollback, which this code never issues: the connection would sit
+        # idle-in-transaction, and one failed statement would make every later
+        # query fail until a rollback.
+        conn.autocommit = True
+
+        def query(sql: str, params: Sequence[Any]) -> list[dict]:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(sql, params)
+                return [dict(r) for r in cur.fetchall()]
+
+        return cls(query)
+
+    def _records(self, rows: list[dict], run_id: str | None) -> list[CitationRecord]:
+        """Turn comment rows into citation records, one batch per row.
+
+        The post id doubles as both ``question_id`` and ``metaculus_id``; the
+        bot label is the username, or the stringified author id when the
+        username is empty.
+        """
+        out: list[CitationRecord] = []
+        for r in rows:
+            post_id = r.get("on_post_id")
+            pid = str(post_id) if post_id is not None else None
+            comment_id = r.get("comment_id")
+            out.extend(
+                extract_citation_records(
+                    r.get("text"),
+                    run_id=run_id,
+                    bot=r.get("username") or str(r.get("author_id")),
+                    question_id=pid,
+                    metaculus_id=pid,
+                    question_url=(
+                        f"{_WEB}/questions/{post_id}/" if post_id is not None else None
+                    ),
+                    comment_id=str(comment_id) if comment_id is not None else None,
+                    origin="metaculus_comment",
+                )
+            )
+        return out
+
+    def harvest_post(
+        self,
+        post_id: int | str,
+        *,
+        run_id: str | None = None,
+        include_private: bool = True,
+    ) -> list[CitationRecord]:
+        """Every bot-cited URL in the comments on one post.
+
+        Rows are read oldest first. ``run_id`` defaults to
+        ``metaculus-db-post-POST_ID``; private comments are included unless
+        ``include_private`` is False.
+        """
+        run_id = run_id or f"metaculus-db-post-{post_id}"
+        sql = (
+            _recent_cte("c.on_post_id = %s", include_private)
+            + _OUTER
+            + " order by r.created_at"
+        )
+        return self._records(self._query(sql, (post_id,)), run_id)
+
+    def harvest_recent(
+        self,
+        *,
+        days: int = 1,
+        limit: int | None = None,
+        run_id: str | None = None,
+        include_private: bool = True,
+    ) -> list[CitationRecord]:
+        """Bot-cited URLs from the most recent ``days`` of comments.
+
+        "Recent" is measured against ``max(created_at)`` in the table, not wall
+        clock, so a replica that lags real time by a day still returns its latest
+        day with ``days=1``. ``limit`` caps the row count; ``None`` (the default)
+        is uncapped, which is what a daily sweep wants. A ``limit`` of 0 is also
+        treated as uncapped. Rows are read newest first.
+        """
+        run_id = run_id or f"metaculus-db-recent-{days}d"
+        scope = (
+            "c.created_at >= "
+            "(select max(created_at) from comments_comment) - (%s * interval '1 day')"
+        )
+        sql = (
+            _recent_cte(scope, include_private) + _OUTER + " order by r.created_at desc"
+        )
+        params: list[Any] = [days]
+        if limit:
+            # The cap is passed as a bound parameter, never formatted into SQL.
+            sql += " limit %s"
+            params.append(limit)
+        return self._records(self._query(sql, tuple(params)), run_id)
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/trace_extraction.py b/forecasting_tools/agents_and_tools/source_archive/ingest/trace_extraction.py
new file mode 100644
index 00000000..c330eccb
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/ingest/trace_extraction.py
@@ -0,0 +1,380 @@
+"""Build a citation manifest from a bot's run traces.
+
+When the template bot is run with tracing enabled it writes one JSONL trace per
+forecast attempt, recording the agent loop step by step. Those traces are the
+*fullest* record of what the bot actually looked at — richer than the reasoning
+comment it posts, which is length-truncated (see :mod:`metaculus_db` for the
+shallower comment path).
+
+This module walks those traces and pulls out every external URL the bot touched,
+turning each into a :class:`CitationRecord` with provenance (which trace, which
+tool, the search query that surfaced it). That manifest is the input to the
+capture pipeline, exactly like the comment-harvested one.
+
+Trace layout
+------------
+A traced run is a directory tree::
+
+    <run_dir>/
+      bot_<name>/
+        q_<id>/
+          question.json
+          traces_forecast_1_attempt_1.jsonl
+          traces_summarize.jsonl
+          ...
+
+Each ``traces_*.jsonl`` file is a stream of newline-delimited event objects. The
+events that can carry external links are:
+
+- ``tool_call`` — the arguments the bot passed to a tool (e.g. a search query,
+ or a ``url`` handed to a page fetcher). Carries ``name`` and ``call_id``.
+- ``tool_result`` — what the tool returned. Search tools inline their citations
+ here as ``[n](url)`` or as a list of result URLs. Carries ``call_id`` so the
+ result can be attributed back to the originating ``tool_call``.
+- ``initial_prompt`` — the first prompt of a trace. Only scanned for the
+ ``summarize`` trace: the template bot runs research *outside* the agent loop
+ and pastes the research blob verbatim into the summarizer's first prompt, so
+ that is the one place those URLs are recoverable. Other traces' initial
+ prompts just echo the question text (background, resolution criteria), whose
+ URLs aren't research, so they're skipped.
+
+Search provenance (``query`` / ``tool_args``) only exists in these instrumented
+traces — it is populated here from each ``tool_call`` and carried onto the URLs
+that the matching ``tool_result`` returned.
+"""
+
+from __future__ import annotations
+
+import glob
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import (
+ extract_urls,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
+
+# Template for a question's public URL; ``{}`` is filled with the Metaculus id
+# when a question dir supplies one but no explicit question URL was given.
+METACULUS_QUESTION_URL_FMT = "https://www.metaculus.com/questions/{}/"
+
+# Event type -> the field on that event that carries the URL-bearing payload.
+# Event types that are not listed here are never scanned for URLs.
+_SCANNABLE_FIELDS: dict[str, str] = {
+    "tool_call": "args",
+    "tool_result": "content",
+    "initial_prompt": "prompt",
+}
+# The trace whose initial prompt holds pasted-in research (see module docstring).
+_SUMMARIZE_TRACE_LABEL = "summarize"
+# Keys a tool's input commonly uses for the search string, best-effort. They are
+# tried in this order and the first one holding a usable value wins.
+_QUERY_KEYS = ("query", "q", "search_query", "search", "queries", "question")
+
+
+def _urls_in(value: Any) -> list[str]:
+    """Collect the URLs reachable anywhere inside ``value``, in traversal order.
+
+    ``value`` may be a plain string, a dict (keys *and* values are visited), a
+    list / tuple / set / frozenset, or any nesting of those; ``None`` yields
+    nothing and any other object is scanned through its ``str()`` form. Every
+    string reached is handed to the shared :func:`extract_urls`, so
+    markdown-link and trailing-punctuation handling match the comment path.
+    Results from separate strings are simply concatenated.
+    """
+
+    def strings(node: Any):
+        # Depth-first walk yielding each string in the order it is reached.
+        if node is None:
+            return
+        if isinstance(node, str):
+            yield node
+        elif isinstance(node, dict):
+            for key, val in node.items():
+                yield from strings(key)
+                yield from strings(val)
+        elif isinstance(node, (list, tuple, set, frozenset)):
+            for item in node:
+                yield from strings(item)
+        else:
+            yield from strings(str(node))
+
+    found: list[str] = []
+    for text in strings(value):
+        found.extend(extract_urls(text))
+    return found
+
+
+def _query_from_args(args: Any) -> str | None:
+    """Return the search string found in a tool's arguments, else ``None``.
+
+    Only dict arguments are inspected. The keys in ``_QUERY_KEYS`` are tried in
+    order: a non-blank string is returned unchanged, and a non-empty list or
+    tuple is returned as its truthy items joined by single spaces (when that
+    join is non-blank).
+    """
+    if not isinstance(args, dict):
+        return None
+    for candidate in (args.get(key) for key in _QUERY_KEYS):
+        if isinstance(candidate, str):
+            if candidate.strip():
+                return candidate
+        elif isinstance(candidate, (list, tuple)) and candidate:
+            text = " ".join(str(part) for part in candidate if part)
+            if text.strip():
+                return text
+    return None
+
+
+def trace_label(trace_path: str) -> str:
+    """Derive a trace's label from its file name.
+
+    The directory part is dropped, then a leading ``traces_`` and a trailing
+    ``.jsonl`` are removed when present, e.g.
+    ``traces_forecast_1_attempt_1.jsonl`` -> ``forecast_1_attempt_1``.
+    """
+    prefix, suffix = "traces_", ".jsonl"
+    label = os.path.basename(trace_path)
+    start = len(prefix) if label.startswith(prefix) else 0
+    label = label[start:]
+    if label.endswith(suffix):
+        label = label[: len(label) - len(suffix)]
+    return label
+
+
+def extract_records_from_events(
+    events: Any,
+    *,
+    trace: str | None = None,
+    include_initial_prompt: bool = False,
+    run_id: str | None = None,
+    bot: str | None = None,
+    question_id: str | None = None,
+    metaculus_id: str | None = None,
+    question_url: str | None = None,
+) -> list[CitationRecord]:
+    """Turn one trace's event stream into CitationRecords.
+
+    ``events`` is any iterable of event dicts (already parsed from JSONL). The
+    given provenance is stamped onto every record; per-event provenance
+    (``trace``, ``tool_name``, ``origin``, ``query``, ``tool_args``,
+    ``first_seen``) is filled in here.
+
+    Set ``include_initial_prompt`` to scan ``initial_prompt`` events — callers
+    should only do this for the ``summarize`` trace (see module docstring).
+
+    Malformed events are skipped rather than raised on: anything that is not a
+    dict, and any dict whose ``type`` is not a string. One record is emitted
+    per URL found in an event, in event order.
+    """
+    records: list[CitationRecord] = []
+    # Attribute tool_result events (which only carry call_id) back to the
+    # originating tool_call's name and arguments.
+    tool_name_by_call_id: dict[str, str] = {}
+    tool_args_by_call_id: dict[str, Any] = {}
+
+    for event in events:
+        if not isinstance(event, dict):
+            continue
+        event_type = event.get("type")
+        # A non-string type can never name a scannable event, and an unhashable
+        # one (a list or dict from malformed JSON) would make the dict lookup
+        # below raise TypeError and abort the whole harvest.
+        if not isinstance(event_type, str):
+            continue
+
+        if event_type == "tool_call":
+            call_id = str(event.get("call_id") or "").strip()
+            name = event.get("name") or ""
+            if call_id:
+                if name:
+                    tool_name_by_call_id[call_id] = name
+                if "args" in event:
+                    tool_args_by_call_id[call_id] = event.get("args")
+
+        field = _SCANNABLE_FIELDS.get(event_type)
+        if field is None:
+            continue
+        if event_type == "initial_prompt" and not include_initial_prompt:
+            continue
+
+        urls = _urls_in(event.get(field))
+        if not urls:
+            continue
+
+        if event_type == "tool_call":
+            tool_name = event.get("name") or ""
+            origin = "tool_call"
+            own_args = event.get("args")
+            tool_args = own_args if isinstance(own_args, dict) else None
+        elif event_type == "tool_result":
+            call_id = str(event.get("call_id") or "").strip()
+            tool_name = tool_name_by_call_id.get(call_id, "")
+            origin = "tool_result"
+            originating_args = tool_args_by_call_id.get(call_id)
+            tool_args = originating_args if isinstance(originating_args, dict) else None
+        else:  # initial_prompt
+            tool_name = ""
+            origin = event_type
+            tool_args = None
+
+        query = _query_from_args(tool_args)
+        timestamp = event.get("timestamp")
+        for url in urls:
+            record = CitationRecord(
+                url=url,
+                run_id=run_id,
+                bot=bot,
+                question_id=question_id,
+                metaculus_id=metaculus_id,
+                question_url=question_url,
+                trace=trace,
+                tool_name=tool_name,
+                origin=origin,
+                query=query,
+                tool_args=tool_args,
+            )
+            # Keep the model's default first_seen unless the event carries a
+            # timestamp of its own.
+            if timestamp:
+                record.first_seen = str(timestamp)
+            records.append(record)
+
+    return records
+
+
+def _read_jsonl(path: str) -> list[dict]:
+    """Read a JSONL file, skipping blank or unparsable lines.
+
+    Returns the parsed value of every line that is valid JSON, in file order.
+    Errors opening or decoding the file propagate to the caller.
+    """
+    events: list[dict] = []
+    # Iterate the file instead of using ``str.splitlines()``: splitlines also
+    # breaks on U+2028 / U+2029 / U+0085, which may appear unescaped inside a
+    # JSON string, so a valid event line would be cut in two and both halves
+    # dropped as unparsable. File iteration only splits on real newlines.
+    with open(path, encoding="utf-8") as handle:
+        for raw_line in handle:
+            line = raw_line.strip()
+            if not line:
+                continue
+            try:
+                events.append(json.loads(line))
+            except json.JSONDecodeError:
+                continue
+    return events
+
+
+def extract_records_from_trace_file(
+    trace_path: str,
+    *,
+    run_id: str | None = None,
+    bot: str | None = None,
+    question_id: str | None = None,
+    metaculus_id: str | None = None,
+    question_url: str | None = None,
+) -> list[CitationRecord]:
+    """Build CitationRecords from a single ``traces_*.jsonl`` file.
+
+    The trace label is derived from the file name, and ``initial_prompt``
+    events are scanned only when that label is the summarize trace. The
+    provenance keywords are forwarded unchanged onto every record.
+    """
+    label = trace_label(trace_path)
+    events = _read_jsonl(trace_path)
+    provenance = {
+        "run_id": run_id,
+        "bot": bot,
+        "question_id": question_id,
+        "metaculus_id": metaculus_id,
+        "question_url": question_url,
+    }
+    scan_prompt = label == _SUMMARIZE_TRACE_LABEL
+    return extract_records_from_events(
+        events, trace=label, include_initial_prompt=scan_prompt, **provenance
+    )
+
+
+def _read_question_metadata(question_dir: str) -> tuple[str | None, str | None]:
+    """Return ``(question_id, metaculus_id)`` from ``question.json`` in the dir.
+
+    Read as a plain dict with flexible keys so the ingest stays decoupled from
+    any particular question model. Missing/unparsable metadata is non-fatal —
+    records are still emitted, just with empty question provenance. That covers
+    an absent or unreadable file, bytes that are not valid UTF-8, invalid JSON,
+    and JSON whose top level is not an object: all return ``(None, None)``.
+
+    ``question_id`` is the first non-null of ``question_id`` / ``id`` /
+    ``post_id``; ``metaculus_id`` is the first non-null of ``metaculus_id`` /
+    ``post_id`` / ``id``. Values are returned as strings.
+    """
+    question_path = os.path.join(question_dir, "question.json")
+    try:
+        data = json.loads(Path(question_path).read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        # OSError: file missing or unreadable. ValueError is the common base of
+        # json.JSONDecodeError and UnicodeDecodeError (non-UTF-8 bytes).
+        return None, None
+    if not isinstance(data, dict):
+        return None, None
+
+    def _str_or_none(*keys: str) -> str | None:
+        for key in keys:
+            val = data.get(key)
+            if val is not None:
+                return str(val)
+        return None
+
+    question_id = _str_or_none("question_id", "id", "post_id")
+    metaculus_id = _str_or_none("metaculus_id", "post_id", "id")
+    return question_id, metaculus_id
+
+
+def extract_records_from_question_dir(
+    question_dir: str,
+    *,
+    run_id: str | None = None,
+    bot: str | None = None,
+    question_id: str | None = None,
+    metaculus_id: str | None = None,
+    question_url: str | None = None,
+) -> list[CitationRecord]:
+    """Collect the CitationRecords of every trace file in one question dir.
+
+    Question provenance defaults to what ``question.json`` in the dir holds; a
+    truthy ``question_id`` / ``metaculus_id`` argument takes precedence over
+    it. When no ``question_url`` is given and a Metaculus id is known, the URL
+    is built from ``METACULUS_QUESTION_URL_FMT``. Trace files
+    (``traces_*.jsonl``) are processed in sorted path order.
+    """
+    file_question_id, file_metaculus_id = _read_question_metadata(question_dir)
+    if not question_id:
+        question_id = file_question_id
+    if not metaculus_id:
+        metaculus_id = file_metaculus_id
+    if question_url is None and metaculus_id is not None:
+        question_url = METACULUS_QUESTION_URL_FMT.format(metaculus_id)
+
+    trace_paths = sorted(glob.glob(os.path.join(question_dir, "traces_*.jsonl")))
+    collected: list[CitationRecord] = []
+    for trace_path in trace_paths:
+        collected += extract_records_from_trace_file(
+            trace_path,
+            run_id=run_id,
+            bot=bot,
+            question_id=question_id,
+            metaculus_id=metaculus_id,
+            question_url=question_url,
+        )
+    return collected
+
+
+def _bot_name_from_dir(bot_dir: str) -> str:
+    """Return the bot name for a bot dir: ``.../bot_complex`` -> ``complex``.
+
+    A final path component without the ``bot_`` prefix is returned as-is.
+    """
+    base = os.path.basename(bot_dir)
+    if not base.startswith("bot_"):
+        return base
+    return base[len("bot_") :]
+
+
+def _question_dirs_flat(run_dir: str) -> list[str]:
+    """List the question dirs sitting directly under ``run_dir``.
+
+    Used for layouts without a ``bot_*`` level (e.g. a backfill of one bot's
+    runs laid out as ``<run_dir>/<question_dir>/traces_*.jsonl``). An immediate
+    subdirectory counts as a question dir only if it contains at least one
+    ``traces_*.jsonl`` file. Paths are returned in sorted order.
+    """
+    candidates = sorted(glob.glob(os.path.join(run_dir, "*")))
+    return [
+        path
+        for path in candidates
+        if os.path.isdir(path) and glob.glob(os.path.join(path, "traces_*.jsonl"))
+    ]
+
+
+def harvest_run(
+    run_dir: str, *, run_id: str | None = None, bot: str | None = None
+) -> list[CitationRecord]:
+    """Build a citation manifest from a whole traced run directory.
+
+    Primary layout is ``<run_dir>/bot_*/q_*/traces_*.jsonl``, deriving
+    ``run_id`` from the run dir's name and ``bot`` from each ``bot_*`` subdir
+    (the ``bot`` argument is not used in this layout). If no ``bot_*``
+    subdirectories exist, falls back to a **flat layout** —
+    ``<run_dir>/<question_dir>/traces_*.jsonl`` — attributing every question to
+    a single bot (the ``bot`` argument, else the run dir's name). Question
+    provenance still comes from each dir's ``question.json``.
+
+    Returns the flat list of CitationRecords (one per URL occurrence); feed it
+    through :func:`url_extraction.dedupe_records` before capture for one row per
+    URL.
+    """
+    run_id = run_id or os.path.basename(os.path.normpath(run_dir))
+    records: list[CitationRecord] = []
+
+    # Only directories count as bot dirs: a stray file such as ``bot_notes.txt``
+    # must not switch a flat-layout run onto the bot layout, where it would
+    # contribute nothing and suppress the flat fallback.
+    bot_dirs = sorted(
+        path
+        for path in glob.glob(os.path.join(run_dir, "bot_*"))
+        if os.path.isdir(path)
+    )
+    if bot_dirs:
+        for bot_dir in bot_dirs:
+            bot_name = _bot_name_from_dir(bot_dir)
+            for question_dir in sorted(glob.glob(os.path.join(bot_dir, "q_*"))):
+                records.extend(
+                    extract_records_from_question_dir(
+                        question_dir, run_id=run_id, bot=bot_name
+                    )
+                )
+        return records
+
+    # Flat fallback: no bot_* grouping. One bot, question dirs directly below.
+    bot_name = bot or os.path.basename(os.path.normpath(run_dir))
+    for question_dir in _question_dirs_flat(run_dir):
+        records.extend(
+            extract_records_from_question_dir(question_dir, run_id=run_id, bot=bot_name)
+        )
+    return records
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/url_extraction.py b/forecasting_tools/agents_and_tools/source_archive/ingest/url_extraction.py
index f97def1c..b8b06d3b 100644
--- a/forecasting_tools/agents_and_tools/source_archive/ingest/url_extraction.py
+++ b/forecasting_tools/agents_and_tools/source_archive/ingest/url_extraction.py
@@ -14,6 +14,9 @@
import re
from collections.abc import Iterable
+from forecasting_tools.agents_and_tools.source_archive.canonicalize import (
+ canonicalize_url,
+)
from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
# Markdown link target: [label](url) or [label](<url>), optionally with a title.
@@ -24,13 +27,30 @@
# unbalanced, so trailing prose parens drop but ``..._(disambiguation)`` survives.
_BARE = re.compile(r"(https?://[^\s<>\"'\]]+)", re.IGNORECASE)
-# Characters commonly stuck to the end of a URL in prose.
-_TRAILING = ".,;:!?'\""
+# Characters commonly stuck to the end of a URL in prose (incl. markdown-escape
+# residue: a trailing backslash or backtick).
+_TRAILING = ".,;:!?'\"\\`"
+
+
+def _cut_markdown_tail(url: str) -> str:
+    """Drop a markdown reference/link tail that got glued onto a URL.
+
+    The bare-URL scan can swallow text such as
+    ``…/story?id=123)[10](https://other…)``, where ``)[10](…`` is a markdown
+    reference and not part of the address. The URL is truncated at the earliest
+    ``)[`` or ``](`` found after its first character; without such a marker it
+    is returned unchanged.
+    """
+    positions = [pos for pos in (url.find(")["), url.find("](")) if pos > 0]
+    return url[: min(positions)] if positions else url
def _trim(url: str) -> str:
"""Strip trailing punctuation, and a closing bracket/paren only when it is
unbalanced (so Wikipedia-style ``..._(disambiguation)`` URLs survive)."""
+ url = _cut_markdown_tail(url)
while url:
last = url[-1]
if last in _TRAILING:
@@ -45,7 +65,12 @@ def _trim(url: str) -> str:
def extract_urls(text: str | None) -> list[str]:
- """Return the distinct http(s) URLs in ``text``, in first-seen order."""
+ """Return the distinct http(s) URLs in ``text``, in first-seen order.
+
+ Distinctness is by *canonical* URL (see :func:`canonicalize_url`), so
+ ``…/x`` and ``…/x?utm_source=…`` count once; the original first-seen string
+ is returned.
+ """
if not text:
return []
seen: set[str] = set()
@@ -53,8 +78,11 @@ def extract_urls(text: str | None) -> list[str]:
for pattern in (_MD_LINK, _AUTOLINK, _BARE):
for match in pattern.finditer(text):
url = _trim(match.group(1))
- if url and url not in seen:
- seen.add(url)
+ if not url:
+ continue
+ key = canonicalize_url(url)
+ if key not in seen:
+ seen.add(key)
ordered.append(url)
return ordered
@@ -67,6 +95,7 @@ def extract_citation_records(
question_id: str | None = None,
metaculus_id: str | None = None,
question_url: str | None = None,
+ comment_id: str | None = None,
trace: str | None = None,
tool_name: str | None = None,
origin: str | None = None,
@@ -81,6 +110,7 @@ def extract_citation_records(
question_id=question_id,
metaculus_id=metaculus_id,
question_url=question_url,
+ comment_id=comment_id,
trace=trace,
tool_name=tool_name,
origin=origin,
@@ -90,11 +120,14 @@ def extract_citation_records(
def dedupe_records(records: Iterable[CitationRecord]) -> list[CitationRecord]:
- """Keep the first record per URL, preserving order."""
+ """Keep the first record per *canonical* URL, preserving order."""
seen: set[str] = set()
out: list[CitationRecord] = []
for r in records:
- if r.url and r.url not in seen:
- seen.add(r.url)
+ if not r.url:
+ continue
+ key = canonicalize_url(r.url)
+ if key not in seen:
+ seen.add(key)
out.append(r)
return out
diff --git a/forecasting_tools/agents_and_tools/source_archive/manifest.py b/forecasting_tools/agents_and_tools/source_archive/manifest.py
index 609c74d7..880ab161 100644
--- a/forecasting_tools/agents_and_tools/source_archive/manifest.py
+++ b/forecasting_tools/agents_and_tools/source_archive/manifest.py
@@ -10,6 +10,9 @@
from collections.abc import Iterable, Iterator
from pathlib import Path
+from forecasting_tools.agents_and_tools.source_archive.canonicalize import (
+ canonicalize_url,
+)
from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
@@ -31,11 +34,19 @@ def loads(text: str) -> list[CitationRecord]:
def unique_urls(records: Iterable[CitationRecord]) -> Iterator[str]:
- """Yield each distinct URL once, preserving first-seen order."""
+ """Yield each distinct URL once, preserving first-seen order.
+
+ Distinctness is by *canonical* URL (see :func:`canonicalize_url`), so
+ near-duplicate links collapse to a single fetch; the original first-seen URL
+ string is what's yielded, for provenance.
+ """
seen: set[str] = set()
for r in records:
- if r.url and r.url not in seen:
- seen.add(r.url)
+ if not r.url:
+ continue
+ key = canonicalize_url(r.url)
+ if key not in seen:
+ seen.add(key)
yield r.url
diff --git a/forecasting_tools/agents_and_tools/source_archive/models.py b/forecasting_tools/agents_and_tools/source_archive/models.py
index 8caad9ac..08c63cd6 100644
--- a/forecasting_tools/agents_and_tools/source_archive/models.py
+++ b/forecasting_tools/agents_and_tools/source_archive/models.py
@@ -8,14 +8,24 @@
from pydantic import BaseModel, Field
+from forecasting_tools.agents_and_tools.source_archive.canonicalize import (
+ canonicalize_url,
+)
+
def utcnow_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def url_hash(url: str) -> str:
- """Stable key for a URL — groups every capture of that URL together."""
- return hashlib.sha256(url.encode("utf-8")).hexdigest()
+ """Stable key for a URL — groups every capture of that URL together.
+
+ The URL is canonicalized first (see :func:`canonicalize_url`) so trivially
+ different links — tracking params, a trailing slash, a ``#fragment``,
+ query-param order, host case — collapse onto one key instead of being
+ stored and counted as separate sources.
+ """
+ return hashlib.sha256(canonicalize_url(url).encode("utf-8")).hexdigest()
def content_hash(html: str | bytes) -> str:
@@ -56,6 +66,9 @@ class StoredCapture(BaseModel):
html_key: str | None = None
screenshot_key: str | None = None
markdown_key: str | None = None
+ # Set when this capture reuses another URL's blobs because the fetched
+ # content was byte-identical (cross-URL content dedup); holds that URL's hash.
+ content_alias_of: str | None = None
first_seen: str = Field(default_factory=utcnow_iso)
last_seen: str = Field(default_factory=utcnow_iso)
@@ -74,7 +87,11 @@ class CitationRecord(BaseModel):
question_id: str | None = None
metaculus_id: str | None = None
question_url: str | None = None
+ comment_id: str | None = None # Metaculus comment the URL was cited in
trace: str | None = None
tool_name: str | None = None
origin: str | None = None
+ # Search provenance (populated by instrumented trace ingest, not comments):
+ query: str | None = None # the search query the bot ran, if known
+ tool_args: dict[str, Any] | None = None # full tool input (query + filters…)
first_seen: str = Field(default_factory=utcnow_iso)
diff --git a/forecasting_tools/agents_and_tools/source_archive/pipeline.py b/forecasting_tools/agents_and_tools/source_archive/pipeline.py
index 1855f039..67f5817b 100644
--- a/forecasting_tools/agents_and_tools/source_archive/pipeline.py
+++ b/forecasting_tools/agents_and_tools/source_archive/pipeline.py
@@ -11,6 +11,7 @@
from __future__ import annotations
import logging
+import threading
from collections.abc import Iterable
from pydantic import BaseModel
@@ -71,6 +72,9 @@ def capture_url(self, url: str) -> CaptureOutcome:
except FetchError as e:
logger.info("fetch error for %s: %s", url, e)
return CaptureOutcome(url=url, status="error", reason=str(e))
+ except Exception as e: # never let one bad URL abort the whole run
+ logger.warning("unexpected error capturing %s: %s", url, e)
+ return CaptureOutcome(url=url, status="error", reason=f"unexpected: {e}")
# Gate here so any fetcher is covered; the tiered fetcher also gates
# internally to decide fallback, but this is the authoritative check.
@@ -92,3 +96,169 @@ def run(self, urls: Iterable[str]) -> PipelineSummary:
def run_manifest(self, records: Iterable[CitationRecord]) -> PipelineSummary:
return self.run(unique_urls(records))
+
+
+# Substrings of an error reason that point at the browser itself being gone
+# (crash, OOM, or the machine slept and severed the CDP pipe) rather than at a
+# problem with the URL. Without recovery every later URL in that worker's shard
+# would fail against the dead browser, so such an outcome triggers a browser
+# rebuild and one retry.
+_DEAD_BROWSER_MARKERS = (
+    "has been closed",
+    "Target page, context or browser",
+    "Browser.new_context",
+    "Connection closed",
+    "browser has been closed",
+)
+
+
+def _browser_died(reason: str | None) -> bool:
+    """Tell whether ``reason`` contains any dead-browser marker.
+
+    A missing or empty reason never counts as a dead browser.
+    """
+    if not reason:
+        return False
+    for marker in _DEAD_BROWSER_MARKERS:
+        if marker in reason:
+            return True
+    return False
+
+
+def _close_quietly(cm, timeout_s: float = 15.0) -> None:
+    """Exit a fetcher context manager without ever blocking on it.
+
+    A wedged browser's ``close()`` can itself hang, so ``cm.__exit__`` runs on
+    a daemon thread and this call waits at most ``timeout_s`` seconds for it.
+    Exceptions raised while closing are swallowed; a process left behind by a
+    close that never finishes is reaped at the end of the run.
+    """
+
+    def _exit_ignoring_errors() -> None:
+        try:
+            cm.__exit__(None, None, None)
+        except Exception:
+            pass
+
+    closer = threading.Thread(target=_exit_ignoring_errors, daemon=True)
+    closer.start()
+    closer.join(timeout_s)
+
+
+def _reap_browser_descendants() -> None:
+    """Best-effort kill of Chromium processes descending from this process.
+
+    Every descendant whose process name contains ``chrom`` (case-insensitive)
+    is killed. This serves two purposes: recovering a wedged worker (killing
+    its browser makes the blocked sync call error out) and sweeping leftovers
+    at the end of a run. It does nothing when ``psutil`` cannot be imported, so
+    psutil never becomes a hard dependency, and every failure is ignored.
+    """
+    try:
+        import os
+
+        import psutil
+    except Exception:
+        return
+
+    def _kill_if_chromium(proc) -> None:
+        try:
+            if "chrom" in (proc.name() or "").lower():
+                proc.kill()
+        except Exception:
+            pass
+
+    try:
+        descendants = psutil.Process(os.getpid()).children(recursive=True)
+        for proc in descendants:
+            _kill_if_chromium(proc)
+    except Exception:
+        pass
+
+
+def capture_urls_concurrent(
+    urls: Iterable[str],
+    store: ContentStore,
+    config,
+    fetcher_factory,
+    per_url_timeout: float | None = None,
+    reaper=_reap_browser_descendants,
+) -> PipelineSummary:
+    """Capture ``urls`` across ``config.concurrency`` worker threads.
+
+    Headless Chromium's sync API is **thread-affine** — a browser must be used on
+    the thread that created it — so each worker opens its **own** browser via
+    ``fetcher_factory(config)`` and runs all captures inline on its own thread.
+    The content store is shared (writes are keyed by URL hash and idempotent, so
+    shards never collide). Order of outcomes is not preserved.
+
+    Hang protection runs *out of band*: a supervisor thread watches each worker's
+    heartbeat and, if one is stuck on a single URL past ``per_url_timeout`` (a
+    wedged sync call whose Playwright timeout never fires — e.g. the machine
+    slept and severed the CDP pipe), it **kills the browser processes**. That is
+    an OS-level action (safe across threads, unlike touching Playwright objects),
+    so the blocked call errors out and the worker rebuilds via the same
+    dead-browser path — no single stuck worker can freeze the whole run.
+
+    Args:
+        urls: URLs to capture; consumed once into a list.
+        store: content store shared by all workers.
+        config: read for ``concurrency`` (default 1) and ``nav_timeout_ms``
+            (default 30000) and passed to ``fetcher_factory``.
+        fetcher_factory: ``fetcher_factory(config)`` must return a context
+            manager whose ``__enter__`` yields the fetcher for one worker.
+        per_url_timeout: seconds a worker may spend on one URL before the
+            supervisor reaps; defaults to ``max(90, 4 * nav timeout)``.
+        reaper: zero-argument callable that kills browser processes; also
+            called once when the run ends.
+
+    Returns:
+        A ``PipelineSummary`` holding one outcome per URL.
+
+    Raises:
+        Whatever a worker raises outside ``capture_url`` (for example the
+        fetcher factory failing) is re-raised from this call.
+    """
+    import time
+    from concurrent.futures import ThreadPoolExecutor
+
+    url_list = list(urls)
+    workers = max(1, int(getattr(config, "concurrency", 1) or 1))
+    if per_url_timeout is None:
+        nav_s = float(getattr(config, "nav_timeout_ms", 30000)) / 1000.0
+        per_url_timeout = max(90.0, nav_s * 4)
+
+    # worker index -> monotonic start of its current URL (None when between URLs)
+    heartbeats: dict[int, float | None] = {}
+    hb_lock = threading.Lock()
+    stop = threading.Event()
+
+    def supervisor() -> None:
+        interval = max(0.5, min(per_url_timeout / 2, 30.0))
+        while not stop.wait(interval):
+            now = time.monotonic()
+            with hb_lock:
+                stalled = [
+                    w
+                    for w, t in heartbeats.items()
+                    if t is not None and now - t > per_url_timeout
+                ]
+            if stalled:
+                logger.warning(
+                    "worker(s) %s stuck > %.0fs on one URL; killing browsers to recover",
+                    stalled,
+                    per_url_timeout,
+                )
+                reaper()
+                with hb_lock:  # grace: don't reap again before workers rebuild
+                    for w in list(heartbeats):
+                        if heartbeats[w] is not None:
+                            heartbeats[w] = now
+
+    def work(idx: int, shard: list[str]) -> list[CaptureOutcome]:
+        outcomes: list[CaptureOutcome] = []
+        cm = fetcher_factory(config)
+        pipeline = CapturePipeline(cm.__enter__(), store)
+        try:
+            for url in shard:
+                with hb_lock:
+                    heartbeats[idx] = time.monotonic()
+                outcome = pipeline.capture_url(url)
+                if outcome.status == "error" and _browser_died(outcome.reason):
+                    logger.warning(
+                        "browser died; rebuilding worker %d, retrying %s", idx, url
+                    )
+                    _close_quietly(cm)
+                    cm = fetcher_factory(config)
+                    pipeline = CapturePipeline(cm.__enter__(), store)
+                    with hb_lock:
+                        heartbeats[idx] = time.monotonic()
+                    outcome = pipeline.capture_url(url)  # one retry on a fresh browser
+                outcomes.append(outcome)
+                with hb_lock:
+                    heartbeats[idx] = None
+        finally:
+            # Clear the heartbeat even when leaving through an exception (e.g.
+            # the factory failing during a rebuild). A stale timestamp would
+            # make the supervisor see this finished worker as stalled forever
+            # and keep killing the healthy workers' browsers.
+            with hb_lock:
+                heartbeats[idx] = None
+            _close_quietly(cm)
+        return outcomes
+
+    supervisor_thread = threading.Thread(target=supervisor, daemon=True)
+    supervisor_thread.start()
+    try:
+        if workers == 1:
+            heartbeats[0] = None
+            return PipelineSummary(outcomes=work(0, url_list))
+
+        # Round-robin sharding: worker i takes every workers-th URL from i.
+        shards = [url_list[i::workers] for i in range(workers)]
+        for i in range(workers):
+            heartbeats[i] = None
+        summary = PipelineSummary()
+        with ThreadPoolExecutor(max_workers=workers) as pool:
+            futures = [pool.submit(work, i, shards[i]) for i in range(workers)]
+            for future in futures:
+                summary.outcomes.extend(future.result())
+        return summary
+    finally:
+        stop.set()
+        reaper()
diff --git a/forecasting_tools/agents_and_tools/source_archive/reindex.py b/forecasting_tools/agents_and_tools/source_archive/reindex.py
new file mode 100644
index 00000000..5a7472e6
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/reindex.py
@@ -0,0 +1,278 @@
+"""One-off reindex / dedup audit for an existing archive.
+
+This walks the canonical per-URL indexes already in a store and reports how much
+the smarter-dedup work (see ``ROADMAP.md`` Plan 1) would collapse, **without
+mutating anything by default**. It answers the practical question: *after exact
+canonicalization and content dedup, are there still many URLs that look like the
+same page?* — i.e. whether the fuzzy near-dup phase (D) is worth building.
+
+Three lenses:
+
+ - **Canonicalization (Phase A):** group stored URLs by :func:`canonicalize_url`.
+ Any group with >1 distinct raw URL is a set that *now* shares one key.
+ - **Content (Phase C):** group distinct canonical URLs by their latest content
+ hash. A group with >1 URL is byte-identical pages reachable at different URLs.
+ - **Near-dup signal (Phase D candidate):** of the URLs surviving both dedups,
+ group by ``scheme://host/path`` ignoring the query string. Big groups mean
+ "same path, differing query" pages that exact dedup leaves separate — the
+ cases fuzzy matching would target.
+
+Run it::
+
+ # against the configured S3 bucket (read-only audit)
+ WEB_ARCHIVE_S3_BUCKET=metaculus-web-archive WEB_ARCHIVE_AWS_PROFILE=default \\
+ python -m forecasting_tools.agents_and_tools.source_archive.reindex
+
+ # against a local capture dir
+ python -m forecasting_tools.agents_and_tools.source_archive.reindex --local ./archive
+
+ # additionally (re)build the content reverse index for existing captures
+ python -m forecasting_tools.agents_and_tools.source_archive.reindex --apply
+
+``--apply`` only writes the additive ``index/by-content/`` reverse index (safe,
+idempotent). It does **not** move blobs or re-key the per-URL indexes; that
+heavier migration is intentionally deferred (the archive is young).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from collections import defaultdict
+from urllib.parse import urlsplit
+
+from pydantic import BaseModel
+
+from forecasting_tools.agents_and_tools.source_archive.canonicalize import (
+ canonicalize_url,
+)
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
+from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
+ BlobStore,
+)
+
+
+class Cluster(BaseModel):
+    """A group of URLs that share one grouping key in the audit report."""
+
+    # What the members have in common: a canonical URL, a content hash, or a
+    # ``scheme://host/path`` string, depending on which lens built the cluster.
+    key: str
+    # The URLs that fell into this group.
+    urls: list[str]
+
+
+class AnalysisReport(BaseModel):
+    """Counters and clusters produced by one dedup audit.
+
+    ``str(report)`` renders a plain-text summary: the headline counts followed
+    by, for each non-empty cluster list, its five largest clusters with up to
+    four member URLs each.
+    """
+
+    total_url_indexes: int = 0
+    alias_indexes: int = 0  # already-collapsed redirects (Phase B)
+    canonical_captures: int = 0  # distinct stored URLs with content
+    distinct_after_canonicalization: int = 0
+    distinct_after_content_dedup: int = 0
+    canonicalization_clusters: list[Cluster] = []  # raw URLs that now share a key
+    content_clusters: list[Cluster] = []  # different URLs, identical content
+    near_dup_clusters: list[Cluster] = []  # same host+path, differing query
+
+    def __str__(self) -> str:
+        def merged(clusters: list[Cluster]) -> int:
+            # URLs a cluster list collapses away: all but one member per cluster.
+            return sum(len(cluster.urls) - 1 for cluster in clusters)
+
+        def section(title: str, clusters: list[Cluster], limit: int = 5) -> list[str]:
+            # Lines for one "top clusters" section; nothing for an empty list.
+            if not clusters:
+                return []
+            block = ["", f"--- top {title} ---"]
+            ranked = sorted(clusters, key=lambda x: len(x.urls), reverse=True)
+            for cluster in ranked[:limit]:
+                block.append(f" [{len(cluster.urls)}] {cluster.key}")
+                block.extend(f" {u}" for u in cluster.urls[:4])
+                if len(cluster.urls) > 4:
+                    block.append(f" … +{len(cluster.urls) - 4} more")
+            return block
+
+        merged_a = merged(self.canonicalization_clusters)
+        merged_c = merged(self.content_clusters)
+        report_lines = [
+            "Source-archive dedup audit",
+            "=" * 40,
+            f"URL indexes scanned : {self.total_url_indexes}",
+            f" of which alias (redirect) : {self.alias_indexes}",
+            f" of which canonical capture : {self.canonical_captures}",
+            "",
+            f"Distinct URLs (raw) : {self.canonical_captures}",
+            f"After canonicalization (A) : {self.distinct_after_canonicalization}"
+            f" (−{merged_a} merged)",
+            f"After content dedup (C) : {self.distinct_after_content_dedup}"
+            f" (−{merged_c} byte-identical)",
+            "",
+            f"Canonicalization clusters : {len(self.canonicalization_clusters)}",
+            f"Identical-content clusters : {len(self.content_clusters)}",
+            f"Near-dup candidates (D) : {len(self.near_dup_clusters)}"
+            " (same host+path, differing query)",
+        ]
+        report_lines += section(
+            "canonicalization clusters", self.canonicalization_clusters
+        )
+        report_lines += section("identical-content clusters", self.content_clusters)
+        report_lines += section(
+            "near-dup candidates (Phase D signal)", self.near_dup_clusters
+        )
+        return "\n".join(report_lines)
+
+
+def _host_path(url: str) -> str:
+    """Return ``scheme://host/path`` of the canonicalized URL, without the query."""
+    split = urlsplit(canonicalize_url(url))
+    return "".join((split.scheme, "://", split.netloc, split.path))
+
+
+def iter_url_indexes(store: BlobStore, prefix: str):
+    """Yield ``(key, parsed_index)`` for every per-URL index under ``prefix``.
+
+    Keys are listed below ``<prefix>/index/``. Anything not ending in ``.json``
+    and everything under the reverse content index ``index/by-content/`` is
+    skipped, as are blobs that are not valid UTF-8 JSON.
+    """
+    root = prefix.rstrip("/") + "/index/"
+    reverse_root = root + "by-content/"
+    for key in store.list_keys(root):
+        is_url_index = key.endswith(".json") and not key.startswith(reverse_root)
+        if not is_url_index:
+            continue
+        try:
+            parsed = json.loads(store.get(key).decode("utf-8"))
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            continue
+        yield key, parsed
+
+
+def analyze(store: BlobStore, config: ArchiveConfig) -> AnalysisReport:
+    """Audit the stored per-URL indexes for dedup potential (read-only).
+
+    Walks every per-URL index under ``config.s3_prefix``. Alias indexes are
+    counted and skipped; indexes without a URL or without captures are ignored.
+    The remaining URLs are examined through three lenses:
+
+    - canonicalization: raw URLs grouped by :func:`canonicalize_url`;
+    - content: canonical URLs grouped by their latest content hash;
+    - near-dup signal: canonical URLs grouped by ``scheme://host/path``.
+
+    Returns an :class:`AnalysisReport` with the counts and every group that has
+    more than one member.
+    """
+    report = AnalysisReport()
+    by_canonical: dict[str, list[str]] = defaultdict(list)
+    # content hash -> canonical URLs. Canonical (not raw) so that raw variants
+    # of one canonical URL are not counted as merged a second time here.
+    by_content: dict[str, set[str]] = defaultdict(set)
+
+    for _key, index in iter_url_indexes(store, config.s3_prefix):
+        report.total_url_indexes += 1
+        if index.get("alias_of"):
+            report.alias_indexes += 1
+            continue
+        url = index.get("url")
+        if not url or not index.get("captures"):
+            continue
+        report.canonical_captures += 1
+        canonical = canonicalize_url(url)
+        by_canonical[canonical].append(url)
+        ch = index.get("latest_content_hash")
+        if ch:
+            by_content[ch].add(canonical)
+
+    report.distinct_after_canonicalization = len(by_canonical)
+    report.canonicalization_clusters = [
+        Cluster(key=k, urls=sorted(set(v)))
+        for k, v in by_canonical.items()
+        if len(set(v)) > 1
+    ]
+
+    # Content dedup operates on the canonicalized URL set.
+    content_groups = {k: sorted(v) for k, v in by_content.items()}
+    report.content_clusters = [
+        Cluster(key=k, urls=v) for k, v in content_groups.items() if len(v) > 1
+    ]
+    # distinct pages after content dedup = canonical URLs minus those merged away
+    merged_by_content = sum(len(v) - 1 for v in content_groups.values() if len(v) > 1)
+    report.distinct_after_content_dedup = max(
+        0, report.distinct_after_canonicalization - merged_by_content
+    )
+
+    # Phase D signal: among canonical URLs, same host+path but differing query.
+    # The keys of by_canonical are exactly the canonical URLs. Content-identical
+    # URLs are not removed here, so they can still appear in these clusters.
+    by_host_path: dict[str, set[str]] = defaultdict(set)
+    for u in by_canonical:
+        by_host_path[_host_path(u)].add(u)
+    report.near_dup_clusters = [
+        Cluster(key=k, urls=sorted(v)) for k, v in by_host_path.items() if len(v) > 1
+    ]
+    return report
+
+
def rebuild_content_index(
    store: BlobStore, config: ArchiveConfig, *, apply: bool
) -> int:
    """Regenerate ``index/by-content/`` from the captures already indexed.

    Per-URL indexes are grouped by ``latest_content_hash``; alias indexes,
    indexes without captures, and indexes missing ``url_hash``, ``url`` or
    ``latest_content_hash`` are ignored. With ``apply`` set, every member of
    every group is re-registered through ``ContentStore._register_content``;
    only the first member of a group is registered together with its blob keys.

    Args:
        store: Blob store holding the archive.
        config: Archive configuration.
        apply: When false, nothing is registered and only the count is returned.

    Returns:
        The number of content groups written (or that would be written).
    """
    cstore = ContentStore(store, config)
    members_by_hash: dict[str, list[tuple[str, str]]] = defaultdict(list)
    for _key, index in iter_url_indexes(store, config.s3_prefix):
        if index.get("alias_of"):
            continue
        if not index.get("captures"):
            continue
        url_digest = index.get("url_hash")
        url = index.get("url")
        content_hash = index.get("latest_content_hash")
        if not (url_digest and url and content_hash):
            continue
        members_by_hash[content_hash].append((url_digest, url))

    if apply:
        for content_hash, members in members_by_hash.items():
            # The first member listed for a hash is treated as the owner.
            owner_digest = members[0][0]
            for url_digest, url in members:
                blob_keys = (
                    index_blob_keys(store, config, owner_digest, content_hash)
                    if url_digest == owner_digest
                    else None
                )
                cstore._register_content(content_hash, url_digest, url, blob_keys)
    return len(members_by_hash)
+
+
def index_blob_keys(
    store: BlobStore, config: ArchiveConfig, uh: str, ch: str
) -> dict | None:
    """Look up the blob keys recorded for capture ``ch`` of URL hash ``uh``.

    Returns:
        A dict with ``html``, ``markdown`` and ``screenshot`` entries taken
        from the capture's ``*_key`` fields, or ``None`` when the URL has no
        index or the index has no capture for ``ch``.
    """
    index = ContentStore(store, config)._read_index(uh)
    capture = (index.get("captures") or {}).get(ch) if index else None
    if not capture:
        return None
    fields = (
        ("html", "html_key"),
        ("markdown", "markdown_key"),
        ("screenshot", "screenshot_key"),
    )
    return {name: capture.get(field) for name, field in fields}
+
+
+def _build_store(local_dir: str | None, bucket: str | None, config: ArchiveConfig):
+ if local_dir:
+ from forecasting_tools.agents_and_tools.source_archive.storage import (
+ LocalBlobStore,
+ )
+
+ return LocalBlobStore(local_dir)
+ bucket = bucket or config.s3_bucket
+ if not bucket:
+ sys.exit(
+ "No S3 bucket configured. Set WEB_ARCHIVE_S3_BUCKET (or pass --bucket), "
+ "or use --local DIR."
+ )
+ from forecasting_tools.agents_and_tools.source_archive.storage import S3BlobStore
+
+ return S3BlobStore(bucket, config=config)
+
+
def main(argv: list[str] | None = None) -> int:
    """Run the ``source-archive-reindex`` command line.

    Prints the audit report (as JSON with ``--json``) and, with ``--apply``,
    rebuilds ``index/by-content/`` afterwards.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Process exit status, always ``0`` on normal completion.
    """
    cli = argparse.ArgumentParser(
        prog="source-archive-reindex",
        description="Audit (and optionally rebuild) dedup structures for an "
        "existing archive.",
    )
    cli.add_argument("--local", metavar="DIR", help="audit a local capture dir")
    cli.add_argument("--bucket", help="override WEB_ARCHIVE_S3_BUCKET")
    cli.add_argument(
        "--apply",
        action="store_true",
        help="rebuild index/by-content/ for existing captures (additive)",
    )
    cli.add_argument("--json", action="store_true", help="emit the report as JSON")
    options = cli.parse_args(argv)

    config = ArchiveConfig.from_env()
    store = _build_store(options.local, options.bucket, config)

    report = analyze(store, config)
    rendered = report.model_dump_json(indent=2) if options.json else report
    print(rendered)

    if not options.apply:
        return 0
    group_count = rebuild_content_index(store, config, apply=True)
    print(f"\nRebuilt index/by-content/ for {group_count} content group(s).")
    return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/forecasting_tools/agents_and_tools/source_archive/reports.py b/forecasting_tools/agents_and_tools/source_archive/reports.py
new file mode 100644
index 00000000..ba75b4b6
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/reports.py
@@ -0,0 +1,72 @@
+"""Persist each capture run's per-URL outcomes to ``reports/<run_id>.json``.
+
+The coverage report's job is to surface sources we should be collecting. A cited
+source we have not archived falls into two very different buckets:
+
+- **never fetched** — it was harvested into a manifest but no capture run ever
+ attempted it. This is the real "we should go collect this" signal.
+- **fetched but failed** — we tried and the fetch/quality gate rejected it
+ (Cloudflare, PDF, 404…). A capture problem, not a collection problem.
+
+Without persisted run outcomes the two are indistinguishable. Writing each run's
+outcomes here lets coverage tell them apart.
+"""
+
+from __future__ import annotations
+
+import json
+
+from forecasting_tools.agents_and_tools.source_archive.canonicalize import (
+ canonicalize_url,
+)
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
+ BlobStore,
+)
+
+CAPTURED_STATUSES = {"stored", "deduped", "cache_hit"}
+FAILED_STATUSES = {"quality_failed", "error"}
+
+
def report_key(run_id: str, config: ArchiveConfig) -> str:
    """Return the storage key of the outcome report written for ``run_id``."""
    base = config.s3_prefix.rstrip("/")
    return "/".join((base, "reports", f"{run_id}.json"))
+
+
def write_run_report(
    store: BlobStore, run_id: str, summary, config: ArchiveConfig
) -> str:
    """Store one JSON row per outcome of a capture run and return the key used.

    ``summary`` is a ``PipelineSummary``; each of its ``outcomes`` contributes
    its ``url``, ``status`` and (when present) ``reason``.
    """
    rows = []
    for outcome in summary.outcomes:
        rows.append(
            {
                "url": outcome.url,
                "status": outcome.status,
                # Not every outcome carries a reason; default to empty.
                "reason": getattr(outcome, "reason", ""),
            }
        )
    key = report_key(run_id, config)
    payload = json.dumps(rows, indent=2).encode("utf-8")
    store.put(key, payload, content_type="application/json")
    return key
+
+
def read_outcomes(store: BlobStore, config: ArchiveConfig) -> dict[str, str]:
    """Map canonical URL -> last known capture status across all run reports.

    Reports under ``<prefix>/reports/`` are applied in ``store.list_keys``
    order. A captured status (``CAPTURED_STATUSES``) always wins: once a URL
    has been captured, later failures do not overwrite it. Among non-captured
    statuses the most recently read one is kept.

    Malformed input is skipped rather than raised: reports that are not valid
    UTF-8 JSON or not a JSON list, rows that are not objects, and rows without
    a usable string ``url``. A missing or non-string ``status`` is recorded
    as ``""``.

    Returns:
        ``{canonical_url: status}``; ``{}`` if no reports exist yet.
    """
    prefix = config.s3_prefix.rstrip("/")
    out: dict[str, str] = {}
    for key in store.list_keys(f"{prefix}/reports/"):
        if not key.endswith(".json"):
            continue
        try:
            rows = json.loads(store.get(key).decode("utf-8"))
        except (UnicodeDecodeError, ValueError):
            continue
        if not isinstance(rows, list):
            continue
        for row in rows:
            if not isinstance(row, dict):
                continue
            raw_url = row.get("url")
            if not raw_url or not isinstance(raw_url, str):
                continue
            url = canonicalize_url(raw_url)
            if not url:
                continue
            status = row.get("status")
            if not isinstance(status, str):
                status = ""
            # Never downgrade a URL that was captured at least once.
            if status in CAPTURED_STATUSES or out.get(url) not in CAPTURED_STATUSES:
                out[url] = status
    return out
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py
index c70d676f..7553c972 100644
--- a/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py
+++ b/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py
@@ -6,6 +6,7 @@
from __future__ import annotations
+from collections.abc import Iterable
from typing import Protocol, runtime_checkable
@@ -18,3 +19,7 @@ def put(
def get(self, key: str) -> bytes: ...
def exists(self, key: str) -> bool: ...
+
+ def list_keys(self, prefix: str = "") -> Iterable[str]:
+ """Yield every stored key beginning with ``prefix`` (for reindex/audit)."""
+ ...
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py
index 429333ab..d85b0b0b 100644
--- a/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py
+++ b/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py
@@ -22,3 +22,13 @@ def get(self, key: str) -> bytes:
def exists(self, key: str) -> bool:
return self._path(key).exists()
+
+ def list_keys(self, prefix: str = "") -> list[str]:
+ if not self.root.exists():
+ return []
+ keys = [
+ p.relative_to(self.root).as_posix()
+ for p in self.root.rglob("*")
+ if p.is_file()
+ ]
+ return sorted(k for k in keys if k.startswith(prefix))
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py
index 0d4822b0..10914b94 100644
--- a/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py
+++ b/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py
@@ -58,3 +58,9 @@ def exists(self, key: str) -> bool:
if code in ("404", "NoSuchKey", "NotFound"):
return False
raise
+
+ def list_keys(self, prefix: str = ""):
+ paginator = self._get_client().get_paginator("list_objects_v2")
+ for page in paginator.paginate(Bucket=self.bucket, Prefix=prefix):
+ for obj in page.get("Contents", []):
+ yield obj["Key"]
diff --git a/forecasting_tools/agents_and_tools/source_archive/viewer.py b/forecasting_tools/agents_and_tools/source_archive/viewer.py
new file mode 100644
index 00000000..fe604a2b
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/viewer.py
@@ -0,0 +1,409 @@
+"""Streamlit viewer for the source archive.
+
+Browse what the capture pipeline stored in S3: pick a captured URL and see its
+**screenshot, markdown, and HTML** side by side, with the question/bot it came
+from. Reads provenance from the run manifests and resolves each URL's latest
+capture through its per-URL index — no local file wrangling.
+
+Run it::
+
+ # uses the same env as the rest of the archive (WEB_ARCHIVE_S3_BUCKET, etc.)
+ AWS_PROFILE=default WEB_ARCHIVE_S3_BUCKET=metaculus-web-archive \\
+ streamlit run forecasting_tools/agents_and_tools/source_archive/viewer.py
+
+Nothing here is deployment-specific: bucket/prefix/profile come from
+``ArchiveConfig.from_env()``.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+# `streamlit run <script>` puts only the script's own directory on sys.path, not
+# the repo root — so make `import forecasting_tools` work whether the package is
+# pip-installed or just checked out. (viewer.py -> source_archive -> agents_and_tools
+# -> forecasting_tools -> <repo root>.)
+_REPO_ROOT = Path(__file__).resolve().parents[3]
+if str(_REPO_ROOT) not in sys.path:
+ sys.path.insert(0, str(_REPO_ROOT))
+
+import pandas as pd # noqa: E402
+import streamlit as st # noqa: E402
+
+from forecasting_tools.agents_and_tools.source_archive.config import ( # noqa: E402
+ ArchiveConfig,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import ( # noqa: E402
+ url_hash,
+)
+
+# --- S3 access (cached) ----------------------------------------------------
+
+
@st.cache_resource(show_spinner=False)
def _client(profile: str | None, region: str | None):
    """Create an S3 client for the given profile/region; cached by Streamlit.

    Empty strings are passed to boto3 as ``None``. boto3 is imported lazily,
    inside the function.
    """
    import boto3

    session = boto3.Session(
        profile_name=profile or None,
        region_name=region or None,
    )
    return session.client("s3")
+
+
def _cfg() -> ArchiveConfig:
    """Build the archive configuration from the process environment."""
    config = ArchiveConfig.from_env()
    return config
+
+
@st.cache_data(show_spinner=False)
def _list_keys(bucket: str, prefix: str) -> list[str]:
    """List the archive keys that start with ``prefix`` (cached by Streamlit).

    With ``cfg.local_dir`` set, keys are the POSIX-style paths of files under
    that directory, sorted so the listing does not depend on filesystem order;
    a missing directory gives ``[]``. Otherwise keys are read from S3 bucket
    ``bucket`` through the ``list_objects_v2`` paginator.

    Args:
        bucket: S3 bucket name; it is not used to locate a local archive.
        prefix: Key prefix to match.
    """
    cfg = _cfg()
    if cfg.local_dir:  # filesystem-backed archive — list matching files
        root = Path(cfg.local_dir)
        if not root.exists():
            return []
        keys = (
            p.relative_to(root).as_posix() for p in root.rglob("*") if p.is_file()
        )
        return sorted(k for k in keys if k.startswith(prefix))

    s3 = _client(cfg.aws_profile, cfg.aws_region)
    paginator = s3.get_paginator("list_objects_v2")
    found: list[str] = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # Pages with no matches carry no "Contents" entry.
        found.extend(obj["Key"] for obj in page.get("Contents", []))
    return found
+
+
@st.cache_data(show_spinner=False)
def _get_bytes(bucket: str, key: str) -> bytes | None:
    """Read one archive blob, or return ``None`` when it does not exist.

    With ``cfg.local_dir`` set the blob is read from ``<local_dir>/<key>``;
    otherwise it is fetched from S3 bucket ``bucket``.

    Only "no such key" is reported as ``None``. Other S3 errors (credentials,
    throttling, network) propagate: the result is cached by Streamlit, so
    swallowing them would keep showing an existing capture as missing until
    the cache is cleared.
    """
    cfg = _cfg()
    if cfg.local_dir:
        path = Path(cfg.local_dir) / key
        # is_file() rather than exists(): a directory at this path is not a blob.
        return path.read_bytes() if path.is_file() else None
    s3 = _client(cfg.aws_profile, cfg.aws_region)
    try:
        return s3.get_object(Bucket=bucket, Key=key)["Body"].read()
    except s3.exceptions.NoSuchKey:
        return None
+
+
+# Metaculus question id -> review URL. Derived at display time (not stored) so
+# there's no redundant, drift-prone URL column in S3.
+_METACULUS_QUESTION_BASE = "https://www.metaculus.com/questions/"
+
+
+def _metaculus_url(metaculus_id) -> str:
+ if metaculus_id in (None, "", "null"):
+ return ""
+ return f"{_METACULUS_QUESTION_BASE}{metaculus_id}/"
+
+
def _comment_url(metaculus_id, comment_id) -> str:
    """Return a deep link to the comment a URL was cited in.

    Gives ``""`` when the question URL cannot be built or when ``comment_id``
    is ``None``, ``""`` or the string ``"null"``.
    """
    question_url = _metaculus_url(metaculus_id)
    if question_url and comment_id not in (None, "", "null"):
        return f"{question_url}#comment-{comment_id}"
    return ""
+
+
@st.cache_data(show_spinner="Loading manifests…")
def _manifest_rows(bucket: str, prefix: str) -> pd.DataFrame:
    """Every (question, bot, url) the bots cited, from the run manifests.

    Each blob under ``<prefix>/manifests/`` is read as JSON Lines. Blank lines,
    lines that are not valid JSON, and lines whose JSON is not an object are
    skipped so one corrupt record cannot take the viewer down. Undecodable
    bytes are replaced rather than raised.

    Args:
        bucket: Archive location passed through to the blob readers.
        prefix: Archive key prefix; a trailing slash is tolerated.

    Returns:
        One row per distinct ``(question, bot, run_id, origin, url)``; an empty
        frame when no manifest rows were found.
    """
    rows = []
    for key in _list_keys(bucket, f"{prefix.rstrip('/')}/manifests/"):
        body = _get_bytes(bucket, key)
        if not body:
            continue
        for line in body.decode("utf-8", "replace").splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(record, dict):
                continue
            metaculus_id = record.get("metaculus_id")
            rows.append(
                {
                    "question": record.get("question_id") or "(none)",
                    "bot": record.get("bot") or "(none)",
                    "run_id": record.get("run_id") or "",
                    "origin": record.get("origin") or "",
                    "query": record.get("query") or "",
                    "metaculus": _metaculus_url(metaculus_id),
                    "comment": _comment_url(metaculus_id, record.get("comment_id")),
                    # A JSON null url becomes "" so the column stays all-string.
                    "url": record.get("url") or "",
                    "question_url": record.get("question_url") or "",
                    "tool_args": record.get("tool_args"),
                }
            )
    df = pd.DataFrame(rows)
    if not df.empty:
        # Keep distinct provenance (a URL cited via two origins/runs = two rows).
        df = df.drop_duplicates(
            subset=["question", "bot", "run_id", "origin", "url"]
        ).reset_index(drop=True)
    return df
+
+
def _scrape_report(bucket: str, prefix: str, view: pd.DataFrame):
    """Tally, per question, which backend captured each cited URL.

    For every row of ``view`` the URL's latest stored capture is looked up and
    counted under its ``fetcher``: ``playwright``, ``firecrawl``, or ``other``
    for anything else. Rows without a stored capture count towards ``urls``
    only.

    Self-hosted Playwright is free; Firecrawl (the fallback) costs ~1 credit
    per page, which is why its pages are counted separately.

    Caveat: only successful captures are recorded in the index, so a Firecrawl
    attempt that failed its quality gate isn't counted here — billed attempts
    aren't yet persisted (see the note in the UI).

    Returns:
        ``{question: {"question", "urls", "captured", "playwright",
        "firecrawl", "other"}}``.
    """
    per_q: dict[str, dict] = {}
    for _, row in view.iterrows():
        cap = _index(bucket, prefix, row["url"])
        question = row["question"]
        if question not in per_q:
            per_q[question] = {
                "question": question,
                "urls": 0,
                "captured": 0,
                "playwright": 0,
                "firecrawl": 0,
                "other": 0,
            }
        tally = per_q[question]
        tally["urls"] += 1
        if cap:
            tally["captured"] += 1
            fetcher = (cap.get("fetcher") or "").lower()
            counter = fetcher if fetcher in ("playwright", "firecrawl") else "other"
            tally[counter] += 1
    return per_q
+
+
@st.cache_data(show_spinner=False)
def _index(bucket: str, prefix: str, url: str) -> dict | None:
    """Latest stored capture for a URL (keys + metadata), or None if uncaptured.

    Reads ``<prefix>/index/<url_hash(url)>.json`` and returns the entry of its
    ``captures`` map selected by ``latest_content_hash``. A missing, corrupt or
    unexpectedly shaped index is treated as "no capture" instead of raising.

    NOTE(review): indexes that carry ``alias_of`` are not followed here, so an
    aliased URL shows as uncaptured — confirm whether the viewer should resolve
    aliases.

    Args:
        bucket: Archive location passed through to the blob reader.
        prefix: Archive key prefix; a trailing slash is tolerated.
        url: The cited URL to look up.
    """
    body = _get_bytes(bucket, f"{prefix.rstrip('/')}/index/{url_hash(url)}.json")
    if not body:
        return None
    try:
        idx = json.loads(body.decode("utf-8"))
    except (UnicodeDecodeError, json.JSONDecodeError):
        return None
    if not isinstance(idx, dict):
        return None
    captures = idx.get("captures")
    if not isinstance(captures, dict):
        return None
    cap = captures.get(idx.get("latest_content_hash"))
    # Callers use the capture as a dict; anything else counts as uncaptured.
    return cap if isinstance(cap, dict) else None
+
+
+# --- UI --------------------------------------------------------------------
+
+
# Capture status is resolved per URL, so the table and the cost report only
# look at the first rows of a large filtered set.
_MAX_TABLE_ROWS = 300


def _short_timestamp(cap: dict | None) -> str:
    """Return ``captured_at`` cut to 19 characters; ``""`` if absent or null."""
    return str((cap or {}).get("captured_at") or "")[:19]


def _render_capture_table(view: pd.DataFrame, location: str, prefix: str) -> None:
    """Show the filtered citations with each URL's capture status."""
    if len(view) > _MAX_TABLE_ROWS:
        st.info(
            f"Showing {_MAX_TABLE_ROWS} of {len(view)} — narrow with the filters "
            "for capture details."
        )
    table = []
    for _, row in view.head(_MAX_TABLE_ROWS).iterrows():
        cap = _index(location, prefix, row["url"])
        table.append(
            {
                "question": row["question"],
                "bot": row["bot"],
                "run_id": row["run_id"],
                "origin": row["origin"],
                "captured": "✅" if cap else "—",
                "fetcher": (cap or {}).get("fetcher", ""),
                "captured_at": _short_timestamp(cap),
                "metaculus": row["metaculus"],
                "comment": row["comment"],
                "url": row["url"],
            }
        )
    st.dataframe(
        pd.DataFrame(table),
        use_container_width=True,
        hide_index=True,
        column_config={
            # Show the full link address as the clickable text (not a label).
            "url": st.column_config.LinkColumn("url"),
            "metaculus": st.column_config.LinkColumn(
                "metaculus", display_text="question ↗"
            ),
            "comment": st.column_config.LinkColumn("comment", display_text="comment ↗"),
        },
    )


def _render_cost_report(view: pd.DataFrame, location: str, prefix: str) -> None:
    """Show per-question backend counts and the estimated Firecrawl spend."""
    st.subheader("💸 Scraping cost (filtered set)")
    rate = st.number_input(
        "Firecrawl cost per page ($)",
        min_value=0.0,
        value=0.001,
        step=0.0005,
        format="%.4f",
        help="Self-hosted Playwright is free; this prices the Firecrawl "
        "fallback. Adjust to your plan's credit rate.",
    )
    per_q = _scrape_report(location, prefix, view.head(_MAX_TABLE_ROWS))
    tallies = sorted(per_q.values(), key=lambda a: a["question"])
    rows = [
        {
            "question": agg["question"],
            "urls": agg["urls"],
            "captured": agg["captured"],
            "playwright (free)": agg["playwright"],
            "firecrawl (paid)": agg["firecrawl"],
            "firecrawl $": round(agg["firecrawl"] * rate, 4),
        }
        for agg in tallies
    ]
    t_fc = sum(agg["firecrawl"] for agg in tallies)
    t_pw = sum(agg["playwright"] for agg in tallies)
    t_cap = sum(agg["captured"] for agg in tallies)
    t_url = sum(agg["urls"] for agg in tallies)
    st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)
    a, b, c = st.columns(3)
    a.metric("Captured", f"{t_cap}/{t_url}")
    b.metric("Firecrawl pages", t_fc, help="Playwright pages are free")
    c.metric("Est. Firecrawl cost", f"${t_fc * rate:.4f}")
    st.caption(
        f"Playwright (free): {t_pw} · Firecrawl (paid): {t_fc}. "
        "⚠️ Only **successful** captures carry a fetcher in the index, so "
        "Firecrawl attempts that failed the quality gate aren't counted — "
        "billed-attempt tracking needs the pipeline to persist fetch attempts."
    )


def _render_provenance(row, cap: dict | None) -> None:
    """Show where the selected URL was cited and, if captured, how."""
    url = row["url"]
    c1, c2 = st.columns([3, 2])
    with c1:
        st.markdown(f"**URL:** [{url}]({url})")
        st.markdown(
            f"**Question:** `{row['question']}` · **Bot:** `{row['bot']}` · "
            f"**Origin:** `{row['origin'] or '—'}`"
        )
        st.markdown(f"**Run:** `{row['run_id'] or '—'}`")
        review = row["metaculus"] or row["question_url"]
        if review:
            st.markdown(f"**Metaculus question:** [{review}]({review})")
        if row["comment"]:
            st.markdown(f"**Cited in comment:** [{row['comment']}]({row['comment']})")
        if row["query"]:
            st.markdown(f"**Search query:** `{row['query']}`")
        if row.get("tool_args"):
            st.markdown(f"**Tool args:** `{row['tool_args']}`")
    with c2:
        if cap:
            st.markdown(
                f"**Captured:** {_short_timestamp(cap)} · "
                f"**Fetcher:** {cap.get('fetcher','')} · "
                f"**HTTP:** {cap.get('status_code','?')}"
            )


def _render_artifacts(cap: dict, location: str) -> None:
    """Show the stored screenshot, markdown and HTML of one capture in tabs."""
    tab_shot, tab_md, tab_html = st.tabs(["🖼 Screenshot", "📝 Markdown", "🌐 HTML"])

    with tab_shot:
        key = cap.get("screenshot_key")
        data = _get_bytes(location, key) if key else None
        if data:
            st.download_button("Download .webp", data, file_name="screenshot.webp")
            st.image(data, use_container_width=True)
        else:
            st.info("No screenshot stored.")

    with tab_md:
        key = cap.get("markdown_key")
        data = _get_bytes(location, key) if key else None
        if data:
            text = data.decode("utf-8", "replace")
            st.download_button("Download .md", data, file_name="page.md")
            st.caption(f"{len(text):,} chars")
            st.markdown(text)
        else:
            st.info("No markdown stored.")

    with tab_html:
        key = cap.get("html_key")
        data = _get_bytes(location, key) if key else None
        if data:
            html = data.decode("utf-8", "replace")
            st.download_button("Download .html", data, file_name="page.html")
            st.caption(
                f"{len(html):,} chars · rendered below (CSS/images load from the "
                "original site and may not all resolve — the screenshot is the "
                "faithful visual record)."
            )
            # NOTE(review): this renders archived third-party HTML as-is;
            # confirm the embedding is isolated enough for untrusted pages.
            st.components.v1.html(html, height=800, scrolling=True)
        else:
            st.info("No HTML stored.")


def main() -> None:
    """Render the Streamlit page: filters, citation table, cost, inspector.

    Stops early (``st.stop()``) when no archive is configured, when no
    manifests are found, when the filters leave nothing to inspect, or when the
    selected URL has no stored capture.
    """
    st.set_page_config(page_title="Source Archive Viewer", layout="wide")
    cfg = _cfg()
    st.title("📚 Source Archive Viewer")

    location = cfg.local_dir or cfg.s3_bucket
    if not location:
        st.error(
            "No archive configured. Set WEB_ARCHIVE_LOCAL_DIR (a local capture "
            "directory) or WEB_ARCHIVE_S3_BUCKET (S3), then reload."
        )
        st.stop()
    if cfg.local_dir:
        st.caption(f"📂 local: {cfg.local_dir}/{cfg.s3_prefix}")
    else:
        st.caption(
            f"s3://{cfg.s3_bucket}/{cfg.s3_prefix} · "
            f"profile={cfg.aws_profile or 'default'}"
        )

    with st.sidebar:
        st.header("Filters")
        if st.button("🔄 Refresh"):
            st.cache_data.clear()
            st.rerun()

    df = _manifest_rows(location, cfg.s3_prefix)
    if df.empty:
        st.warning("No manifests found under this prefix yet. Run a capture first.")
        st.stop()

    with st.sidebar:
        bots = sorted(df["bot"].unique())
        qs = sorted(df["question"].unique())
        sel_bots = st.multiselect("Bot", bots, default=bots)
        sel_qs = st.multiselect("Question", qs, default=qs)
        search = st.text_input("URL contains")

    view = df[df["bot"].isin(sel_bots) & df["question"].isin(sel_qs)]
    if search:
        # Plain substring match: URLs are full of regex metacharacters
        # ("?", ".", "+", "("), which must not be interpreted as a pattern.
        view = view[
            view["url"].str.contains(search, case=False, na=False, regex=False)
        ]
    view = view.reset_index(drop=True)

    st.subheader(f"{len(view)} cited URL(s)")

    # Resolve capture status for the filtered rows (cached per-URL).
    _render_capture_table(view, location, cfg.s3_prefix)

    if st.sidebar.checkbox("💸 Show scraping cost"):
        _render_cost_report(view, location, cfg.s3_prefix)

    st.divider()
    st.subheader("Inspect a capture")
    labels = [f"[{q}] {u}" for q, u in zip(view["question"], view["url"])]
    if not labels:
        st.stop()
    choice = st.selectbox("URL", range(len(labels)), format_func=lambda i: labels[i])
    row = view.iloc[choice]
    cap = _index(location, cfg.s3_prefix, row["url"])

    _render_provenance(row, cap)

    if not cap:
        st.warning(
            "No stored capture for this URL — it failed the quality gate / errored, "
            "or hasn't been captured yet."
        )
        st.stop()

    _render_artifacts(cap, location)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/poetry.lock b/poetry.lock
index c0fcff5e..f741fa95 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -867,6 +867,29 @@ files = [
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
+[[package]]
+name = "cloakbrowser"
+version = "0.3.32"
+description = "Stealth Chromium that passes every bot detection test. Drop-in Playwright replacement with source-level fingerprint patches."
+optional = true
+python-versions = ">=3.9"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "cloakbrowser-0.3.32-py3-none-any.whl", hash = "sha256:5a993ee019bfcd00d545d7d6d51837646bcb1e8226545acdf0b543b38a8883df"},
+ {file = "cloakbrowser-0.3.32.tar.gz", hash = "sha256:7361e2f5e366f651b5d54aad3ac13e145462110e0956b538ae3686916c36535a"},
+]
+
+[package.dependencies]
+httpx = ">=0.24"
+playwright = ">=1.40"
+
+[package.extras]
+dev = ["pytest (>=7.0)", "pytest-asyncio (>=0.23)"]
+geoip = ["geoip2 (>=4.0)", "socksio (>=1.0)"]
+patchright = ["patchright (>=1.40)"]
+serve = ["aiohttp (>=3.9)", "websockets (>=12.0)"]
+
[[package]]
name = "colorama"
version = "0.4.6"
@@ -5371,6 +5394,46 @@ dev = ["coverage[toml] (==7.10.7)", "cryptography (>=3.4.0)", "pre-commit", "pyt
docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"]
tests = ["coverage[toml] (==7.10.7)", "pytest (>=8.4.2,<9.0.0)"]
+[[package]]
+name = "pymupdf"
+version = "1.27.2.3"
+description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents."
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "pymupdf-1.27.2.3-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fc1bc3cae6e9e150b0dbb0a9221bdfd411d65f0db2fe359eaa22467d7cc2a05f"},
+ {file = "pymupdf-1.27.2.3-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:660d93cb6da5bbddf11d3982ae27745dd3a9902d9f24cdb69adab83962294b5a"},
+ {file = "pymupdf-1.27.2.3-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1dd460a3ae4597a755f00a3bd9771f5ebf1531dc111f6a36bf05dd00a6b84425"},
+ {file = "pymupdf-1.27.2.3-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:857842b4888827bd6155a1131341b2822a7ebe9a8c15a975fd7d490d7a64a30c"},
+ {file = "pymupdf-1.27.2.3-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:580983849c64a08d08344ca3d1580e87c01f046a8392421797bc850efd72a5b6"},
+ {file = "pymupdf-1.27.2.3-cp310-abi3-win32.whl", hash = "sha256:a5c1088a87189891a4946ab314a14b7934ac4c5b6077f7e74ebee956f8906d0e"},
+ {file = "pymupdf-1.27.2.3-cp310-abi3-win_amd64.whl", hash = "sha256:d20f68ef15195e073071dbc4ae7455257c7889af7584e39df490c0a92728526e"},
+ {file = "pymupdf-1.27.2.3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:77691604c5d1d0233827139bbcdea61fd57879c84712b8e49b1f45520f7ab9c2"},
+ {file = "pymupdf-1.27.2.3.tar.gz", hash = "sha256:7a92faa25129e8bbec5e50eeb9214f187665428c31b05c4ef6e36c58c0b1c6d2"},
+]
+
+[[package]]
+name = "pymupdf4llm"
+version = "0.3.4"
+description = "PyMuPDF Utilities for LLM/RAG"
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "pymupdf4llm-0.3.4-py3-none-any.whl", hash = "sha256:0517492f82af978541162ade20fc54649cdca52acd478e33b97cb6171d69956f"},
+ {file = "pymupdf4llm-0.3.4.tar.gz", hash = "sha256:48d396a5fb3c14351493c7f1dd25b2a843efdbdc4526e489ee100643a2cebec1"},
+]
+
+[package.dependencies]
+pymupdf = ">=1.27.1"
+tabulate = "*"
+
+[package.extras]
+layout = ["pymupdf-layout (>=1.27.1)"]
+
[[package]]
name = "pyparsing"
version = "3.3.2"
@@ -6659,6 +6722,22 @@ typepy = ">=1.2.0,<3"
logging = ["loguru (>=0.4.1,<1)"]
test = ["pytablewriter (>=0.46)", "pytest"]
+[[package]]
+name = "tabulate"
+version = "0.10.0"
+description = "Pretty-print tabular data"
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "tabulate-0.10.0-py3-none-any.whl", hash = "sha256:f0b0622e567335c8fabaaa659f1b33bcb6ddfe2e496071b743aa113f8774f2d3"},
+ {file = "tabulate-0.10.0.tar.gz", hash = "sha256:e2cfde8f79420f6deeffdeda9aaec3b6bc5abce947655d17ac662b126e48a60d"},
+]
+
+[package.extras]
+widechars = ["wcwidth"]
+
[[package]]
name = "tcolorpy"
version = "0.1.7"
@@ -7818,9 +7897,9 @@ test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more_it
type = ["pytest-mypy (>=1.0.1) ; platform_python_implementation != \"PyPy\""]
[extras]
-source-archive = ["boto3", "firecrawl-py", "playwright", "trafilatura"]
+source-archive = ["boto3", "cloakbrowser", "firecrawl-py", "playwright", "pymupdf4llm", "trafilatura"]
[metadata]
lock-version = "2.1"
python-versions = "^3.11"
-content-hash = "2c075213be57a94057cbb6ba934e4b0ea8b0df91d052739d2313f6d893a50c0e"
+content-hash = "d9abd6c9194bdd4769704c8c60f48f438f9d77370b35ee739555d3b9fd3e5e22"
diff --git a/pyproject.toml b/pyproject.toml
index d15ad580..c8b322f6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,9 +57,16 @@ boto3 = {version = ">=1.34,<2.0.0", optional = true}
playwright = {version = ">=1.44,<2.0.0", optional = true}
firecrawl-py = {version = ">=4.0,<5.0.0", optional = true}
trafilatura = {version = ">=1.9,<3.0.0", optional = true}
+pymupdf4llm = {version = ">=0.0.17,<1.0.0", optional = true}
+# Self-hosted anti-bot backend (CloakBrowser). Pinned tight to 0.3.x: it's a
+# young, fast-moving 0.x package whose launch() API changed recently, so bump
+# the minor deliberately. The pip wheel is light (httpx + playwright); the
+# ~200MB patched Chromium downloads at first launch, not at install.
+cloakbrowser = {version = ">=0.3.31,<0.4.0", optional = true}
[tool.poetry.extras]
-source-archive = ["boto3", "playwright", "firecrawl-py", "trafilatura"]
+# hyperbrowser is already a core dep (used elsewhere too).
+source-archive = ["boto3", "playwright", "firecrawl-py", "trafilatura", "pymupdf4llm", "cloakbrowser"]
[tool.poetry.scripts]
source-archive = "forecasting_tools.agents_and_tools.source_archive.cli:main"