From 75f68de9b20e28176202b2882eb061a1939b2275 Mon Sep 17 00:00:00 2001 From: Jaden Earl Date: Thu, 25 Jun 2026 16:11:28 -0600 Subject: [PATCH] Add source archive Capture HTML + screenshot + markdown for the URLs a forecasting bot cited and store them (S3 or local) with provenance, deduplicated by url + content-hash. Heavy backends are an optional `source-archive` extra. Co-Authored-By: Claude Opus 4.8 --- .env.template | 34 +- .gitignore | 4 + _typos.toml | 2 + .../test_source_archive/test_backends.py | 266 +++++++++ .../test_source_archive/test_canonicalize.py | 72 +++ .../test_source_archive/test_catalog.py | 122 ++++ .../test_comment_harvester.py | 91 --- .../test_source_archive/test_content_store.py | 90 ++- .../test_source_archive/test_coverage.py | 110 ++++ .../test_source_archive/test_metaculus_db.py | 124 ++++ .../test_pipeline_and_manifest.py | 113 +++- .../test_source_archive/test_reindex.py | 87 +++ .../test_source_archive/test_reports.py | 51 ++ .../test_screenshot_encoding.py | 67 +++ .../test_trace_extraction.py | 215 +++++++ .../test_url_extraction.py | 19 + .../agents_and_tools/source_archive/README.md | 195 +++++- .../source_archive/__init__.py | 6 +- .../source_archive/benchmark.py | 459 ++++++++++++++ .../source_archive/canonicalize.py | 115 ++++ .../source_archive/catalog.py | 562 ++++++++++++++++++ .../agents_and_tools/source_archive/cli.py | 274 ++++++++- .../agents_and_tools/source_archive/config.py | 42 +- .../source_archive/content_store.py | 203 ++++++- .../source_archive/coverage.py | 237 ++++++++ .../source_archive/fetchers/__init__.py | 96 ++- .../fetchers/cloakbrowser_fetcher.py | 62 ++ .../fetchers/firecrawl_fetcher.py | 46 +- .../fetchers/hyperbrowser_fetcher.py | 149 +++++ .../source_archive/fetchers/pdf_fetcher.py | 146 +++++ .../fetchers/playwright_fetcher.py | 140 ++++- .../source_archive/ingest/__init__.py | 21 +- .../ingest/metaculus_comments.py | 180 ------ .../source_archive/ingest/metaculus_db.py | 215 +++++++ 
.../source_archive/ingest/trace_extraction.py | 380 ++++++++++++ .../source_archive/ingest/url_extraction.py | 49 +- .../source_archive/manifest.py | 17 +- .../agents_and_tools/source_archive/models.py | 21 +- .../source_archive/pipeline.py | 170 ++++++ .../source_archive/reindex.py | 278 +++++++++ .../source_archive/reports.py | 72 +++ .../source_archive/storage/blob_store.py | 5 + .../source_archive/storage/local_store.py | 10 + .../source_archive/storage/s3_store.py | 6 + .../agents_and_tools/source_archive/viewer.py | 409 +++++++++++++ poetry.lock | 83 ++- pyproject.toml | 9 +- 47 files changed, 5690 insertions(+), 434 deletions(-) create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_backends.py create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_canonicalize.py create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_catalog.py delete mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_coverage.py create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_metaculus_db.py create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reindex.py create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reports.py create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_screenshot_encoding.py create mode 100644 code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_trace_extraction.py create mode 100644 forecasting_tools/agents_and_tools/source_archive/benchmark.py create mode 100644 forecasting_tools/agents_and_tools/source_archive/canonicalize.py create mode 100644 forecasting_tools/agents_and_tools/source_archive/catalog.py create mode 100644 
forecasting_tools/agents_and_tools/source_archive/coverage.py create mode 100644 forecasting_tools/agents_and_tools/source_archive/fetchers/cloakbrowser_fetcher.py create mode 100644 forecasting_tools/agents_and_tools/source_archive/fetchers/hyperbrowser_fetcher.py create mode 100644 forecasting_tools/agents_and_tools/source_archive/fetchers/pdf_fetcher.py delete mode 100644 forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py create mode 100644 forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_db.py create mode 100644 forecasting_tools/agents_and_tools/source_archive/ingest/trace_extraction.py create mode 100644 forecasting_tools/agents_and_tools/source_archive/reindex.py create mode 100644 forecasting_tools/agents_and_tools/source_archive/reports.py create mode 100644 forecasting_tools/agents_and_tools/source_archive/viewer.py diff --git a/.env.template b/.env.template index 7f167fd0..e96d4a48 100644 --- a/.env.template +++ b/.env.template @@ -17,8 +17,40 @@ METACULUS_API_BASE_URL=https://www.metaculus.com/api # As of Jan 23rd 2025, only used for free semantic similarity calculation in Deduplicator, but defaults to OpenAI if not filled in HUGGINGFACE_API_KEY= -# As of Jun 10 2025, used for browser use agents +# As of Jun 10 2025, used for browser use agents. +# Also a fallback capture backend for the source archive (see below). HYPERBROWSER_API_KEY= +# --- Source archive (agents_and_tools/source_archive) ----------------------- +# Capture HTML + screenshot + markdown for every URL a bot cites. All optional; +# blank WEB_ARCHIVE_S3_BUCKET stores locally instead of S3. +WEB_ARCHIVE_S3_BUCKET= +WEB_ARCHIVE_S3_PREFIX=source-archive +WEB_ARCHIVE_AWS_PROFILE= +# Set to a local capture directory to run/view the archive with no S3 (the +# viewer reads from here when set). E.g. `capture --local ./archive`. 
+WEB_ARCHIVE_LOCAL_DIR= +WEB_ARCHIVE_TTL_DAYS=14 +# Managed fallback backends for the anti-bot / PDF tail behind self-hosted +# Playwright. FIRECRAWL also parses PDFs natively (OCR fallback for PdfFetcher). +FIRECRAWL_API_KEY= +# Firecrawl proxy mode for hardened anti-bot sites: basic (1 credit) | auto | +# stealth/enhanced (5 credits). Leave "basic" unless you need Cloudflare bypass. +WEB_ARCHIVE_FIRECRAWL_PROXY=basic +# Hyperbrowser session knobs (proxy turns a 1-credit scrape into 10 credits). +WEB_ARCHIVE_HYPERBROWSER_PROXY=true +WEB_ARCHIVE_HYPERBROWSER_STEALTH=true +WEB_ARCHIVE_HYPERBROWSER_CAPTCHA=true +# CloakBrowser (self-hosted anti-bot Playwright fork) module, if installed +# (`pip install cloakbrowser`). Exposes cloakbrowser.launch() -> Browser. +WEB_ARCHIVE_CLOAKBROWSER_IMPORT=cloakbrowser +WEB_ARCHIVE_PDF_MAX_PAGES=50 +# Operator-only: database DSN for `harvest-db` (reads a bot's cited URLs straight +# from Postgres). libpq DSN or postgresql:// URL — e.g. a Neon connection string. +# This DSN is a real secret. PREFER the macOS Keychain (item `metaculus-db-dsn`) +# over this file — see the source_archive README "DSN resolution". Leave blank to +# use the Keychain / local default. +METACULUS_DB_DSN= + # Disable if in Streamlit Cloud FILE_WRITING_ALLOWED=TRUE diff --git a/.gitignore b/.gitignore index 96b48188..4a05a450 100644 --- a/.gitignore +++ b/.gitignore @@ -179,3 +179,7 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. # .idea/ + +# Private bot trace samples must never land in this public repo (kept locally). 
+butler-traces/ +**/butler-traces/ diff --git a/_typos.toml b/_typos.toml index 0359061b..83ba4592 100644 --- a/_typos.toml +++ b/_typos.toml @@ -1,6 +1,8 @@ [default] extend-ignore-identifiers-re = [ "AttributeID.*Supress.*", + # Real tracking-query params stripped during URL canonicalization (not typos). + "oly_.*", ] [default.extend-identifiers] diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_backends.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_backends.py new file mode 100644 index 00000000..3adcf9be --- /dev/null +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_backends.py @@ -0,0 +1,266 @@ +"""Unit tests for the backup capture backends and the bake-off pricing model. + +These mock the vendor SDKs so they run without API keys, network, browsers, or +the optional pymupdf/playwright/cloakbrowser packages installed. +""" + +from __future__ import annotations + +import base64 + +import pytest + +from forecasting_tools.agents_and_tools.source_archive import benchmark as B +from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig +from forecasting_tools.agents_and_tools.source_archive.fetchers import ( + build_default_fetcher, +) +from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError +from forecasting_tools.agents_and_tools.source_archive.fetchers.cloakbrowser_fetcher import ( + CloakBrowserFetcher, +) +from forecasting_tools.agents_and_tools.source_archive.fetchers.firecrawl_fetcher import ( + FirecrawlFetcher, +) +from forecasting_tools.agents_and_tools.source_archive.fetchers.hyperbrowser_fetcher import ( + HyperbrowserFetcher, +) +from forecasting_tools.agents_and_tools.source_archive.fetchers.pdf_fetcher import ( + PdfFetcher, + looks_like_pdf, +) +from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult + + +# --- Firecrawl proxy/stealth wiring ------------------------------------------ 
+def test_firecrawl_basic_sends_no_proxy_key(): + f = FirecrawlFetcher(ArchiveConfig(firecrawl_proxy="basic")) + assert "proxy" not in f._scrape_kwargs(["markdown"]) + + +@pytest.mark.parametrize("mode", ["auto", "stealth", "enhanced"]) +def test_firecrawl_stealth_sends_proxy_key(mode): + f = FirecrawlFetcher(ArchiveConfig(firecrawl_proxy=mode)) + assert f._scrape_kwargs(["markdown"])["proxy"] == mode + + +def test_firecrawl_fetch_pdf_markdown(): + class FakeClient: + def scrape(self, url, **kwargs): + assert kwargs["formats"] == ["markdown"] + return {"markdown": "# PDF body " + "x " * 200} + + f = FirecrawlFetcher(ArchiveConfig(firecrawl_api_key="k"), client=FakeClient()) + assert f.fetch_pdf_markdown("https://x/y.pdf").startswith("# PDF body") + + +# --- Hyperbrowser screenshot coercion + result mapping ----------------------- +def test_hyperbrowser_coerce_screenshot_data_uri(): + raw = b"\x89PNG fake" + uri = "data:image/png;base64," + base64.b64encode(raw).decode() + shot, ctype = HyperbrowserFetcher._coerce_screenshot(uri) + assert shot == raw and ctype == "image/png" + + +def test_hyperbrowser_coerce_screenshot_bare_base64(): + raw = b"\x89PNG fake" + shot, ctype = HyperbrowserFetcher._coerce_screenshot(base64.b64encode(raw).decode()) + assert shot == raw and ctype == "image/png" + + +def test_hyperbrowser_coerce_screenshot_none(): + assert HyperbrowserFetcher._coerce_screenshot(None) == (None, None) + + +def test_hyperbrowser_fetch_maps_result(monkeypatch): + class Data: + metadata = {"statusCode": 200, "title": "T", "sourceURL": "https://final"} + html = "ok" + markdown = "ok " * 100 + screenshot = None + + class Resp: + status = "completed" + error = None + data = Data() + + class FakeClient: + class scrape: + @staticmethod + def start_and_wait(params): + return Resp() + + f = HyperbrowserFetcher( + ArchiveConfig(hyperbrowser_api_key="k"), client=FakeClient() + ) + # Avoid constructing real SDK request models in the unit test. 
+ monkeypatch.setattr(f, "_params", lambda url: None) + result = f.fetch("https://x") + assert result.fetcher == "hyperbrowser" + assert result.final_url == "https://final" + assert result.status_code == 200 + assert result.metadata["used_proxy"] is True + + +def test_hyperbrowser_failed_job_raises(monkeypatch): + class Resp: + status = "failed" + error = "blocked" + data = None + + class FakeClient: + class scrape: + @staticmethod + def start_and_wait(params): + return Resp() + + f = HyperbrowserFetcher( + ArchiveConfig(hyperbrowser_api_key="k"), client=FakeClient() + ) + monkeypatch.setattr(f, "_params", lambda url: None) + with pytest.raises(FetchError): + f.fetch("https://x") + + +def test_hyperbrowser_requires_key(): + with pytest.raises(FetchError): + HyperbrowserFetcher(ArchiveConfig(hyperbrowser_api_key=None)).fetch("https://x") + + +# --- PDF fetcher ------------------------------------------------------------- +def test_looks_like_pdf(): + assert looks_like_pdf("https://x/report.pdf") + assert looks_like_pdf("https://x/report.PDF?v=2") + assert not looks_like_pdf("https://x/report.html") + + +def test_pdf_rejects_non_pdf_bytes(): + f = PdfFetcher( + ArchiveConfig(), + downloader=lambda url, t: (b"not a pdf", url, 200), + ) + with pytest.raises(FetchError): + f.fetch("https://x/fake.pdf") + + +def test_pdf_falls_back_to_firecrawl_when_local_thin(monkeypatch): + class FakeFirecrawl: + def fetch_pdf_markdown(self, url): + return "# Scanned doc recovered by OCR " + "y " * 200 + + f = PdfFetcher( + ArchiveConfig(), + firecrawl=FakeFirecrawl(), + downloader=lambda url, t: (b"%PDF- minimal", url, 200), + ) + # Force the local parser to look thin regardless of whether pymupdf is present. 
+ monkeypatch.setattr(f, "_parse_local", lambda data: (None, None, None, 3, "none")) + result = f.fetch("https://x/scan.pdf") + assert result.metadata["pdf_engine"] == "firecrawl" + assert "OCR" in result.markdown + + +def test_pdf_uses_local_when_text_is_rich(monkeypatch): + f = PdfFetcher( + ArchiveConfig(), + downloader=lambda url, t: (b"%PDF- minimal", url, 200), + ) + rich = "# Title\n" + "real body text " * 100 + monkeypatch.setattr( + f, "_parse_local", lambda data: (rich, b"png", "image/png", 5, "pymupdf4llm") + ) + result = f.fetch("https://x/clean.pdf") + assert result.metadata["pdf_engine"] == "pymupdf4llm" + assert result.metadata["pdf_pages"] == 5 + assert result.screenshot == b"png" + + +# --- CloakBrowser ------------------------------------------------------------ +def test_cloakbrowser_missing_package_gives_clear_error(monkeypatch): + # Force every import to fail so this passes whether or not cloakbrowser is + # actually installed in the test environment. + import forecasting_tools.agents_and_tools.source_archive.fetchers.cloakbrowser_fetcher as cb + + def _boom(name): + raise ImportError(name) + + monkeypatch.setattr(cb.importlib, "import_module", _boom) + f = CloakBrowserFetcher(ArchiveConfig()) + with pytest.raises(FetchError) as exc: + f._launch_browser() + assert "cloakbrowser" in str(exc.value).lower() + + +# --- Pricing model ----------------------------------------------------------- +def test_pricing_self_host_is_floor(): + r = CaptureResult(url="u", final_url="u") + assert B.estimate_cost("playwright", r, 1_000_000, B.Pricing()) == 0.00001 + assert B.estimate_cost("cloakbrowser", r, 1_000_000, B.Pricing()) == 0.00001 + + +def test_pricing_firecrawl_basic_vs_stealth(): + basic = CaptureResult(url="u", final_url="u", metadata={"firecrawl_proxy": "basic"}) + stealth = CaptureResult( + url="u", final_url="u", metadata={"firecrawl_proxy": "auto"} + ) + assert B.estimate_cost("firecrawl", basic, 0, B.Pricing()) == pytest.approx(0.00083) + 
assert B.estimate_cost( + "firecrawl-stealth", stealth, 0, B.Pricing() + ) == pytest.approx(0.00415) + + +def test_pricing_hyperbrowser_proxy_includes_bandwidth(): + r = CaptureResult(url="u", final_url="u", metadata={"used_proxy": True}) + # 10 credits ($0.01) + 1MB * $10/GB ($0.01) = $0.02 + assert B.estimate_cost("hyperbrowser", r, 1_000_000, B.Pricing()) == pytest.approx( + 0.02 + ) + + +def test_pricing_pdf_local_is_free_firecrawl_is_per_page(): + local = CaptureResult( + url="u", final_url="u", metadata={"pdf_engine": "pymupdf4llm"} + ) + ocr = CaptureResult( + url="u", final_url="u", metadata={"pdf_engine": "firecrawl", "pdf_pages": 10} + ) + assert B.estimate_cost("pdf", local, 0, B.Pricing()) == 0.0 + assert B.estimate_cost("pdf", ocr, 0, B.Pricing()) == pytest.approx(0.0083) + + +# --- Default tiered chain composition ---------------------------------------- +def _fake_browser(): + from unittest.mock import MagicMock + + return None, MagicMock() # (playwright_handle, browser) — browser.close() ok + + +def test_default_chain_cloakbrowser_is_primary(monkeypatch): + # CloakBrowser available -> it is the single self-hosted browser tier. + monkeypatch.setattr( + CloakBrowserFetcher, "_launch_browser", lambda self: _fake_browser() + ) + config = ArchiveConfig(hyperbrowser_api_key="h", firecrawl_api_key="f") + with build_default_fetcher(config) as fetcher: + names = [b.name for b in fetcher._tiered.backends] + # Note: exactly one browser tier (cloakbrowser), not vanilla + cloak. + assert names == ["cloakbrowser", "pdf", "hyperbrowser", "firecrawl"] + + +def test_default_chain_falls_back_to_playwright_and_skips_unkeyed(monkeypatch): + from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import ( + PlaywrightFetcher, + ) + + # CloakBrowser not installed -> vanilla Playwright is the browser tier. 
+ def raise_unavailable(self): + raise FetchError("cloakbrowser not installed") + + monkeypatch.setattr(CloakBrowserFetcher, "_launch_browser", raise_unavailable) + monkeypatch.setattr( + PlaywrightFetcher, "_launch_browser", lambda self: _fake_browser() + ) + config = ArchiveConfig(hyperbrowser_api_key=None, firecrawl_api_key=None) + with build_default_fetcher(config) as fetcher: + names = [b.name for b in fetcher._tiered.backends] + assert names == ["playwright", "pdf"] diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_canonicalize.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_canonicalize.py new file mode 100644 index 00000000..e9476409 --- /dev/null +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_canonicalize.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +import pytest + +from forecasting_tools.agents_and_tools.source_archive.canonicalize import ( + canonicalize_url, +) +from forecasting_tools.agents_and_tools.source_archive.models import url_hash + +# (raw, expected canonical) — each pair documents one normalization rule. 
+CASES = [ + # fragment dropped + ("https://a.test/x#section", "https://a.test/x"), + # trailing slash dropped (non-root) + ("https://a.test/x/", "https://a.test/x"), + # root path collapses (with or without slash) to host only + ("https://a.test/", "https://a.test"), + ("https://a.test", "https://a.test"), + # scheme + host lowercased, path case preserved + ("HTTPS://A.TEST/Path", "https://a.test/Path"), + # default ports stripped, non-default kept + ("http://a.test:80/x", "http://a.test/x"), + ("https://a.test:443/x", "https://a.test/x"), + ("https://a.test:8443/x", "https://a.test:8443/x"), + # tracking params removed, meaningful params kept + ("https://a.test/x?utm_source=z&utm_medium=email", "https://a.test/x"), + ("https://a.test/x?id=7&fbclid=abc", "https://a.test/x?id=7"), + ("https://a.test/x?gclid=abc&igshid=q", "https://a.test/x"), + # remaining params sorted (order-independent) + ("https://a.test/x?b=2&a=1", "https://a.test/x?a=1&b=2"), + # bare "ref"/"source" are intentionally preserved + ("https://a.test/x?ref=home", "https://a.test/x?ref=home"), + # combination + ( + "HTTPS://A.TEST:443/Path/?b=2&utm_campaign=spring&a=1#frag", + "https://a.test/Path?a=1&b=2", + ), + # non-http(s) left alone + ("mailto:someone@a.test", "mailto:someone@a.test"), +] + + +@pytest.mark.parametrize("raw,expected", CASES) +def test_canonicalize_cases(raw: str, expected: str): + assert canonicalize_url(raw) == expected + + +@pytest.mark.parametrize("raw,_expected", CASES) +def test_canonicalize_is_idempotent(raw: str, _expected: str): + once = canonicalize_url(raw) + assert canonicalize_url(once) == once + + +def test_near_duplicates_share_a_url_hash(): + variants = [ + "https://a.test/article", + "https://a.test/article/", + "https://a.test/article#intro", + "https://a.test/article?utm_source=newsletter", + "HTTPS://A.test/article", + ] + hashes = {url_hash(v) for v in variants} + assert len(hashes) == 1 + + +def test_distinct_pages_keep_distinct_hashes(): + assert 
url_hash("https://a.test/x?id=1") != url_hash("https://a.test/x?id=2") + assert url_hash("https://a.test/x") != url_hash("https://a.test/y") + + +def test_empty_and_none_safe(): + assert canonicalize_url("") == "" diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_catalog.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_catalog.py new file mode 100644 index 00000000..e50775ee --- /dev/null +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_catalog.py @@ -0,0 +1,122 @@ +from __future__ import annotations + +from forecasting_tools.agents_and_tools.source_archive import manifest as manifest_io +from forecasting_tools.agents_and_tools.source_archive.catalog import ( + build_catalog, + write_catalog, +) +from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig +from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore +from forecasting_tools.agents_and_tools.source_archive.models import ( + CaptureResult, + CitationRecord, +) +from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore + + +def _capture(url: str, html: str) -> CaptureResult: + return CaptureResult( + url=url, + final_url=url, + status_code=200, + html=html, + markdown="md " * 30, + screenshot=b"img", + screenshot_content_type="image/png", + fetcher="fake", + ) + + +def _seed(tmp_path): + store = LocalBlobStore(tmp_path) + config = ArchiveConfig(s3_prefix="t") + cstore = ContentStore(store, config) + cstore.store(_capture("https://a.test/p", "

a

")) + cstore.store(_capture("https://b.test/q", "

b

")) + # uncaptured.test/x is cited but never captured. + records = [ + CitationRecord( + url="https://a.test/p?utm_source=news", # canonicalizes to /p + run_id="r1", + bot="alpha", + question_id="100", + question_url="https://www.metaculus.com/questions/100/", + tool_name="web_search", + ), + CitationRecord( + url="https://b.test/q", + run_id="r1", + bot="beta", + question_id="100", + question_url="https://www.metaculus.com/questions/100/", + tool_name="page_fetch", + ), + CitationRecord( + url="https://uncaptured.test/x", + run_id="r1", + bot="alpha", + question_id="100", + ), + # A data/API call made only via run_code -> excluded from the catalog. + CitationRecord( + url="https://data.test/api?fmt=csv", + run_id="r1", + bot="beta", + question_id="100", + tool_name="run_code", + ), + ] + manifest_io.write_blob(store, "r1", records, config) + return store, config + + +def test_build_catalog_joins_and_canonicalizes(tmp_path): + store, config = _seed(tmp_path) + data = build_catalog(store, config) + + # The two a.test variants collapse to one source; the run_code API call is + # excluded (tool/API call, not a page). 
+ urls = {s.canonical_url for s in data.sources} + assert urls == { + "https://a.test/p", + "https://b.test/q", + "https://uncaptured.test/x", + } + assert data.excluded.get("tool_call") == 1 + assert "https://data.test/api?fmt=csv" not in urls + captured = {s.canonical_url for s in data.sources if s.captured} + assert captured == {"https://a.test/p", "https://b.test/q"} + + by_q = data.by_question() + assert set(by_q) == {"100"} + assert len(by_q["100"]) == 3 + by_bot = data.by_bot() + assert set(by_bot) == {"alpha", "beta"} + + +def test_write_catalog_emits_views(tmp_path): + store, config = _seed(tmp_path) + summary = write_catalog(store, config) + + assert summary.sources == 3 + assert summary.captured == 2 + assert summary.questions == 1 + assert summary.excluded.get("tool_call") == 1 + + keys = set(store.list_keys("t/catalog/")) + assert "t/catalog/index.html" in keys + assert "t/catalog/READ_ME_FIRST.html" in keys + assert "t/catalog/by-question/100.html" in keys + assert "t/catalog/by-question/100.csv" in keys + assert "t/catalog/by-bot/alpha.html" in keys + assert "t/catalog/by-domain/a.test.html" in keys + + q_html = store.get("t/catalog/by-question/100.html").decode("utf-8") + assert "https://a.test/p" in q_html + assert "alpha" in q_html # bot tag present + # Local links are relative into the content store. 
+ assert "../../content/" in q_html + + q_csv = store.get("t/catalog/by-question/100.csv").decode("utf-8") + assert "https://uncaptured.test/x" in q_csv + assert "no" in q_csv # uncaptured row marked diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py deleted file mode 100644 index 81874d80..00000000 --- a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py +++ /dev/null @@ -1,91 +0,0 @@ -from __future__ import annotations - -from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_comments import ( - MetaculusCommentHarvester, -) - - -def _leaderboard(): - return { - "leaderboard_entries": [ - {"user": {"id": 1, "username": "botA", "is_bot": True}}, - {"user": {"id": 2, "username": "human", "is_bot": False}}, - {"user": {"id": 3, "username": "botB", "is_bot": True}}, - ] - } - - -def test_enumerate_bots_filters_non_bots(): - def fetch(path, params): - assert path == "/leaderboards/project/123/" - assert params["with_entries"] == "true" - return _leaderboard() - - h = MetaculusCommentHarvester(fetch_json=fetch) - bots = h.enumerate_bots(123) - assert [b["id"] for b in bots] == [1, 3] - - -def test_harvest_author_builds_records_with_provenance(): - def fetch(path, params): - assert path == "/comments/" - if params["offset"] == 0: - return { - "results": [{"id": 10, "on_post": 555, "text": "src https://a.test/x"}] - } - return {"results": []} - - h = MetaculusCommentHarvester(fetch_json=fetch) - records = h.harvest_author(1, run_id="r1", bot="botA") - assert len(records) == 1 - rec = records[0] - assert rec.url == "https://a.test/x" - assert rec.bot == "botA" - assert rec.run_id == "r1" - assert rec.question_id == "555" - assert rec.question_url == "https://www.metaculus.com/questions/555/" - assert rec.trace == "comment:10" - assert rec.origin == "metaculus_comment" - - 
-def test_iter_comments_paginates_until_short_page(): - calls = [] - - def fetch(path, params): - calls.append(params["offset"]) - if params["offset"] == 0: - return {"results": [{"id": i, "text": ""} for i in range(100)]} - return {"results": [{"id": 999, "text": ""}]} # short page -> stop - - h = MetaculusCommentHarvester(fetch_json=fetch) - comments = list(h.iter_comments(1)) - assert len(comments) == 101 - assert calls == [0, 100] - - -def test_harvest_project_aggregates_bots(): - def fetch(path, params): - if path.startswith("/leaderboards/project/"): - return _leaderboard() - # one URL per bot, single page each - if params["offset"] == 0: - author = params["author"] - return { - "results": [ - {"id": author, "on_post": 1, "text": f"https://bot{author}.test"} - ] - } - return {"results": []} - - h = MetaculusCommentHarvester(fetch_json=fetch) - records = h.harvest_project(123) - assert {r.url for r in records} == {"https://bot1.test", "https://bot3.test"} - assert {r.bot for r in records} == {"botA", "botB"} - assert all(r.run_id == "metaculus-comments-123" for r in records) - - -def test_custom_base_url_drives_web_base(): - h = MetaculusCommentHarvester( - base_url="https://example.org/api", fetch_json=lambda p, q: {"results": []} - ) - assert h.web_base == "https://example.org" diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py index c6f83ef3..a1c1d6c0 100644 --- a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py @@ -15,10 +15,10 @@ def _store(tmp_path, **cfg) -> ContentStore: return ContentStore(LocalBlobStore(tmp_path), ArchiveConfig(s3_prefix="t", **cfg)) -def _result(url: str, html: str) -> CaptureResult: +def _result(url: str, html: str, final_url: str | None = None) -> CaptureResult: return 
CaptureResult( url=url, - final_url=url, + final_url=final_url if final_url is not None else url, status_code=200, html=html, markdown="md " * 50, @@ -73,3 +73,89 @@ def test_changed_content_creates_new_capture(tmp_path): second = store.store(_result("https://a.test", "

v2 changed

")) assert second.created is True assert first.capture.content_hash != second.capture.content_hash + + +# --- Phase B: redirect aliasing ------------------------------------------- +def test_redirect_keys_capture_by_final_url(tmp_path): + store = _store(tmp_path) + res = store.store( + _result("https://bit.ly/x", "

dest

", final_url="https://dest.test/page") + ) + # Capture is stored under the FINAL url's hash, not the shortener's. + assert res.capture.url == "https://dest.test/page" + assert res.capture.url_hash == url_hash("https://dest.test/page") + # The canonical index records the cited shortener as an alias. + canonical = store._read_index(url_hash("https://dest.test/page")) + assert "https://bit.ly/x" in canonical["aliases"] + + +def test_lookup_via_shortener_and_final_both_hit(tmp_path): + store = _store(tmp_path) + store.store( + _result("https://bit.ly/x", "

dest

", final_url="https://dest.test/page") + ) + via_alias = store.lookup("https://bit.ly/x") + via_final = store.lookup("https://dest.test/page") + assert via_alias is not None and via_final is not None + assert via_alias.content_hash == via_final.content_hash + assert via_alias.url == "https://dest.test/page" + + +def test_two_shorteners_to_same_page_store_once(tmp_path): + store = _store(tmp_path) + first = store.store( + _result("https://bit.ly/x", "

same

", final_url="https://dest.test/page") + ) + second = store.store( + _result("https://t.co/y", "

same

", final_url="https://dest.test/page") + ) + assert first.created is True + assert second.created is False # identical content deduped, not re-stored + canonical = store._read_index(url_hash("https://dest.test/page")) + assert set(canonical["aliases"]) == {"https://bit.ly/x", "https://t.co/y"} + assert len(canonical["captures"]) == 1 + + +# --- Phase C: cross-URL content dedup ------------------------------------- +def test_identical_content_across_distinct_urls_reuses_blobs(tmp_path): + store = _store(tmp_path) + a = store.store(_result("https://a.test/x", "

same

")) + b = store.store(_result("https://b.test/y", "

same

")) + + # Both are real captures (each URL has its own index entry)... + assert a.created is True and b.created is True + # ...but B reuses A's blobs instead of writing its own. + assert a.capture.content_alias_of is None + assert b.capture.content_alias_of == url_hash("https://a.test/x") + assert b.capture.html_key == a.capture.html_key + + # No duplicate blob was written under B's url hash. + b_own_key = ( + f"t/content/{url_hash('https://b.test/y')}/{b.capture.content_hash}.html" + ) + assert not store.blobs.exists(b_own_key) + assert store.blobs.exists(a.capture.html_key) + + +def test_content_reverse_index_tracks_members(tmp_path): + store = _store(tmp_path) + store.store(_result("https://a.test/x", "

same

")) + store.store(_result("https://b.test/y", "

same

")) + + ch = store.store(_result("https://c.test/z", "

same

")).capture.content_hash + reverse = store._read_content_index(ch) + assert reverse["canonical_url_hash"] == url_hash("https://a.test/x") + member_hashes = {m["url_hash"] for m in reverse["members"]} + assert member_hashes == { + url_hash("https://a.test/x"), + url_hash("https://b.test/y"), + url_hash("https://c.test/z"), + } + + +def test_different_content_not_aliased(tmp_path): + store = _store(tmp_path) + a = store.store(_result("https://a.test/x", "

one

")) + b = store.store(_result("https://b.test/y", "

two different

")) + assert b.capture.content_alias_of is None + assert b.capture.html_key != a.capture.html_key diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_coverage.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_coverage.py new file mode 100644 index 00000000..155d3772 --- /dev/null +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_coverage.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +from forecasting_tools.agents_and_tools.source_archive.catalog import Citation, Source +from forecasting_tools.agents_and_tools.source_archive.coverage import ( + coverage_from_sources, +) + + +def _src(url, domain, captured, cits): + return Source(canonical_url=url, domain=domain, captured=captured, citations=cits) + + +def _trace(bot, q, tool): + return Citation(bot=bot, question_id=q, tool_name=tool, origin="tool_result") + + +def _comment(bot, q): + return Citation(bot=bot, question_id=q, origin="metaculus_comment") + + +SOURCES = [ + _src( + "https://a.test/1", + "a.test", + True, + [_trace("template", "100", "scrape_webpage")], + ), + _src( + "https://b.test/2", + "b.test", + False, + [_trace("template", "100", "scrape_webpage")], + ), + _src("https://c.test/3", "c.test", True, [_comment("otherbot", "200")]), + # run_code-only -> excluded as a tool/API call + _src( + "https://data.test/x", + "data.test", + False, + [_trace("template", "100", "run_code")], + ), + # search-engine result page -> excluded as a non-source + _src( + "https://www.google.com/search?q=x", + "google.com", + False, + [_trace("template", "100", "scrape_webpage")], + ), + # malformed (extractor junk) -> excluded + _src( + "https://a.test/y%5B1%5D", + "a.test", + False, + [_trace("template", "100", "scrape_webpage")], + ), +] + + +def test_trace_report_excludes_non_sources_and_counts_pages(): + r = coverage_from_sources(SOURCES, "trace") + assert r.cited == 2 # a.test/1 + b.test/2 (data/search/malformed excluded) + 
assert r.captured == 1 + assert r.pct == 50.0 + assert r.excluded == {"tool_call": 1, "search": 1, "malformed": 1} + assert r.missing == 1 + assert r.missing_urls == ["https://b.test/2"] + + by_q = {row.label: (row.cited, row.captured) for row in r.by_question} + assert by_q == {"100": (2, 1)} + by_tool = {row.label: (row.cited, row.captured) for row in r.by_tool} + assert by_tool == {"scrape_webpage": (2, 1)} + missed = {row.label for row in r.missed_by_domain} + assert missed == {"b.test"} + + +def test_comment_report_is_separate(): + r = coverage_from_sources(SOURCES, "comments") + assert r.cited == 1 # only the metaculus_comment source + assert r.captured == 1 + assert r.pct == 100.0 + assert {row.label for row in r.by_bot} == {"otherbot"} + + +def test_modes_do_not_bleed(): + trace = coverage_from_sources(SOURCES, "trace") + comments = coverage_from_sources(SOURCES, "comments") + assert "https://c.test/3" not in trace.missing_urls # comment source not in trace + # the trace bot never appears in the comment report + assert "template" not in {row.label for row in comments.by_bot} + + +def test_csv_export_has_overall_row(): + csv_text = coverage_from_sources(SOURCES, "trace").to_csv() + assert "group,label,cited,captured,pct" in csv_text + assert "overall,trace,2,1,50.0" in csv_text + + +def test_outcomes_split_never_fetched_vs_failed(): + # b.test/2 is the only missing page source. With no outcome for it, it's a + # pure collection gap (never fetched). + r = coverage_from_sources(SOURCES, "trace", {"https://a.test/1": "stored"}) + assert r.has_outcomes is True + assert r.missing_never_fetched == 1 + assert r.missing_fetch_failed == 0 + + # If a run report shows b.test/2 was fetched and failed, it's a capture + # problem, not a collection gap. 
+ r2 = coverage_from_sources(SOURCES, "trace", {"https://b.test/2": "error"}) + assert r2.missing_never_fetched == 0 + assert r2.missing_fetch_failed == 1 diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_metaculus_db.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_metaculus_db.py new file mode 100644 index 00000000..54cab175 --- /dev/null +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_metaculus_db.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_db import ( + LOCAL_DEFAULT_DSN, + MetaculusDbHarvester, + resolve_dsn, +) + + +def test_harvest_post_builds_records_with_provenance(): + rows = [ + { + "comment_id": 1, + "on_post_id": 42, + "text": "see https://a.test/x and https://b.test/y", + "username": "alpha", + "author_id": 7, + }, + { + "comment_id": 2, + "on_post_id": 42, + "text": "https://a.test/x again", + "username": "beta", + "author_id": 8, + }, + ] + seen = {} + + def query(sql, params): + seen["sql"], seen["params"] = sql, params + return rows + + records = MetaculusDbHarvester(query).harvest_post(42) + + assert seen["params"] == (42,) + assert {r.url for r in records} == {"https://a.test/x", "https://b.test/y"} + r0 = next(r for r in records if r.url == "https://a.test/x") + assert r0.origin == "metaculus_comment" + assert r0.question_id == "42" + assert r0.question_url == "https://www.metaculus.com/questions/42/" + assert r0.bot in ("alpha", "beta") + # one record per (URL, comment): a.test/x is cited in both comments + assert sum(r.url == "https://a.test/x" for r in records) == 2 + + +def test_harvest_recent_passes_days_and_limit(): + seen = {} + + def query(sql, params): + seen["sql"], seen["params"] = sql, params + return [] + + MetaculusDbHarvester(query).harvest_recent(days=3, limit=50) + assert seen["params"] == (3, 50) + assert "limit %s" in seen["sql"] + + +def 
test_harvest_recent_uncapped_by_default(): + seen = {} + + def query(sql, params): + seen["sql"], seen["params"] = sql, params + return [] + + # A daily sweep wants every row from the latest day, not a 1000-row cap. + MetaculusDbHarvester(query).harvest_recent(days=1) + assert seen["params"] == (1,) + assert "limit" not in seen["sql"].lower() + + +def test_includes_private_bot_comments_by_default(): + seen = {} + + def query(sql, params): + seen["sql"] = sql + return [] + + # The day-behind replica's value is the now-private bot reasoning, so the + # default read must NOT filter private rows out. + MetaculusDbHarvester(query).harvest_recent(days=1) + assert "is_private" not in seen["sql"] + assert "u.is_bot" in seen["sql"] + + +def test_public_only_filters_private_comments(): + seen = {} + + def query(sql, params): + seen["sql"] = sql + return [] + + MetaculusDbHarvester(query).harvest_post(42, include_private=False) + assert "not c.is_private" in seen["sql"] + + +def test_resolve_dsn_prefers_explicit_then_env_then_keychain(): + # explicit flag wins over everything + assert ( + resolve_dsn( + "postgresql://flag", + env={"METACULUS_DB_DSN": "postgresql://env"}, + keychain_reader=lambda: "postgresql://kc", + ) + == "postgresql://flag" + ) + # then the env var + assert ( + resolve_dsn( + None, + env={"METACULUS_DB_DSN": "postgresql://env"}, + keychain_reader=lambda: "postgresql://kc", + ) + == "postgresql://env" + ) + # then the keychain (the private path) + assert ( + resolve_dsn(None, env={}, keychain_reader=lambda: "postgresql://kc") + == "postgresql://kc" + ) + + +def test_resolve_dsn_falls_back_to_local_default(): + # nothing configured and no keychain item -> local dev DB, not a crash + assert resolve_dsn(None, env={}, keychain_reader=lambda: None) == LOCAL_DEFAULT_DSN diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py 
b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py index 033d1689..fa87838d 100644 --- a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py @@ -4,7 +4,10 @@ from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord -from forecasting_tools.agents_and_tools.source_archive.pipeline import CapturePipeline +from forecasting_tools.agents_and_tools.source_archive.pipeline import ( + CapturePipeline, + capture_urls_concurrent, +) from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore @@ -15,6 +18,114 @@ def _pipeline(tmp_path, fetcher) -> CapturePipeline: return CapturePipeline(fetcher, store) +def test_capture_urls_concurrent_captures_all(tmp_path, make_fetcher): + from contextlib import contextmanager + + config = ArchiveConfig(s3_prefix="t", concurrency=4) + store = ContentStore(LocalBlobStore(tmp_path), config) + urls = [f"https://s{i}.test/p" for i in range(12)] + + @contextmanager + def factory(_cfg): + f = make_fetcher() + for u in urls: + f.add(u) + yield f + + summary = capture_urls_concurrent(urls, store, config, factory) + + assert len(summary.outcomes) == 12 + assert summary.count("stored") == 12 + # every URL is resolvable afterwards (proves the shared store got all writes) + assert all(store.lookup(u) is not None for u in urls) + + +def test_concurrent_supervisor_recovers_a_stuck_worker(tmp_path, make_fetcher): + import threading + from contextlib import contextmanager + + config = ArchiveConfig(s3_prefix="t", concurrency=1) + store = ContentStore(LocalBlobStore(tmp_path), config) + urls = ["https://stuck.test/x"] + reaped = threading.Event() + builds 
= {"n": 0} + + class _Wedges: + name = "wedge" + + def fetch(self, url): + # Block until the supervisor's reaper "kills the browser", then surface + # the dead-browser error a killed Chromium would raise. + reaped.wait(5) + raise RuntimeError("Target page, context or browser has been closed") + + @contextmanager + def factory(_cfg): + builds["n"] += 1 + if builds["n"] == 1: + yield _Wedges() # first browser wedges + else: + fetcher = make_fetcher() + fetcher.add(urls[0]) + yield fetcher # rebuilt browser works + + # Inject a fake reaper so the test drives the supervisor without real Chromium. + summary = capture_urls_concurrent( + urls, store, config, factory, per_url_timeout=0.3, reaper=reaped.set + ) + + assert builds["n"] == 2 # stalled -> reaped -> death -> rebuild -> retry + assert summary.count("stored") == 1 # recovered and captured on a fresh browser + + +def test_concurrent_restarts_browser_after_death(tmp_path, make_fetcher): + from contextlib import contextmanager + + config = ArchiveConfig(s3_prefix="t", concurrency=1) + store = ContentStore(LocalBlobStore(tmp_path), config) + urls = ["https://a.test/x"] + builds = {"n": 0} + + class _DeadBrowser: + name = "dead" + + def fetch(self, url): + raise RuntimeError("Target page, context or browser has been closed") + + @contextmanager + def factory(_cfg): + builds["n"] += 1 + if builds["n"] == 1: + yield _DeadBrowser() # first browser is dead + else: + fetcher = make_fetcher() + fetcher.add(urls[0]) + yield fetcher # rebuilt browser works + + summary = capture_urls_concurrent(urls, store, config, factory) + + assert builds["n"] == 2 # detected death, rebuilt once + assert summary.count("stored") == 1 # retry on the fresh browser succeeded + + +class _BoomFetcher: + """Raises an unexpected (non-FetchError) exception, like a bad screenshot.""" + + name = "boom" + + def fetch(self, url): + raise ValueError("kaboom") + + +def test_pipeline_isolates_unexpected_fetcher_errors(tmp_path): + # One pathological URL 
must not abort the whole run. + pipe = _pipeline(tmp_path, _BoomFetcher()) + summary = pipe.run(["https://a.test", "https://b.test"]) + assert summary.count("error") == 2 + assert len(summary.outcomes) == 2 + assert all(o.reason.startswith("unexpected:") for o in summary.outcomes) + + def test_manifest_roundtrip_and_unique_urls(): records = [ CitationRecord(url="https://a.test", run_id="r1", bot="b", tool_name="search"), diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reindex.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reindex.py new file mode 100644 index 00000000..82e5f5b7 --- /dev/null +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reindex.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +import json + +from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig +from forecasting_tools.agents_and_tools.source_archive.reindex import ( + analyze, + rebuild_content_index, +) +from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore + + +def _put_index(store, key: str, body: dict) -> None: + store.put(f"t/index/{key}.json", json.dumps(body).encode("utf-8")) + + +def _canonical(url: str, content_hash: str) -> dict: + return { + "url": url, + "url_hash": f"hash_of_{url}", + "latest_content_hash": content_hash, + "captures": { + content_hash: { + "url": url, + "url_hash": f"hash_of_{url}", + "content_hash": content_hash, + "html_key": f"t/content/hash_of_{url}/{content_hash}.html", + } + }, + } + + +def _seed(tmp_path) -> tuple[LocalBlobStore, ArchiveConfig]: + store = LocalBlobStore(tmp_path) + config = ArchiveConfig(s3_prefix="t") + # Legacy rows stored under raw hashing: two URLs that now canonicalize equal. 
+ _put_index(store, "h1", _canonical("https://x.test/p?utm_source=news", "c1")) + _put_index(store, "h2", _canonical("https://x.test/p", "c2")) + # Two distinct URLs with byte-identical content (same latest hash). + _put_index(store, "h3", _canonical("https://a.test/1", "cX")) + _put_index(store, "h4", _canonical("https://b.test/2", "cX")) + # Same host+path, meaningful query differs -> Phase D candidate. + _put_index(store, "h5", _canonical("https://q.test/item?id=1", "n1")) + _put_index(store, "h6", _canonical("https://q.test/item?id=2", "n2")) + # An alias (redirect) index -> counted but not a capture. + _put_index(store, "h7", {"url": "https://bit.ly/z", "alias_of": "hash_of_x"}) + return store, config + + +def test_analyze_reports_all_three_lenses(tmp_path): + store, config = _seed(tmp_path) + report = analyze(store, config) + + assert report.total_url_indexes == 7 + assert report.alias_indexes == 1 + assert report.canonical_captures == 6 + + canon_keys = {c.key for c in report.canonicalization_clusters} + assert "https://x.test/p" in canon_keys + + content_urls = {tuple(c.urls) for c in report.content_clusters} + assert ("https://a.test/1", "https://b.test/2") in content_urls + + near_keys = {c.key for c in report.near_dup_clusters} + assert "https://q.test/item" in near_keys + + +def test_analyze_ignores_reverse_content_index(tmp_path): + store, config = _seed(tmp_path) + # A by-content reverse index must not be mistaken for a URL index. + store.put( + "t/index/by-content/cX.json", + json.dumps({"content_hash": "cX", "canonical_url_hash": "x"}).encode("utf-8"), + ) + report = analyze(store, config) + assert report.total_url_indexes == 7 # unchanged + + +def test_rebuild_content_index_is_dry_by_default(tmp_path): + store, config = _seed(tmp_path) + groups = rebuild_content_index(store, config, apply=False) + assert groups >= 1 + # Dry run wrote nothing under by-content/. 
+ assert not list(store.list_keys("t/index/by-content/")) + + rebuild_content_index(store, config, apply=True) + assert list(store.list_keys("t/index/by-content/")) diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reports.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reports.py new file mode 100644 index 00000000..a3248fa8 --- /dev/null +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_reports.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig +from forecasting_tools.agents_and_tools.source_archive.pipeline import ( + CaptureOutcome, + PipelineSummary, +) +from forecasting_tools.agents_and_tools.source_archive.reports import ( + read_outcomes, + write_run_report, +) +from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore + + +def test_run_report_roundtrip_canonicalizes(tmp_path): + store = LocalBlobStore(tmp_path) + config = ArchiveConfig(s3_prefix="t") + summary = PipelineSummary( + outcomes=[ + CaptureOutcome(url="https://a.test/p?utm_source=x", status="stored"), + CaptureOutcome(url="https://b.test/q", status="error", reason="cloudflare"), + ] + ) + write_run_report(store, "r1", summary, config) + + out = read_outcomes(store, config) + # keys are canonicalized (tracking param stripped) + assert out["https://a.test/p"] == "stored" + assert out["https://b.test/q"] == "error" + + +def test_captured_status_wins_over_failure(tmp_path): + store = LocalBlobStore(tmp_path) + config = ArchiveConfig(s3_prefix="t") + write_run_report( + store, + "early", + PipelineSummary( + outcomes=[CaptureOutcome(url="https://a.test", status="error")] + ), + config, + ) + write_run_report( + store, + "later", + PipelineSummary( + outcomes=[CaptureOutcome(url="https://a.test", status="stored")] + ), + config, + ) + assert read_outcomes(store, config)["https://a.test"] == "stored" diff 
--git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_screenshot_encoding.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_screenshot_encoding.py new file mode 100644 index 00000000..d357982b --- /dev/null +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_screenshot_encoding.py @@ -0,0 +1,67 @@ +"""Tests for screenshot encoding + the height cap. + +Regression guard for a silent truncation bug: the height cap used to be applied +via Playwright's ``clip`` *without* ``full_page``, which is bounded by the +viewport and chopped tall pages down to a single screen. The cap is now enforced +by cropping the full-page render in Pillow — these tests pin that behavior. +""" + +from __future__ import annotations + +import io + +import pytest + +from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import ( + _encode_screenshot, +) + +Image = pytest.importorskip("PIL.Image") + + +def _png(width: int, height: int) -> bytes: + out = io.BytesIO() + Image.new("RGB", (width, height), (255, 0, 0)).save(out, format="PNG") + return out.getvalue() + + +def test_tall_page_cropped_to_max_height(): + data, ct = _encode_screenshot(_png(1280, 12000), "webp", max_height=4000) + assert ct == "image/webp" + img = Image.open(io.BytesIO(data)) + assert img.size == (1280, 4000) # cropped to the cap, full width preserved + + +def test_short_page_not_cropped(): + data, _ = _encode_screenshot(_png(1280, 3000), "webp", max_height=20000) + assert Image.open(io.BytesIO(data)).size == (1280, 3000) # untouched + + +def test_webp_clamped_to_format_limit_even_without_cap(): + # WebP cannot exceed 16383px; an over-tall page must crop, not crash. + data, _ = _encode_screenshot(_png(1280, 25000), "webp", max_height=0) + assert Image.open(io.BytesIO(data)).size == (1280, 16383) + + +def test_webp_cap_above_format_limit_is_clamped(): + # A configured cap above WebP's limit still degrades safely to 16383. 
+ data, _ = _encode_screenshot(_png(1280, 18000), "webp", max_height=17000) + assert Image.open(io.BytesIO(data)).height == 16383 + + +def test_png_keeps_full_height_uncapped(): + # PNG has no such limit, so max_height=0 preserves the whole render. + data, _ = _encode_screenshot(_png(1280, 20000), "png", max_height=0) + assert Image.open(io.BytesIO(data)).size == (1280, 20000) + + +def test_webp_is_real_webp(): + data, ct = _encode_screenshot(_png(800, 600), "webp") + assert ct == "image/webp" + assert data[:4] == b"RIFF" and data[8:12] == b"WEBP" + + +def test_jpeg_format(): + data, ct = _encode_screenshot(_png(800, 600), "jpeg") + assert ct == "image/jpeg" + assert Image.open(io.BytesIO(data)).format == "JPEG" diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_trace_extraction.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_trace_extraction.py new file mode 100644 index 00000000..f3555523 --- /dev/null +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_trace_extraction.py @@ -0,0 +1,215 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from forecasting_tools.agents_and_tools.source_archive.ingest.trace_extraction import ( + extract_records_from_events, + extract_records_from_question_dir, + extract_records_from_trace_file, + harvest_run, + trace_label, +) + + +def test_trace_label_strips_prefix_and_suffix(): + assert trace_label("/x/traces_forecast_1_attempt_1.jsonl") == "forecast_1_attempt_1" + assert trace_label("traces_summarize.jsonl") == "summarize" + + +def test_tool_call_carries_query_and_tool_args(): + events = [ + { + "type": "tool_call", + "call_id": "c1", + "name": "search_online", + "args": {"query": "uk election polls", "max_results": 5}, + } + ] + records = extract_records_from_events(events, trace="forecast_1") + # No URL in the args -> nothing emitted from the tool_call itself.
+ assert records == [] + + +def test_tool_result_attributed_to_originating_call(): + events = [ + { + "type": "tool_call", + "call_id": "c1", + "name": "search_online", + "args": {"query": "uk election polls"}, + }, + { + "type": "tool_result", + "call_id": "c1", + "content": "Top hit: [poll](https://a.test/poll) and https://b.test/x", + "timestamp": "2026-05-12T12:00:00+00:00", + }, + ] + records = extract_records_from_events(events, trace="forecast_1", bot="template") + assert [r.url for r in records] == ["https://a.test/poll", "https://b.test/x"] + rec = records[0] + assert rec.origin == "tool_result" + assert rec.tool_name == "search_online" + assert rec.query == "uk election polls" + assert rec.tool_args == {"query": "uk election polls"} + assert rec.trace == "forecast_1" + assert rec.bot == "template" + assert rec.first_seen == "2026-05-12T12:00:00+00:00" + + +def test_query_from_list_args(): + events = [ + { + "type": "tool_call", + "call_id": "c1", + "name": "s", + "args": {"queries": ["a", "b"]}, + }, + {"type": "tool_result", "call_id": "c1", "content": "https://a.test/x"}, + ] + records = extract_records_from_events(events, trace="t") + assert records[0].query == "a b" + + +def test_url_directly_in_tool_call_args(): + events = [ + { + "type": "tool_call", + "call_id": "c1", + "name": "fetch_page", + "args": {"url": "https://a.test/page"}, + } + ] + records = extract_records_from_events(events, trace="t") + assert len(records) == 1 + assert records[0].url == "https://a.test/page" + assert records[0].origin == "tool_call" + assert records[0].tool_name == "fetch_page" + assert records[0].tool_args == {"url": "https://a.test/page"} + + +def test_initial_prompt_only_scanned_when_enabled(): + events = [ + {"type": "initial_prompt", "prompt": "background: https://a.test/bg"}, + ] + assert extract_records_from_events(events, trace="forecast_1") == [] + records = extract_records_from_events( + events, trace="summarize", include_initial_prompt=True + ) + assert 
[r.url for r in records] == ["https://a.test/bg"] + assert records[0].origin == "initial_prompt" + assert records[0].tool_name == "" + + +def test_non_dict_events_skipped(): + events = ["garbage", None, {"type": "tool_result", "content": "https://a.test/x"}] + records = extract_records_from_events(events, trace="t") + assert [r.url for r in records] == ["https://a.test/x"] + + +def _write_jsonl(path: Path, events: list[dict]) -> None: + path.write_text("\n".join(json.dumps(e) for e in events), encoding="utf-8") + + +def test_trace_file_uses_summarize_rule(tmp_path: Path): + f = tmp_path / "traces_summarize.jsonl" + _write_jsonl(f, [{"type": "initial_prompt", "prompt": "see https://a.test/r"}]) + records = extract_records_from_trace_file(str(f), run_id="run1", bot="template") + assert [r.url for r in records] == ["https://a.test/r"] + assert records[0].trace == "summarize" + assert records[0].run_id == "run1" + + +def test_trace_file_skips_blank_and_bad_lines(tmp_path: Path): + f = tmp_path / "traces_forecast_1.jsonl" + f.write_text( + '\n{"type": "tool_result", "content": "https://a.test/x"}\nnot json\n', + encoding="utf-8", + ) + records = extract_records_from_trace_file(str(f)) + assert [r.url for r in records] == ["https://a.test/x"] + + +def test_question_dir_reads_metadata_and_builds_url(tmp_path: Path): + qdir = tmp_path / "q_123" + qdir.mkdir() + (qdir / "question.json").write_text( + json.dumps({"question_id": "metac_123", "metaculus_id": 123}), + encoding="utf-8", + ) + _write_jsonl( + qdir / "traces_forecast_1.jsonl", + [{"type": "tool_result", "content": "https://a.test/x"}], + ) + records = extract_records_from_question_dir( + str(qdir), run_id="run1", bot="template" + ) + assert len(records) == 1 + rec = records[0] + assert rec.question_id == "metac_123" + assert rec.metaculus_id == "123" + assert rec.question_url == "https://www.metaculus.com/questions/123/" + + +def test_question_dir_without_metadata_still_emits(tmp_path: Path): + qdir = tmp_path / 
"q_x" + qdir.mkdir() + _write_jsonl( + qdir / "traces_forecast_1.jsonl", + [{"type": "tool_result", "content": "https://a.test/x"}], + ) + records = extract_records_from_question_dir(str(qdir)) + assert [r.url for r in records] == ["https://a.test/x"] + assert records[0].question_id is None + assert records[0].question_url is None + + +def test_harvest_run_walks_bot_and_question_dirs(tmp_path: Path): + run = tmp_path / "run_demo" + qdir = run / "bot_template" / "q_1" + qdir.mkdir(parents=True) + (qdir / "question.json").write_text( + json.dumps({"metaculus_id": 1}), encoding="utf-8" + ) + _write_jsonl( + qdir / "traces_forecast_1.jsonl", + [{"type": "tool_result", "content": "https://a.test/x"}], + ) + records = harvest_run(str(run)) + assert len(records) == 1 + rec = records[0] + assert rec.run_id == "run_demo" + assert rec.bot == "template" + assert rec.metaculus_id == "1" + + +def test_harvest_run_flat_layout_without_bot_dirs(tmp_path: Path): + # Butler-style: //traces_*.jsonl with no bot_* grouping. 
+ run = tmp_path / "s3_backfill" + qdir = run / "2026-05-20_metac_43538" + qdir.mkdir(parents=True) + (qdir / "question.json").write_text( + json.dumps({"metaculus_id": 43538}), encoding="utf-8" + ) + _write_jsonl( + qdir / "traces_forecast_1.jsonl", + [{"type": "tool_result", "content": "https://a.test/x"}], + ) + records = harvest_run(str(run), bot="butler") + assert len(records) == 1 + rec = records[0] + assert rec.bot == "butler" # the flat-layout bot override + assert rec.metaculus_id == "43538" # still read from question.json + + +def test_harvest_run_flat_layout_defaults_bot_to_run_name(tmp_path: Path): + run = tmp_path / "myrun" + qdir = run / "q_only" + qdir.mkdir(parents=True) + _write_jsonl( + qdir / "traces_x.jsonl", + [{"type": "tool_result", "content": "https://a.test/y"}], + ) + records = harvest_run(str(run)) # no bot= -> defaults to run dir name + assert [r.bot for r in records] == ["myrun"] diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py index e018af77..443578bb 100644 --- a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py @@ -38,6 +38,25 @@ def test_dedupes_preserving_order(): assert extract_urls(text) == ["https://a.test", "https://b.test"] +def test_strips_trailing_backslash_escape_residue(): + # Markdown often leaves a trailing backslash, e.g. "Zaporizhzhia\" + assert extract_urls("see https://a.test/search?q=Zaporizhzhia\\ ok") == [ + "https://a.test/search?q=Zaporizhzhia" + ] + + +def test_cuts_markdown_reference_tail_and_keeps_both_urls(): + # The bare scan can glue ")[10](other)" onto a real URL; the tail is cut so + # the first URL is clean, and the genuinely-separate second URL (itself a + # valid markdown link) is still extracted. 
Order follows pattern precedence + # (markdown links before bare URLs), so compare as a set. + text = "https://a.test/story?id=123)[10](https://b.test/other)" + assert set(extract_urls(text)) == { + "https://a.test/story?id=123", + "https://b.test/other", + } + + def test_ignores_non_http_and_empty(): assert extract_urls("ftp://a.test mailto:x@y.test nope") == [] assert extract_urls(None) == [] diff --git a/forecasting_tools/agents_and_tools/source_archive/README.md b/forecasting_tools/agents_and_tools/source_archive/README.md index 4eb2d9ef..fbcb34cf 100644 --- a/forecasting_tools/agents_and_tools/source_archive/README.md +++ b/forecasting_tools/agents_and_tools/source_archive/README.md @@ -43,6 +43,10 @@ Configuration is read from the environment (see the project `.env.template`): | `WEB_ARCHIVE_AWS_PROFILE` | Named AWS profile (e.g. an SSO profile). | default chain | | `WEB_ARCHIVE_TTL_DAYS` | Days before a cached capture is refetched. | `14` | | `FIRECRAWL_API_KEY` | Enables the Firecrawl fallback. | — (fallback off) | +| `WEB_ARCHIVE_FIRECRAWL_PROXY` | Firecrawl proxy mode for hardened sites: `basic` (1 credit) / `auto` / `stealth` (5 credits). | `basic` | +| `HYPERBROWSER_API_KEY` | Enables the Hyperbrowser managed fallback. | — (off) | +| `WEB_ARCHIVE_CLOAKBROWSER_IMPORT` | Module exposing CloakBrowser's `launch()`. | `cloakbrowser` | +| `WEB_ARCHIVE_PDF_MAX_PAGES` | Cap on PDF pages parsed per document. | `50` | AWS credentials use the standard AWS resolution chain — environment variables, a shared config file, or an SSO profile. Nothing secret is committed or baked into @@ -87,13 +91,131 @@ source-archive capture run.jsonl --local ./archive # Capture and upload to S3 (uses WEB_ARCHIVE_S3_BUCKET), plus the manifest itself source-archive capture run.jsonl --upload-manifest --run-id 2026-06-01_demo +# Skip the Hyperbrowser fallback this run; failures are written to a retry +# manifest so you can come back to just those sites later (e.g. with it on). 
+source-archive capture run.jsonl --no-hyperbrowser --run-id demo +source-archive capture demo_needs_retry.jsonl --run-id demo # later, hyperbrowser on + # Build a manifest by harvesting the URLs bots cited on a Metaculus tournament source-archive harvest 32506 --out run.jsonl ``` +Because a failed fetch leaves no cache entry while a success does, re-running the +same manifest only re-attempts the failures — the retry manifest just makes that +explicit and fast (it skips the already-captured majority). + `source-archive` is installed by the extra; the equivalent module form is `python -m forecasting_tools.agents_and_tools.source_archive.cli`. +## Backup backends & the bake-off + +A self-hosted browser is the primary backend and gets ~70% of URLs for ~free, +but two kinds of URL fall through it: **anti-bot/Cloudflare** pages (it detects +the block but can't get past it) and **PDFs** (Chromium downloads them instead of +rendering, so nothing is captured). The package ships these backups, ordered by +marginal cost so the cheap tiers absorb most of the tail: + +| Backend | Cost (2026) | Closes | Notes | +| --- | --- | --- | --- | +| `CloakBrowserFetcher` | ~$0/page (self-host) | Cloudflare | **The primary browser tier when installed** (`pip install cloakbrowser`): patched Chromium that beat vanilla Playwright on Cloudflare in 2026 benchmarks. Only one browser runs — cloak *replaces* vanilla Playwright (two `sync_playwright` instances conflict in one process), falling back to vanilla when cloak isn't installed. | +| `PdfFetcher` | $0 local; ~$0.0008/pg OCR | PDFs | PyMuPDF4LLM locally, falls back to Firecrawl OCR on scanned PDFs. | +| `FirecrawlFetcher` | $0.0008 basic / $0.0042 stealth | Cloudflare + PDFs | Native PDF parser; `WEB_ARCHIVE_FIRECRAWL_PROXY=stealth` for hardened sites. | +| `HyperbrowserFetcher` | $0.001 basic / $0.01 proxy | Cloudflare | Consolidates spend onto a vendor already used elsewhere. No PDF support. 
| + +Selenium was evaluated and **rejected**: it drives the same Chromium as +Playwright, so it bypasses nothing Playwright can't, and its stealth ecosystem +(`undetected-chromedriver`) is now legacy. CloakBrowser/Patchright/nodriver are +the credible self-hosted upgrades. + +To decide which backup(s) to wire in, run the bake-off — it runs each selected +backend independently over the same URLs (not tiered) and reports reliability, +latency, and estimated cost per backend, broken down by category: + +```bash +python -m forecasting_tools.agents_and_tools.source_archive.benchmark \ + --manifest forecasting_tools/agents_and_tools/source_archive/benchmarks/sample_urls.jsonl \ + --backends playwright,cloakbrowser,firecrawl,firecrawl-stealth,hyperbrowser,pdf \ + --out bench.csv +``` + +Backends whose API key or dependency is missing are skipped cleanly. Cost +figures are model estimates (see `PRICING` in `benchmark.py`); tune the credit +rates with `--firecrawl-credit-usd` / `--hyperbrowser-credit-usd` to match your +plan. Swap the sample manifest for a JSONL of your own cited URLs (one +`{"url", "category"}` per line; categories `normal`/`cloudflare`/`pdf`) for a +representative run. + +## Browse what you captured + +A Streamlit viewer reads the manifests + index back out of the store and shows +each captured URL's **screenshot, markdown, and HTML** side by side, filterable +by bot and question: + +```bash +AWS_PROFILE=default WEB_ARCHIVE_S3_BUCKET=metaculus-web-archive \ + streamlit run forecasting_tools/agents_and_tools/source_archive/viewer.py +``` + +It uses the same `ArchiveConfig.from_env()` settings as capture, so it points at +whatever bucket/prefix you captured to (no extra configuration). 
+ +To browse a **local** capture (no S3/AWS), set `WEB_ARCHIVE_LOCAL_DIR` to the +directory you captured into with `--local`: + +```bash +WEB_ARCHIVE_LOCAL_DIR=./archive \ + streamlit run forecasting_tools/agents_and_tools/source_archive/viewer.py +``` + +## The catalog: a browsable, coworker-legible view + +The viewer is interactive (good for us); the **catalog** is a set of static +HTML/CSV pages written into the bucket so a non-technical coworker can browse the +sources without any tooling. It is **question-primary** — the encyclopedia of +every web source used for a question — plus `by-bot/` and `by-domain/` +cross-views, built by joining the manifests with the index: + +```bash +# write catalog/ into the bucket (uses WEB_ARCHIVE_S3_BUCKET) +source-archive catalog +# or against a local capture dir +source-archive catalog --local ./archive +``` + +Start at `catalog/index.html` (or `catalog/READ_ME_FIRST.html` for the plain +explainer). Each source shows its screenshot, who used it (bot + tool), and +whether it was captured; each question also has a CSV. Data/API calls (a bot's +`run_code` pulling a CSV, etc.) are **excluded** from the catalog — it lists web +pages a bot read, not data endpoints — though they remain in the raw manifests. + +## Coverage: what fraction did we archive? + +The catalog shows what we *have*; the **coverage report** shows what we're +*missing*. It's two separate reports, by ingestion path — different denominators, +different ground truth: + +```bash +source-archive coverage # both reports +source-archive coverage --mode trace # just the complex/template bot +source-archive coverage --csv ./cov # also write cov_.csv (+ _missing.txt) +``` + +- **trace** — the complex/template bot's instrumented runs (metac-ai-sdk). Traces + hold *every* URL the bot touched, so this is a true archival success-rate. +- **comments** — every bot (Metaculus's own + outsiders) harvested from public + comments. 
Comments are truncated, so this denominator under-counts — coverage + here means "of the links visible in comments, how many we archived." + +The report is oriented to one question: **are there sources bots are using that +we are not yet archiving?** It leads with that gap, then breaks it down by +question, bot, tool, and the biggest-gap sites, plus the list of sources to +collect. Non-source URLs — search-engine results, `run_code`-style tool/API +calls, and malformed extractor junk — are excluded (same as the catalog). + +If capture runs have persisted their outcomes (`reports/<run_id>.json`, written +automatically by `capture`), the gap is split into **never fetched** (the real +collection gap) vs **fetched but failed** (a capture problem). + ## The manifest: what to feed it A run produces a **citation manifest** — a JSONL file with one record per cited @@ -107,33 +229,49 @@ The pipeline dedupes URLs within the manifest before fetching. ## Where the manifest comes from -You can write a manifest yourself, or generate one from a bot's published -reasoning. Both first-party and third-party bots post their reasoning — with the -source links they used — as comments on Metaculus, so the public, no-auth -Metaculus API is the one ingestion path that works across *every* bot: +You can write a manifest yourself, or generate one from a forecasting bot's +reasoning — the source links a bot used are recorded in the comment it posts and, +more completely, in its run traces. -```python -from forecasting_tools.agents_and_tools.source_archive.ingest import ( - MetaculusCommentHarvester, -) -from forecasting_tools.agents_and_tools.source_archive import manifest +**From the database (operator path).** `harvest-db` reads the URLs a bot cited +straight from the platform's Postgres database and emits a manifest. Point it at +a database (a `postgresql://…` URL works — e.g.
a Neon connection string): -harvester = MetaculusCommentHarvester() # uses METACULUS_API_BASE_URL -records = harvester.harvest_project(32506) # a tournament / project id -manifest.write_file("run.jsonl", records) # -> feed to `capture` +```bash +# one post, or the latest day of activity +source-archive harvest-db --post 29495 --dedupe --out run.jsonl +source-archive harvest-db --days 1 --dedupe --upload --run-id "$(date -u +%F)" ``` -Or in one line from the CLI: `source-archive harvest 32506 --out run.jsonl`. +It reads `comments_comment ⋈ users_user (is_bot)` and emits the same manifest. +`--days` is uncapped by default; `--limit N` caps the row count for spot checks. +`--public-only` restricts to public comments (all comments are read by default). -The lower-level `extract_urls(text)` / `extract_citation_records(...)` helpers in -`ingest.url_extraction` pull URLs out of any markdown/text (markdown links, -autolinks, and bare URLs), if you are ingesting from somewhere other than -comments. +**DSN resolution (keep the credential off disk).** The DSN is resolved in this +order: `--dsn` flag → `$METACULUS_DB_DSN` → macOS Keychain item +`metaculus-db-dsn` → local default `dbname=metaculus`. The DSN is a real secret +(it grants database read access), so prefer the **Keychain** over `.env` / a +shell export — those land in files and shell history that any editor or coding +agent can read. Store it once (you'll be prompted to paste it, so it never +appears in your shell history): -Caveat: comments are length-truncated when posted, so a comment-harvested URL -list can be incomplete versus a bot's full research. For bots you control, an -instrumented trace gives a fuller list; comment harvesting is the universal -baseline. 
+```bash +security add-generic-password -U -a "$USER" -s metaculus-db-dsn -w +# paste the full postgresql://USER:PASS@HOST/dbname?sslmode=require string, return +``` + +For the strongest guard, open **Keychain Access.app → login → `metaculus-db-dsn` +→ Access Control → "Confirm before allowing access"** and clear the always-allow +list. Every read then raises a GUI confirm: a human running the harvest clicks +*Allow* (not *Always Allow*), but an automated agent driving a shell can't. With +that set, the harvester works with no DSN in any file — `source-archive +harvest-db --days 1` just prompts you once per run. + +**From text or traces.** The lower-level `extract_urls(text)` / +`extract_citation_records(...)` helpers in `ingest.url_extraction` pull URLs out +of any markdown/text (markdown links, autolinks, and bare URLs). For bots you +control, an instrumented trace (`ingest-traces`) gives the fullest URL list; a +comment gives a shallower one, since it is length-truncated when posted. ## How it's organized @@ -142,7 +280,8 @@ baseline. | `config.py` | Environment-driven `ArchiveConfig` | | `models.py` | `CaptureResult`, `StoredCapture`, `CitationRecord` | | `ingest/` | Build a manifest: URL extraction + Metaculus DB harvester + trace extraction | -| `fetchers/` | Playwright (primary), Firecrawl (fallback), tiered orchestrator | +| `fetchers/` | Playwright (primary) + CloakBrowser / Hyperbrowser / Firecrawl / PDF backups, tiered orchestrator | +| `benchmark.py` | Backend bake-off: reliability + cost per backend over a manifest | | `quality.py` | Reject 404s, block pages, and thin content before archiving | | `storage/` | `BlobStore` interface with S3 and local backends | | `content_store.py` | `url + content-hash` store with the TTL cache and dedup | @@ -150,12 +289,22 @@ baseline.
| `pipeline.py` | `lookup → fetch → quality gate → store` | | `cli.py` | `source-archive` command | +## Roadmap + +Planned and shipped improvements — smarter dedup (URL canonicalization + +redirect/content aliasing), the coworker-legible catalog, and coverage reports — +are written up in [ROADMAP.md](ROADMAP.md). + ## What lands in storage ``` -/index/.json per-URL capture history +/index/.json per-URL capture history (+ aliases) +/index/by-content/.json reverse index for content dedup /content//.html /content//.webp (screenshot) /content//.md /manifests/.jsonl the run's citation manifest +/reports/.json per-URL capture outcomes (for coverage) +/catalog/index.html browsable catalog (by question/bot/site) +/catalog/by-question/.{html,csv} ``` diff --git a/forecasting_tools/agents_and_tools/source_archive/__init__.py b/forecasting_tools/agents_and_tools/source_archive/__init__.py index 795f4b66..5ede914d 100644 --- a/forecasting_tools/agents_and_tools/source_archive/__init__.py +++ b/forecasting_tools/agents_and_tools/source_archive/__init__.py @@ -29,10 +29,7 @@ from forecasting_tools.agents_and_tools.source_archive.fetchers import ( build_default_fetcher, ) -from forecasting_tools.agents_and_tools.source_archive.ingest import ( - MetaculusCommentHarvester, - extract_urls, -) +from forecasting_tools.agents_and_tools.source_archive.ingest import extract_urls from forecasting_tools.agents_and_tools.source_archive.models import ( CaptureResult, CitationRecord, @@ -51,7 +48,6 @@ "CapturePipeline", "CitationRecord", "ContentStore", - "MetaculusCommentHarvester", "PipelineSummary", "StoreResult", "StoredCapture", diff --git a/forecasting_tools/agents_and_tools/source_archive/benchmark.py b/forecasting_tools/agents_and_tools/source_archive/benchmark.py new file mode 100644 index 00000000..76d79083 --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/benchmark.py @@ -0,0 +1,459 @@ +"""Backend bake-off: run each capture backend independently over the same 
URLs. + +This is the harness for deciding *which* backup to put behind Playwright. Unlike +the production :class:`TieredFetcher` (which stops at the first backend that +passes the quality gate), the benchmark runs **every** selected backend over +**every** URL, so you get an apples-to-apples table of reliability, latency, and +estimated cost per backend — broken down by URL category (normal / cloudflare / +pdf). + +Run it:: + + python -m forecasting_tools.agents_and_tools.source_archive.benchmark \\ + --manifest sample_urls.jsonl \\ + --backends playwright,cloakbrowser,firecrawl,firecrawl-stealth,hyperbrowser,pdf \\ + --out bench.csv + +A backend whose dependency or API key is missing is skipped with a note rather +than failing the whole run, so you can benchmark whatever you have configured. + +Cost figures are ESTIMATES from a documented pricing model (see ``PRICING``, +sourced 2026-06); they are not billed amounts. Override the credit rates via +CLI flags to match your plan. +""" + +from __future__ import annotations + +import argparse +import csv +import io +import json +import logging +import statistics +import sys +import time +from contextlib import nullcontext +from dataclasses import dataclass, field +from pathlib import Path +from typing import Callable + +from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig +from forecasting_tools.agents_and_tools.source_archive.fetchers.base import ( + Fetcher, + FetchError, +) +from forecasting_tools.agents_and_tools.source_archive.fetchers.cloakbrowser_fetcher import ( + CloakBrowserFetcher, +) +from forecasting_tools.agents_and_tools.source_archive.fetchers.firecrawl_fetcher import ( + FirecrawlFetcher, +) +from forecasting_tools.agents_and_tools.source_archive.fetchers.hyperbrowser_fetcher import ( + HyperbrowserFetcher, +) +from forecasting_tools.agents_and_tools.source_archive.fetchers.pdf_fetcher import ( + PdfFetcher, +) +from 
forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import ( + PlaywrightFetcher, +) +from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult +from forecasting_tools.agents_and_tools.source_archive.quality import evaluate + +logger = logging.getLogger(__name__) + +GB = 1_000_000_000 + +# --- Pricing model ----------------------------------------------------------- +# $/unit as of 2026-06, from each vendor's public pricing + this repo's prior +# cost experiment. These are the knobs to adjust for your plan. +# +# - Self-hosted compute (Playwright / CloakBrowser): ~$0.00001/page rendered +# (measured in bot-sources probe). Marginal service fee is effectively $0. +# - Firecrawl: 1 credit basic, 5 credits stealth/"enhanced" proxy. Standard +# plan ≈ $0.00083/credit. +# - Hyperbrowser: 1 credit ($0.001) basic, 10 credits ($0.01) with proxy, +# plus $10/GB proxy bandwidth. 1 credit = $0.001. +# - PDF: PyMuPDF4LLM local = $0; Firecrawl OCR fallback = ~1 credit/PDF page. 
def estimate_cost(
    backend: str, result: CaptureResult, response_bytes: int, pricing: Pricing
) -> float:
    """Return the estimated dollar cost of one successful capture by ``backend``.

    Args:
        backend: Backend name; any name starting with ``firecrawl`` is priced
            as Firecrawl.
        result: The capture; only ``result.metadata`` is read (``None`` is
            treated as an empty mapping).
        response_bytes: Bytes transferred, used only for Hyperbrowser's
            bandwidth charge.
        pricing: The rates to apply.

    Returns:
        The estimate in dollars; ``0.0`` for a locally parsed PDF and for any
        backend name this function does not recognise.
    """
    metadata = result.metadata or {}

    # Self-hosted browsers: flat per-page compute estimate.
    if backend == "playwright" or backend == "cloakbrowser":
        return pricing.self_host_per_page

    # Firecrawl: basic credits for an unset/"basic" proxy, stealth otherwise.
    if backend.startswith("firecrawl"):
        proxy_mode = str(metadata.get("firecrawl_proxy", "basic")).lower()
        if proxy_mode in ("", "basic"):
            n_credits = pricing.firecrawl_basic_credits
        else:
            n_credits = pricing.firecrawl_stealth_credits
        return n_credits * pricing.firecrawl_credit_usd

    # Hyperbrowser: credits (higher when a proxy was used) plus bandwidth.
    if backend == "hyperbrowser":
        if metadata.get("used_proxy"):
            n_credits = pricing.hyperbrowser_proxy_credits
        else:
            n_credits = pricing.hyperbrowser_basic_credits
        bandwidth = (response_bytes / GB) * pricing.hyperbrowser_bandwidth_usd_per_gb
        return n_credits * pricing.hyperbrowser_credit_usd + bandwidth

    # PDF: only the Firecrawl OCR engine costs money, one credit per page.
    if backend == "pdf" and metadata.get("pdf_engine") == "firecrawl":
        page_count = int(metadata.get("pdf_pages") or 1)
        return page_count * pricing.firecrawl_credit_usd

    return 0.0
def _firecrawl_stealth(config: ArchiveConfig) -> FirecrawlFetcher:
    """Build the Firecrawl fetcher used for the ``firecrawl-stealth`` row.

    An unset or ``"basic"`` proxy setting is replaced with ``"auto"`` so this
    row never measures the plain basic path; any other configured value is
    passed through untouched. The operator's config is not mutated — the
    fetcher receives a copy.
    """
    # NOTE(review): the row is labelled "stealth" and is meant to measure the
    # 5-credit cost, yet the override is "auto", not "stealth" — confirm that
    # "auto" is the intended proxy mode here.
    configured = config.firecrawl_proxy
    proxy_mode = "auto" if configured in ("", "basic") else configured
    stealth_config = config.model_copy(update={"firecrawl_proxy": proxy_mode})
    fetcher = FirecrawlFetcher(stealth_config)
    # Rename so results are reported under the bake-off's own row name.
    fetcher.name = "firecrawl-stealth"
    return fetcher
+SAMPLE_MANIFEST: list[dict] = [ + {"url": "https://example.com", "category": "normal"}, + {"url": "https://en.wikipedia.org/wiki/Forecasting", "category": "normal"}, + {"url": "https://www.federalregister.gov/", "category": "normal"}, + # Sites commonly fronted by Cloudflare / anti-bot: + {"url": "https://www.g2.com/", "category": "cloudflare"}, + {"url": "https://www.indeed.com/", "category": "cloudflare"}, + {"url": "https://www.zillow.com/", "category": "cloudflare"}, + # PDFs (the gap Playwright can't render): + {"url": "https://arxiv.org/pdf/1706.03762", "category": "pdf"}, + {"url": "https://bitcoin.org/bitcoin.pdf", "category": "pdf"}, +] + + +@dataclass +class Row: + backend: str + url: str + category: str + passed: bool + reason: str + seconds: float + html_bytes: int + md_bytes: int + screenshot_bytes: int + cost_usd: float + error: str = "" + + +@dataclass +class BackendRun: + name: str + rows: list[Row] = field(default_factory=list) + skipped: str = "" + + +def _sizes(result: CaptureResult) -> tuple[int, int, int]: + html = len(result.html.encode()) if result.html else 0 + md = len(result.markdown.encode()) if result.markdown else 0 + shot = len(result.screenshot) if result.screenshot else 0 + return html, md, shot + + +def run_backend( + spec: BackendSpec, + manifest: list[dict], + config: ArchiveConfig, + pricing: Pricing, +) -> BackendRun: + run = BackendRun(name=spec.name) + if spec.precheck is not None: + reason = spec.precheck(config) + if reason: + run.skipped = reason + logger.warning("%s skipped: %s", spec.name, reason) + return run + try: + fetcher = spec.factory(config) + except Exception as e: # construction (e.g. 
def write_csv(path: str, runs: list[BackendRun]) -> None:
    """Write one CSV row per (backend, URL) attempt to ``path``.

    The file starts with a header row; each following row carries the
    same-named attributes of one ``Row``. Runs with no rows (e.g. skipped
    backends) contribute nothing.

    Args:
        path: Destination file; created or overwritten, encoded as UTF-8.
        runs: Backend runs whose ``rows`` are written in order.
    """
    # Column names double as the attribute names read from each row, so the
    # header and the data cannot drift apart.
    columns = (
        "backend",
        "url",
        "category",
        "passed",
        "reason",
        "seconds",
        "html_bytes",
        "md_bytes",
        "screenshot_bytes",
        "cost_usd",
        "error",
    )
    # newline="" leaves line endings to the csv module. Writing csv output
    # through a text-mode file with default newline translation turns its
    # "\r\n" terminators into "\r\r\n" on Windows.
    with open(path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.writer(fh)
        writer.writerow(columns)
        for run in runs:
            writer.writerows(
                [getattr(row, column) for column in columns] for row in run.rows
            )
def load_manifest(path: str | None) -> list[dict]:
    """Load the benchmark manifest: one JSON object per non-blank line.

    Args:
        path: JSONL file of ``{"url", "category"}`` records. When falsy, a copy
            of the built-in ``SAMPLE_MANIFEST`` is returned instead.

    Returns:
        The records in file order.

    Raises:
        json.JSONDecodeError: A line is not valid JSON (message prefixed with
            ``path:line``).
        ValueError: A line is valid JSON but not an object with a non-empty
            string ``"url"``.
    """
    if not path:
        # Copy so a caller mutating the result cannot alter the module constant.
        return list(SAMPLE_MANIFEST)

    records: list[dict] = []
    text = Path(path).read_text(encoding="utf-8")
    for lineno, line in enumerate(text.splitlines(), start=1):
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError as e:
            # Same exception type, but say which manifest line is broken.
            raise json.JSONDecodeError(
                f"{path}:{lineno}: {e.msg}", e.doc, e.pos
            ) from e
        # Fail here with context: a record without "url" would otherwise raise
        # a bare KeyError deep inside the capture loop and abort the run.
        url = record.get("url") if isinstance(record, dict) else None
        if not isinstance(url, str) or not url:
            raise ValueError(
                f'{path}:{lineno}: expected a JSON object with a non-empty "url"'
            )
        records.append(record)
    return records
Choose from {list(BACKENDS)}") + + runs: list[BackendRun] = [] + for name in selected: + print(f"running {name} over {len(manifest)} URLs...", file=sys.stderr) + runs.append(run_backend(BACKENDS[name], manifest, config, pricing)) + + write_csv(args.out, runs) + print("\n" + summarize(runs, args.urls_per_question, args.tail_share)) + print(f"\nper-URL detail written to {args.out}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/forecasting_tools/agents_and_tools/source_archive/canonicalize.py b/forecasting_tools/agents_and_tools/source_archive/canonicalize.py new file mode 100644 index 00000000..b791a47e --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/canonicalize.py @@ -0,0 +1,115 @@ +"""Canonicalize URLs so trivially-different links collapse to one dedup key. + +Every capture of a page is grouped under ``url_hash`` (see :mod:`models`). +Historically that hashed the *raw* URL string, so ``…/x``, ``…/x/``, +``…/x?utm_source=…`` and ``…/x#frag`` were four different "sources" — inflating +both storage and any "how many sources have we covered" count. + +This module normalizes away differences that do **not** change *which page* you +get, so the dedup key is stable across those variants: + + - lowercase scheme + host, strip a default port (``:80`` / ``:443``) + - drop the fragment (``#…``) + - drop known analytics / click-tracking query params, then sort the rest + - normalize a trailing slash (``…/x/`` -> ``…/x``; root collapses to no path) + +It is deliberately conservative. It does **not** upgrade ``http`` -> ``https`` or +strip ``www.``: those can resolve to genuinely different pages on some hosts, so +collapsing them belongs to a later, opt-in phase (see ``ROADMAP.md``). +""" + +from __future__ import annotations + +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit + +# Query params that are analytics/click tracking and never select the page. 
# Matched case-insensitively; any key starting with a prefix below is also
# dropped. Bare ``ref`` / ``source`` are intentionally left alone — they are too
# often load-bearing (API refs, content selectors) to drop blindly.
_TRACKING_PARAMS = frozenset(
    {
        "gclid",
        "gclsrc",
        "dclid",
        "gbraid",
        "wbraid",
        "fbclid",
        "msclkid",
        "yclid",
        "twclid",
        "mc_eid",
        "mc_cid",
        "_hsenc",
        "_hsmi",
        "igshid",
        "igsh",
        "vero_id",
        "vero_conv",
        "oly_anon_id",
        "oly_enc_id",
        "spm",
        "scm",
        "ref_src",
        "ref_url",
    }
)
_TRACKING_PREFIXES = ("utm_",)

_DEFAULT_PORTS = {"http": "80", "https": "443"}


def _is_tracking(key: str) -> bool:
    """Return True when query key ``key`` is a known tracking parameter."""
    k = key.lower()
    return k in _TRACKING_PARAMS or k.startswith(_TRACKING_PREFIXES)


def canonicalize_url(url: str) -> str:
    """Return a normalized form of ``url`` to use as a dedup key.

    Idempotent — ``canonicalize_url(canonicalize_url(u)) == canonicalize_url(u)``.
    Non-http(s) or unparsable input is returned stripped but otherwise as-is
    (e.g. ``mailto:``, relative paths), so callers can pass anything safely.
    "Unparsable" includes a URL with no host or with a port that is not a
    valid integer (``http://a.test:bad/``): those never raise.

    Args:
        url: The URL as cited; falsy input is returned unchanged.

    Returns:
        The canonical URL: lowercase scheme and host, default port removed,
        fragment and tracking params dropped, remaining query params sorted,
        and a trailing slash stripped from the path.
    """
    if not url:
        return url
    raw = url.strip()
    try:
        parts = urlsplit(raw)
        # ``port`` is parsed lazily and raises ValueError for a non-numeric or
        # out-of-range value, so read it inside the guard.
        port = parts.port
    except ValueError:
        return raw

    host = (parts.hostname or "").lower()
    # A netloc with no host (e.g. "http://:80/") cannot be normalized.
    if parts.scheme not in ("http", "https") or not host:
        return raw

    scheme = parts.scheme.lower()

    # netloc: lowercase host (bracket IPv6), keep userinfo, strip default port.
    if ":" in host:  # IPv6 literal
        host = f"[{host}]"
    netloc = host
    if parts.username is not None:
        auth = parts.username
        if parts.password is not None:
            auth += f":{parts.password}"
        netloc = f"{auth}@{netloc}"
    if port is not None and str(port) != _DEFAULT_PORTS.get(scheme):
        netloc += f":{port}"

    # path: collapse the bare root to empty; drop a trailing slash otherwise.
    path = parts.path
    if path in ("", "/"):
        path = ""
    elif path.endswith("/"):
        path = path.rstrip("/")

    # query: drop tracking params, then sort so order doesn't matter.
    kept = sorted(
        (k, v)
        for k, v in parse_qsl(parts.query, keep_blank_values=True)
        if not _is_tracking(k)
    )
    query = urlencode(kept)

    # fragment: always dropped.
    return urlunsplit((scheme, netloc, path, query, ""))
# Tools that fetch data/API endpoints, not human-readable web pages. A URL only
# ever touched by one of these is a data call (e.g. a bot's run_code pulling a
# CSV), so it is kept out of the page-oriented catalog (it stays in the raw
# manifests). A URL also seen via search/page-fetch is treated as a real page.
_NON_PAGE_TOOLS = {
    "run_code",
    "code",
    "python",
    "run_python",
    "code_interpreter",
    "execute_code",
    "bash",
    "shell",
}


def tool_call_only(citations: list) -> bool:
    """Return True when a URL was touched *only* by code-execution tools.

    Tool names are compared case-insensitively and citations with no tool name
    are ignored. The result is True only if at least one named tool exists and
    every named tool is a code tool — i.e. the URL is a data/API call, not a
    page a bot read.
    """
    names = [(citation.tool_name or "").lower() for citation in citations]
    named = [name for name in names if name]
    return bool(named) and all(name in _NON_PAGE_TOOLS for name in named)
+_SEARCH_HOSTS = { + "duckduckgo.com", + "bing.com", + "search.brave.com", + "search.yahoo.com", + "ecosia.org", + "startpage.com", + "baidu.com", + "ask.com", + "qwant.com", + "search.marginalia.nu", + "kagi.com", +} +# Percent-encoded junk that means the extractor swallowed markdown / a second URL +# / control chars into the URL (legacy captures from before extraction hardening). +_MALFORMED_MARKERS = ("%5b", "%5d", "%5c", "%0a", "%0d", "%28http", "%29%5b") + + +def is_search_url(url: str) -> bool: + host = urlsplit(url).netloc.lower() + host = host[4:] if host.startswith("www.") else host + return host in _SEARCH_HOSTS or host == "google.com" or host.startswith("google.") + + +def is_malformed_url(url: str) -> bool: + low = url.lower() + return url.count("://") > 1 or any(m in low for m in _MALFORMED_MARKERS) + + +def exclusion_reason(url: str, citations: list) -> str | None: + """Why a cited URL is kept out of the page catalog / coverage, or ``None`` to + keep it. ``malformed`` (extractor junk), ``search`` (search-engine results), + ``tool_call`` (data/API endpoint touched only by code tools).""" + if is_malformed_url(url): + return "malformed" + if is_search_url(url): + return "search" + if tool_call_only(citations): + return "tool_call" + return None + + +class Citation(BaseModel): + bot: str | None = None + question_id: str | None = None + question_url: str | None = None + run_id: str | None = None + tool_name: str | None = None + origin: str | None = None + query: str | None = None + cited_url: str = "" # the original URL as cited (pre-canonicalization) + + +class Source(BaseModel): + canonical_url: str + domain: str + captured: bool = False + content_hash: str | None = None + html_key: str | None = None # store-relative (no prefix) + screenshot_key: str | None = None + markdown_key: str | None = None + citations: list[Citation] = [] + + @property + def bots(self) -> list[str]: + return sorted({c.bot for c in self.citations if c.bot}) + + @property + def 
question_ids(self) -> list[str]: + return sorted({c.question_id for c in self.citations if c.question_id}) + + +class CatalogData(BaseModel): + sources: list[Source] = [] + excluded: dict[str, int] = {} # exclusion reason -> count of URLs dropped + + @property + def hidden_total(self) -> int: + return sum(self.excluded.values()) + + def by_question(self) -> dict[str, list[Source]]: + out: dict[str, list[Source]] = defaultdict(list) + for s in self.sources: + qids = s.question_ids or [_UNKNOWN_Q] + for qid in qids: + out[qid].append(s) + return out + + def by_bot(self) -> dict[str, list[Source]]: + out: dict[str, list[Source]] = defaultdict(list) + for s in self.sources: + for bot in s.bots or ["(no bot)"]: + out[bot].append(s) + return out + + def by_domain(self) -> dict[str, list[Source]]: + out: dict[str, list[Source]] = defaultdict(list) + for s in self.sources: + out[s.domain].append(s) + return out + + def question_url(self, qid: str) -> str | None: + for s in self.sources: + for c in s.citations: + if c.question_id == qid and c.question_url: + return c.question_url + return None + + +# --------------------------------------------------------------------------- # +# Build (join manifests + index) +# --------------------------------------------------------------------------- # +def _domain(url: str) -> str: + host = urlsplit(url).netloc.lower() + return host[4:] if host.startswith("www.") else host + + +def _strip_prefix(key: str | None, prefix: str) -> str | None: + if not key: + return None + p = prefix.rstrip("/") + "/" + return key[len(p) :] if key.startswith(p) else key + + +def _latest_capture(store: ContentStore, canonical_url: str) -> dict | None: + """Return the latest stored capture dict for a URL (ignoring TTL), following + a redirect alias if present. 
``None`` if nothing was ever captured.""" + index = store._read_index(url_hash(canonical_url)) + if not index: + return None + if index.get("alias_of"): + index = store._read_index(index["alias_of"]) + if not index: + return None + ch = index.get("latest_content_hash") + return (index.get("captures") or {}).get(ch) + + +def _load_all_records(store: BlobStore, prefix: str) -> list[CitationRecord]: + records: list[CitationRecord] = [] + for key in store.list_keys(f"{prefix.rstrip('/')}/manifests/"): + if not key.endswith(".jsonl"): + continue + try: + records.extend(manifest_io.loads(store.get(key).decode("utf-8"))) + except (UnicodeDecodeError, ValueError): + continue + return records + + +def build_sources(store: BlobStore, config: ArchiveConfig) -> list[Source]: + """Join every manifest with the index into one ``Source`` per canonical URL. + + Unfiltered (includes tool/API-call URLs) so other tools — e.g. the coverage + report — can classify them. The catalog itself filters these out. + """ + prefix = config.s3_prefix.rstrip("/") + cstore = ContentStore(store, config) + records = _load_all_records(store, prefix) + + grouped: dict[str, list[CitationRecord]] = defaultdict(list) + for r in records: + if r.url: + grouped[canonicalize_url(r.url)].append(r) + + sources: list[Source] = [] + for canonical, recs in sorted(grouped.items()): + cap = _latest_capture(cstore, canonical) + source = Source( + canonical_url=canonical, + domain=_domain(canonical) or "(unknown)", + captured=cap is not None, + content_hash=(cap or {}).get("content_hash"), + html_key=_strip_prefix((cap or {}).get("html_key"), prefix), + screenshot_key=_strip_prefix((cap or {}).get("screenshot_key"), prefix), + markdown_key=_strip_prefix((cap or {}).get("markdown_key"), prefix), + citations=[ + Citation( + bot=r.bot, + question_id=r.question_id or r.metaculus_id, + question_url=r.question_url, + run_id=r.run_id, + tool_name=r.tool_name, + origin=r.origin, + query=r.query, + cited_url=r.url, + ) + for r 
in recs + ], + ) + sources.append(source) + return sources + + +def build_catalog(store: BlobStore, config: ArchiveConfig) -> CatalogData: + sources = build_sources(store, config) + pages: list[Source] = [] + excluded: dict[str, int] = defaultdict(int) + for s in sources: + reason = exclusion_reason(s.canonical_url, s.citations) + if reason: + excluded[reason] += 1 + else: + pages.append(s) + return CatalogData(sources=pages, excluded=dict(excluded)) + + +# --------------------------------------------------------------------------- # +# Render +# --------------------------------------------------------------------------- # +_CSS = """ +body{font:14px/1.5 -apple-system,Segoe UI,Roboto,sans-serif;margin:0;color:#1a1a1a;background:#fafafa} +header{background:#1f2937;color:#fff;padding:16px 24px} +header a{color:#cbd5e1} +h1{font-size:20px;margin:0 0 4px} +.wrap{padding:24px;max-width:1100px;margin:0 auto} +.muted{color:#6b7280} +.badge{display:inline-block;font-size:11px;padding:1px 7px;border-radius:10px} +.ok{background:#dcfce7;color:#166534}.no{background:#fee2e2;color:#991b1b} +.card{background:#fff;border:1px solid #e5e7eb;border-radius:8px;padding:12px;margin:12px 0;display:flex;gap:12px} +.card img{width:160px;height:110px;object-fit:cover;object-position:top;border:1px solid #e5e7eb;border-radius:4px;background:#f3f4f6} +.card .meta{flex:1;min-width:0} +.card .u{font-weight:600;word-break:break-all} +.tags{margin-top:6px} +.tag{display:inline-block;background:#eef2ff;color:#3730a3;font-size:11px;padding:1px 7px;border-radius:10px;margin:2px 4px 2px 0} +.links a{margin-right:10px;font-size:12px} +table{border-collapse:collapse;width:100%;background:#fff} +td,th{border:1px solid #e5e7eb;padding:6px 8px;text-align:left;font-size:13px} +th{background:#f3f4f6} +a.grid{display:inline-block;margin:4px 12px 4px 0} +""" + + +def _esc(s) -> str: + return html.escape(str(s)) if s is not None else "" + + +def _page(title: str, body: str, rel_root: str) -> str: + return ( 
+ "" + f"{_esc(title)}" + f"

Source Archive

" + f"← catalog home
" + f"
{body}
"
    )


class Linker:
    """Turns a store-relative blob key into a link a coworker can open.

    For an S3-backed store the link is an absolute ``https://`` URL on the
    bucket's ``amazonaws.com`` host; for any other store it is a path relative
    to the page being rendered.
    """

    def __init__(self, store: BlobStore, config: ArchiveConfig):
        """Record the store kind and the bucket/region/prefix used to build links."""
        # Imported here rather than at module top (presumably to avoid pulling in
        # the S3 backend on import — TODO confirm).
        from forecasting_tools.agents_and_tools.source_archive.storage import (
            S3BlobStore,
        )

        self.is_s3 = isinstance(store, S3BlobStore)
        self.bucket = config.s3_bucket
        self.region = config.aws_region
        self.prefix = config.s3_prefix.rstrip("/")

    def url(self, rel_key: str | None, rel_root: str) -> str | None:
        """Return a link for ``rel_key``, or ``None`` when there is no key.

        ``rel_root`` is the relative path from the current page back to the
        prefix directory (e.g. ``../../``); it is only used for non-S3 stores.
        """
        if not rel_key:
            return None
        if self.is_s3:
            # Use the regional host when a region is configured, else the
            # region-less bucket host.
            host = (
                f"{self.bucket}.s3.{self.region}.amazonaws.com"
                if self.region
                else f"{self.bucket}.s3.amazonaws.com"
            )
            return f"https://{host}/{self.prefix}/{rel_key}"
        return f"{rel_root}{rel_key}"  # local: relative within the prefix dir


def _source_card(s: Source, linker: Linker, rel_root: str) -> str:
    shot = linker.url(s.screenshot_key, rel_root)
    html_link = linker.url(s.html_key, rel_root)
    md_link = linker.url(s.markdown_key, rel_root)
    badge = (
        "captured"
        if s.captured
        else "not captured"
    )
    img = (
        f"screenshot"
        if shot
        else "
" + ) + tools = sorted({c.tool_name for c in s.citations if c.tool_name}) + tags = "".join(f"{_esc(b)}" for b in s.bots) + tool_tags = "".join(f"{_esc(t)}" for t in tools) + links = [] + if html_link: + links.append(f"HTML") + if md_link: + links.append(f"markdown") + if shot: + links.append(f"screenshot") + links.append(f"live ↗") + return ( + f"
{img}
" + f"
{_esc(s.canonical_url)}
" + f"
{_esc(s.domain)} · {badge}
" + f"
{tags}{tool_tags}
" + f"" + f"
" + ) + + +def _question_csv(sources: list[Source]) -> str: + buf = io.StringIO() + w = csv.writer(buf) + w.writerow(["url", "domain", "captured", "bots", "tools", "screenshot_key"]) + for s in sources: + tools = sorted({c.tool_name for c in s.citations if c.tool_name}) + w.writerow( + [ + s.canonical_url, + s.domain, + "yes" if s.captured else "no", + "; ".join(s.bots), + "; ".join(tools), + s.screenshot_key or "", + ] + ) + return buf.getvalue() + + +# --------------------------------------------------------------------------- # +# Write +# --------------------------------------------------------------------------- # +class CatalogSummary(BaseModel): + sources: int = 0 + captured: int = 0 + questions: int = 0 + bots: int = 0 + domains: int = 0 + excluded: dict[str, int] = {} + + def __str__(self) -> str: + excl = sum(self.excluded.values()) + breakdown = ( + " (" + ", ".join(f"{k}={v}" for k, v in sorted(self.excluded.items())) + ")" + if self.excluded + else "" + ) + return ( + f"Catalog: {self.sources} page sources ({self.captured} captured) across " + f"{self.questions} questions, {self.bots} bots, {self.domains} domains " + f"— {excl} non-page URLs excluded{breakdown}" + ) + + +def _slug(value: str) -> str: + # Keep dots so domains stay readable (a.test.html); collapse anything else. + keep = [c if c.isalnum() or c in "-_." else "-" for c in value] + out = "".join(keep).strip("-.").replace("..", ".")[:80] + return out or "x" + + +def write_catalog( + store: BlobStore, + config: ArchiveConfig, + out_store: BlobStore | None = None, +) -> CatalogSummary: + """Build the catalog from ``store`` and write it to ``out_store`` (default: + ``store``). 
Pass a separate ``out_store`` to preview a live bucket's catalog + into a local directory without mutating the bucket.""" + prefix = config.s3_prefix.rstrip("/") + data = build_catalog(store, config) + out = out_store or store + linker = Linker(out, config) + + def put(rel: str, body: str, ctype: str) -> None: + out.put(f"{prefix}/catalog/{rel}", body.encode("utf-8"), content_type=ctype) + + by_q = data.by_question() + by_b = data.by_bot() + by_d = data.by_domain() + + # Per-question pages (the encyclopedia) + CSVs. rel_root: catalog// -> ../../ + rr2 = "../../" + for qid, sources in sorted(by_q.items()): + sources = sorted(sources, key=lambda s: s.canonical_url) + qurl = data.question_url(qid) + head = f"

Question {_esc(qid)}

" + if qurl: + head += f"

{_esc(qurl)} ↗

" + head += ( + f"

{len(sources)} source(s); " + f"{sum(s.captured for s in sources)} captured · " + f"download CSV

" + ) + cards = "".join(_source_card(s, linker, rr2) for s in sources) + put( + f"by-question/{_slug(qid)}.html", + _page(f"Question {qid}", head + cards, rr2), + "text/html", + ) + put(f"by-question/{_slug(qid)}.csv", _question_csv(sources), "text/csv") + + # Per-bot and per-domain cross-views. + for bot, sources in sorted(by_b.items()): + sources = sorted(sources, key=lambda s: s.canonical_url) + body = f"

Bot: {_esc(bot)}

{len(sources)} source(s)

" + body += "".join(_source_card(s, linker, rr2) for s in sources) + put(f"by-bot/{_slug(bot)}.html", _page(f"Bot {bot}", body, rr2), "text/html") + + for domain, sources in sorted(by_d.items()): + sources = sorted(sources, key=lambda s: s.canonical_url) + body = f"

Site: {_esc(domain)}

{len(sources)} source(s)

" + body += "".join(_source_card(s, linker, rr2) for s in sources) + put( + f"by-domain/{_slug(domain)}.html", + _page(f"Site {domain}", body, rr2), + "text/html", + ) + + # Landing + readme. rel_root: catalog/ -> ../ + rr1 = "../" + index_body = _index_body(data, by_q, by_b, by_d) + put("index.html", _page("Catalog", index_body, rr1), "text/html") + put("READ_ME_FIRST.html", _page("Read me first", _readme_body(), rr1), "text/html") + + return CatalogSummary( + sources=len(data.sources), + captured=sum(s.captured for s in data.sources), + questions=len(by_q), + bots=len(by_b), + domains=len(by_d), + excluded=data.excluded, + ) + + +def _index_body(data, by_q, by_b, by_d) -> str: + captured = sum(s.captured for s in data.sources) + + def links(items: dict, view: str) -> str: + rows = [] + for key, sources in sorted(items.items(), key=lambda kv: (-len(kv[1]), kv[0])): + rows.append( + f"" + f"{_esc(key)} ({len(sources)})" + ) + return "".join(rows) + + hidden_note = ( + f" · {data.hidden_total} non-page URLs hidden " + f"({', '.join(f'{k} {v}' for k, v in sorted(data.excluded.items()))})" + if data.hidden_total + else "" + ) + return ( + f"

What is this? →

" + f"

{len(data.sources)} page sources ({captured} captured) · " + f"{len(by_q)} questions · {len(by_b)} bots · {len(by_d)} sites{hidden_note}

" + f"

By question

The encyclopedia of sources per " + f"question — start here.

{links(by_q, 'by-question')}" + f"

By bot

{links(by_b, 'by-bot')}" + f"

By site

{links(by_d, 'by-domain')}" + ) + + +def _readme_body() -> str: + return ( + "

What is this bucket?

" + "

This is a source archive: for every web page a forecasting bot " + "cited, we save a snapshot — the page's HTML, a full-page " + "screenshot, and a clean markdown copy — so a forecast can be " + "audited later even if the original page changes or disappears.

" + "

How to browse it

" + "
    " + "
  • Open index.html (the catalog home).
  • " + "
  • By question is the main view: pick a question to see every " + "source used for it, who used it, and a screenshot of each.
  • " + "
  • By bot shows one bot's sources across questions; By site " + "groups sources by website.
  • " + "
  • Each question also has a CSV you can open in a spreadsheet.
  • " + "
" + "

The folders with long hash names (content/, index/) are " + "the machine-readable store — you don't need to open those.

" + ) diff --git a/forecasting_tools/agents_and_tools/source_archive/cli.py b/forecasting_tools/agents_and_tools/source_archive/cli.py index c2eed8db..d5ec2545 100644 --- a/forecasting_tools/agents_and_tools/source_archive/cli.py +++ b/forecasting_tools/agents_and_tools/source_archive/cli.py @@ -24,7 +24,6 @@ from forecasting_tools.agents_and_tools.source_archive.fetchers import ( build_default_fetcher, ) -from forecasting_tools.agents_and_tools.source_archive.pipeline import CapturePipeline def _load_dotenv() -> None: @@ -69,6 +68,11 @@ def _cmd_check(config: ArchiveConfig) -> int: print(f" AWS profile : {config.aws_profile or '(default chain)'}") print(f" AWS region : {config.aws_region or '(default)'}") print(f" Firecrawl API key : {_mask(config.firecrawl_api_key)}") + print(f" Firecrawl proxy mode : {config.firecrawl_proxy}") + print(f" Hyperbrowser API key : {_mask(config.hyperbrowser_api_key)}") + print(f" Hyperbrowser proxy : {config.hyperbrowser_use_proxy}") + print(f" CloakBrowser module : {config.cloakbrowser_import}") + print(f" PDF max pages : {config.pdf_max_pages}") print(f" TTL (days) : {config.ttl_days}") print(f" Screenshot format : {config.screenshot_format}") print(f" Screenshot max height: {config.screenshot_max_height}") @@ -76,19 +80,64 @@ def _cmd_check(config: ArchiveConfig) -> int: def _cmd_capture(args, config: ArchiveConfig) -> int: + from forecasting_tools.agents_and_tools.source_archive.manifest import unique_urls + from forecasting_tools.agents_and_tools.source_archive.pipeline import ( + capture_urls_concurrent, + ) + records = manifest_io.read_file(args.manifest) + + overrides = {} + if getattr(args, "no_hyperbrowser", False): + overrides["hyperbrowser_api_key"] = None + if getattr(args, "concurrency", None): + overrides["concurrency"] = args.concurrency + if overrides: + config = config.model_copy(update=overrides) + if "hyperbrowser_api_key" in overrides: + print("Hyperbrowser fallback DISABLED for this run.") + store = 
ContentStore(_make_blob_store(config, args.local, args.bucket), config) + urls = list(unique_urls(records)) + if args.limit: + urls = urls[: args.limit] target = args.local or f"s3://{args.bucket or config.s3_bucket}/{config.s3_prefix}" - print(f"Capturing {len(records)} citation record(s) -> {target}") + print( + f"Capturing {len(urls)} URL(s) at concurrency {config.concurrency} -> {target}" + ) - with build_default_fetcher(config) as fetcher: - pipeline = CapturePipeline(fetcher, store) - summary = pipeline.run_manifest(records) + summary = capture_urls_concurrent(urls, store, config, build_default_fetcher) print(summary) + run_id = args.run_id or (records[0].run_id if records else None) + if run_id: + from forecasting_tools.agents_and_tools.source_archive import reports + + reports.write_run_report(store.blobs, run_id, summary, config) + print(f"Wrote run outcomes -> {config.s3_prefix}/reports/{run_id}.json") + + # Failures leave no cache entry, so re-running retries exactly them. Write a + # retry manifest (with provenance) so coming back — e.g. with hyperbrowser + # re-enabled — is one command over only the sites that still need it. 
+ failed = { + o.url for o in summary.outcomes if o.status in ("quality_failed", "error") + } + if failed: + from forecasting_tools.agents_and_tools.source_archive.ingest import ( + dedupe_records, + ) + + retry_records = dedupe_records(r for r in records if r.url in failed) + retry_path = args.retry_out or f"{run_id or 'run'}_needs_retry.jsonl" + manifest_io.write_file(retry_path, retry_records) + print( + f"{len(failed)} URL(s) failed -> retry manifest {retry_path}\n" + f" come back later with: source-archive capture {retry_path} " + f"--run-id {run_id or ''} (hyperbrowser on by default)" + ) + if args.upload_manifest: - run_id = args.run_id or (records[0].run_id if records else None) if not run_id: sys.exit("--upload-manifest needs --run-id (no run_id found in records)") manifest_io.write_blob(store.blobs, run_id, records, config) @@ -96,19 +145,71 @@ def _cmd_capture(args, config: ArchiveConfig) -> int: return 0 -def _cmd_harvest(args, config: ArchiveConfig) -> int: +def _cmd_ingest_traces(args, config: ArchiveConfig) -> int: from forecasting_tools.agents_and_tools.source_archive.ingest import ( - MetaculusCommentHarvester, + dedupe_records, + harvest_run, ) - run_id = args.run_id or f"metaculus-comments-{args.project_id}" - harvester = MetaculusCommentHarvester() - records = harvester.harvest_project(args.project_id, run_id=run_id) - print( - f"Harvested {len(records)} citation record(s) from project " - f"{args.project_id}" + run_id = args.run_id # None -> derived from the run dir name + records = harvest_run(args.run_dir, run_id=run_id, bot=args.bot) + if args.dedupe: + records = dedupe_records(records) + run_id = run_id or (records[0].run_id if records else None) + print(f"Ingested {len(records)} citation record(s) from traces in {args.run_dir}") + + out_path = args.out or f"{run_id or 'traces'}.jsonl" + if not args.upload or args.out: + manifest_io.write_file(out_path, records) + print(f"Wrote manifest -> {out_path}") + if args.upload: + if not run_id: + 
sys.exit("--upload needs a run id (pass --run-id; none found in records)") + store = _make_blob_store(config, None, args.bucket) + manifest_io.write_blob(store, run_id, records, config) + print(f"Uploaded manifest -> {config.s3_prefix}/manifests/{run_id}.jsonl") + return 0 + + +def _cmd_catalog(args, config: ArchiveConfig) -> int: + from forecasting_tools.agents_and_tools.source_archive.catalog import write_catalog + + store = _make_blob_store(config, args.local, args.bucket) + target = args.local or f"s3://{args.bucket or config.s3_bucket}/{config.s3_prefix}" + print(f"Building catalog from manifests + index -> {target}/catalog/") + summary = write_catalog(store, config) + print(summary) + print(f"Open {config.s3_prefix}/catalog/index.html") + return 0 + + +def _cmd_harvest_db(args, config: ArchiveConfig) -> int: + from forecasting_tools.agents_and_tools.source_archive.ingest import ( + MetaculusDbHarvester, + dedupe_records, + resolve_dsn, ) + dsn = resolve_dsn(args.dsn) + include_private = not args.public_only + harvester = MetaculusDbHarvester.from_dsn(dsn) + if args.post: + records = harvester.harvest_post( + args.post, run_id=args.run_id, include_private=include_private + ) + run_id = args.run_id or f"metaculus-db-post-{args.post}" + else: + records = harvester.harvest_recent( + days=args.days, + limit=args.limit, + run_id=args.run_id, + include_private=include_private, + ) + run_id = args.run_id or f"metaculus-db-recent-{args.days}d" + if args.dedupe: + records = dedupe_records(records) + print(f"Harvested {len(records)} citation record(s) from the Metaculus DB") + out_path = args.out or f"{run_id}.jsonl" if not args.upload or args.out: manifest_io.write_file(out_path, records) @@ -120,6 +221,35 @@ def _cmd_harvest(args, config: ArchiveConfig) -> int: return 0 +def _cmd_coverage(args, config: ArchiveConfig) -> int: + from pathlib import Path + + from forecasting_tools.agents_and_tools.source_archive import reports + from 
forecasting_tools.agents_and_tools.source_archive.catalog import build_sources + from forecasting_tools.agents_and_tools.source_archive.coverage import ( + MODES, + coverage_from_sources, + ) + + store = _make_blob_store(config, args.local, args.bucket) + sources = build_sources(store, config) # read manifests + index once + outcomes = reports.read_outcomes(store, config) or None + modes = MODES if args.mode == "both" else (args.mode,) + for mode in modes: + report = coverage_from_sources(sources, mode, outcomes) + print(report) + print() + if args.csv: + Path(f"{args.csv}_{mode}.csv").write_text(report.to_csv()) + print(f"Wrote {args.csv}_{mode}.csv") + if report.missing_urls: + Path(f"{args.csv}_{mode}_missing.txt").write_text( + "\n".join(report.missing_urls) + ) + print(f"Wrote {args.csv}_{mode}_missing.txt") + return 0 + + def main(argv: list[str] | None = None) -> int: _load_dotenv() parser = argparse.ArgumentParser( @@ -145,22 +275,110 @@ def main(argv: list[str] | None = None) -> int: help="also upload the manifest itself to manifests/.jsonl", ) cap.add_argument("--run-id", help="run id for the uploaded manifest") + cap.add_argument( + "--no-hyperbrowser", + action="store_true", + help="disable the Hyperbrowser fallback for this run (others still run)", + ) + cap.add_argument( + "--retry-out", + metavar="FILE", + help="where to write the failed-URL retry manifest " + "(default: _needs_retry.jsonl)", + ) + cap.add_argument( + "--concurrency", + type=int, + metavar="N", + help="parallel browser workers for this run (overrides WEB_ARCHIVE_CONCURRENCY)", + ) + cap.add_argument( + "--limit", + type=int, + metavar="N", + help="capture only the first N URLs (chunk a big manifest; resume via cache)", + ) - harv = sub.add_parser( - "harvest", - help="harvest cited URLs from bot comments on a Metaculus project", + ing = sub.add_parser( + "ingest-traces", + help="build a manifest from a traced bot run directory (bot_*/q_*/traces_*.jsonl)", ) - 
harv.add_argument("project_id", help="Metaculus project / tournament id") - harv.add_argument( + ing.add_argument("run_dir", help="path to a traced run directory") + ing.add_argument( "--out", metavar="FILE", help="write the manifest to this .jsonl file" ) - harv.add_argument( - "--run-id", help="run id (default: metaculus-comments-)" + ing.add_argument("--run-id", help="run id (default: the run dir's name)") + ing.add_argument( + "--bot", + help="bot name for a flat (no bot_*/) layout (default: the run dir's name)", + ) + ing.add_argument( + "--dedupe", action="store_true", help="keep one record per URL (first seen)" ) - harv.add_argument( + ing.add_argument( "--upload", action="store_true", help="upload the manifest to S3 manifests/" ) - harv.add_argument("--bucket", help="override the S3 bucket") + ing.add_argument("--bucket", help="override the S3 bucket") + + cat = sub.add_parser( + "catalog", + help="generate a coworker-legible HTML/CSV catalog (by question/bot/site)", + ) + cat.add_argument( + "--local", metavar="DIR", help="read/write the catalog in this directory" + ) + cat.add_argument("--bucket", help="override the S3 bucket") + + hdb = sub.add_parser( + "harvest-db", + help="read a bot's cited URLs from the platform Postgres database (operator)", + ) + grp = hdb.add_mutually_exclusive_group(required=True) + grp.add_argument("--post", help="harvest one post id") + grp.add_argument("--days", type=int, help="harvest the most recent N days") + hdb.add_argument( + "--limit", + type=int, + default=None, + help="cap rows when using --days (default: uncapped — a daily sweep wants all)", + ) + hdb.add_argument( + "--public-only", + action="store_true", + help="read only public comments (default: read all of a bot's comments)", + ) + hdb.add_argument( + "--dsn", + help="libpq DSN or postgresql:// URL. Default resolution: --dsn > " + "$METACULUS_DB_DSN > macOS Keychain item 'metaculus-db-dsn' > " + "dbname=metaculus. 
Prefer the Keychain for the real secret " + "(a --dsn value lands in shell history).", + ) + hdb.add_argument("--out", metavar="FILE", help="write the manifest to this .jsonl") + hdb.add_argument("--run-id", help="run id (default derived from the slice)") + hdb.add_argument( + "--dedupe", action="store_true", help="keep one record per URL (first seen)" + ) + hdb.add_argument( + "--upload", action="store_true", help="upload the manifest to S3 manifests/" + ) + hdb.add_argument("--bucket", help="override the S3 bucket") + + cov = sub.add_parser( + "coverage", + help="report what %% of cited sources were archived (trace vs comments)", + ) + cov.add_argument( + "--mode", + choices=["trace", "comments", "both"], + default="both", + help="which report(s) to print (default: both)", + ) + cov.add_argument( + "--csv", metavar="PREFIX", help="write PREFIX_.csv (+ _missing.txt)" + ) + cov.add_argument("--local", metavar="DIR", help="read from this directory") + cov.add_argument("--bucket", help="override the S3 bucket") args = parser.parse_args(argv) config = ArchiveConfig.from_env() @@ -169,8 +387,14 @@ def main(argv: list[str] | None = None) -> int: return _cmd_check(config) if args.command == "capture": return _cmd_capture(args, config) - if args.command == "harvest": - return _cmd_harvest(args, config) + if args.command == "ingest-traces": + return _cmd_ingest_traces(args, config) + if args.command == "harvest-db": + return _cmd_harvest_db(args, config) + if args.command == "catalog": + return _cmd_catalog(args, config) + if args.command == "coverage": + return _cmd_coverage(args, config) return 1 diff --git a/forecasting_tools/agents_and_tools/source_archive/config.py b/forecasting_tools/agents_and_tools/source_archive/config.py index 2572ffc4..cfb643ef 100644 --- a/forecasting_tools/agents_and_tools/source_archive/config.py +++ b/forecasting_tools/agents_and_tools/source_archive/config.py @@ -19,17 +19,41 @@ def _get_int(name: str, default: int) -> int: return int(raw) 
+def _get_bool(name: str, default: bool) -> bool: + raw = os.environ.get(name) + if raw is None or raw == "": + return default + return raw.strip().lower() in ("1", "true", "yes", "on") + + class ArchiveConfig(BaseModel): """Runtime configuration. Construct directly in tests, or ``from_env()``.""" s3_bucket: str | None = None s3_prefix: str = "source-archive" + # Local archive directory. When set, the viewer reads captures from here + # instead of S3 — handy for inspecting a local capture run with no AWS. + local_dir: str | None = None aws_profile: str | None = None aws_region: str | None = None firecrawl_api_key: str | None = None + # Firecrawl proxy mode for the anti-bot path: "basic" (1 credit) | "auto" + # (1 credit, escalates to 5 on fallback) | "stealth"/"enhanced" (5 credits). + # Only the fallback Firecrawl tier pays this; basic is the default. + firecrawl_proxy: str = "basic" + hyperbrowser_api_key: str | None = None + # Hyperbrowser session knobs for the anti-bot path. Proxy turns a 1-credit + # scrape into a 10-credit one, so leave it on only for the Cloudflare tier. + hyperbrowser_use_proxy: bool = True + hyperbrowser_use_stealth: bool = True + hyperbrowser_solve_captchas: bool = True + # CloakBrowser exposes ``cloakbrowser.launch() -> Browser``; the module name + # is overridable in case the package is renamed. 
+ cloakbrowser_import: str = "cloakbrowser" + pdf_max_pages: int = 50 # cap PDF parsing so a huge report can't blow latency/cost ttl_days: int = 14 screenshot_format: str = "webp" # webp | jpeg | png - screenshot_max_height: int = 4000 # px; cap full-page captures + screenshot_max_height: int = 16_000 # px; safety cap (under WebP's 16383 limit) nav_timeout_ms: int = 30_000 concurrency: int = 5 @@ -38,13 +62,27 @@ def from_env(cls) -> "ArchiveConfig": return cls( s3_bucket=os.environ.get("WEB_ARCHIVE_S3_BUCKET"), s3_prefix=os.environ.get("WEB_ARCHIVE_S3_PREFIX", "source-archive"), + local_dir=os.environ.get("WEB_ARCHIVE_LOCAL_DIR"), aws_profile=os.environ.get("WEB_ARCHIVE_AWS_PROFILE"), aws_region=os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION"), firecrawl_api_key=os.environ.get("FIRECRAWL_API_KEY"), + firecrawl_proxy=os.environ.get("WEB_ARCHIVE_FIRECRAWL_PROXY", "basic"), + hyperbrowser_api_key=os.environ.get("HYPERBROWSER_API_KEY"), + hyperbrowser_use_proxy=_get_bool("WEB_ARCHIVE_HYPERBROWSER_PROXY", True), + hyperbrowser_use_stealth=_get_bool( + "WEB_ARCHIVE_HYPERBROWSER_STEALTH", True + ), + hyperbrowser_solve_captchas=_get_bool( + "WEB_ARCHIVE_HYPERBROWSER_CAPTCHA", True + ), + cloakbrowser_import=os.environ.get( + "WEB_ARCHIVE_CLOAKBROWSER_IMPORT", "cloakbrowser" + ), + pdf_max_pages=_get_int("WEB_ARCHIVE_PDF_MAX_PAGES", 50), ttl_days=_get_int("WEB_ARCHIVE_TTL_DAYS", 14), screenshot_format=os.environ.get("WEB_ARCHIVE_SCREENSHOT_FORMAT", "webp"), - screenshot_max_height=_get_int("WEB_ARCHIVE_SCREENSHOT_MAX_HEIGHT", 4000), + screenshot_max_height=_get_int("WEB_ARCHIVE_SCREENSHOT_MAX_HEIGHT", 16_000), nav_timeout_ms=_get_int("WEB_ARCHIVE_NAV_TIMEOUT_MS", 30_000), concurrency=_get_int("WEB_ARCHIVE_CONCURRENCY", 5), ) diff --git a/forecasting_tools/agents_and_tools/source_archive/content_store.py b/forecasting_tools/agents_and_tools/source_archive/content_store.py index 7481ab93..800ead70 100644 --- 
a/forecasting_tools/agents_and_tools/source_archive/content_store.py +++ b/forecasting_tools/agents_and_tools/source_archive/content_store.py @@ -12,17 +12,33 @@ already stored, skip the write (dedup identical re-fetches) and just refresh timestamps. +**Redirect aliasing.** A capture is keyed by its *final* URL (after redirects), +so a link shortener (``bit.ly/x``) and the page it resolves to collapse onto one +capture instead of two. The original cited URL gets a tiny **alias index** that +points at the final URL's index, and the final URL's index lists its aliases for +provenance. So ``lookup(bit.ly/x)`` and ``lookup(final)`` both hit the same +stored page, and we never store the destination twice. + +**Cross-URL content dedup.** Different URLs that return byte-identical content +share the blobs rather than storing them three times each. The first URL to +store a given content owns the blobs; later URLs get a capture whose blob keys +point back at them and whose ``content_alias_of`` names the owner. A reverse +``index/by-content/.json`` tracks the owner and every member URL. + Object layout (under ``config.s3_prefix``):: - index/.json per-URL index + capture history - content//.html - content//. - content//.md + index/.json canonical: capture history + "aliases" + index/.json alias: {"alias_of": } + index/by-content/.json reverse: owner + member urls + content//.html + content//. + content//.md """ from __future__ import annotations import json +import threading from datetime import datetime, timedelta, timezone from pydantic import BaseModel @@ -58,6 +74,11 @@ def __init__(self, blob_store: BlobStore, config: ArchiveConfig | None = None): self.blobs = blob_store self.config = config or ArchiveConfig() self.prefix = self.config.s3_prefix.rstrip("/") + # Serializes the shared by-content reverse index across capture threads + # (concurrent runs). 
Per-URL index files are written by a single thread + # each, so they don't need it; the by-content index can be contended when + # different URLs return identical content. + self._content_lock = threading.Lock() # --- key helpers ------------------------------------------------------- def _index_key(self, uh: str) -> str: @@ -66,6 +87,9 @@ def _index_key(self, uh: str) -> str: def _content_key(self, uh: str, ch: str, ext: str) -> str: return f"{self.prefix}/content/{uh}/{ch}.{ext}" + def _content_index_key(self, ch: str) -> str: + return f"{self.prefix}/index/by-content/{ch}.json" + # --- index io ---------------------------------------------------------- def _read_index(self, uh: str) -> dict | None: key = self._index_key(uh) @@ -77,16 +101,61 @@ def _write_index(self, uh: str, index: dict) -> None: data = json.dumps(index, indent=2, sort_keys=True).encode("utf-8") self.blobs.put(self._index_key(uh), data, content_type="application/json") + def _read_content_index(self, ch: str) -> dict | None: + key = self._content_index_key(ch) + if not self.blobs.exists(key): + return None + try: + return json.loads(self.blobs.get(key).decode("utf-8")) + except (json.JSONDecodeError, UnicodeDecodeError): + # A concurrent writer may have left a partial local file mid-write; + # treat as absent rather than crash. The locked path below is authoritative. + return None + + def _register_content( + self, ch: str, uh: str, url: str, blob_keys: dict | None + ) -> None: + """Record that ``uh`` produced content ``ch`` in the reverse index. + + The first URL to store a given content becomes its ``canonical_url_hash`` + and owns the blob keys; later URLs with identical content are added as + ``members`` and reuse those blobs (see :meth:`store`). Locked so concurrent + capture threads with identical content don't clobber each other's members. 
+ """ + with self._content_lock: + reverse = self._read_content_index(ch) + if reverse is None: + reverse = { + "content_hash": ch, + "canonical_url_hash": uh, + "blob_keys": blob_keys or {}, + "members": [], + } + members = reverse.setdefault("members", []) + if not any(m.get("url_hash") == uh for m in members): + members.append({"url_hash": uh, "url": url}) + data = json.dumps(reverse, indent=2, sort_keys=True).encode("utf-8") + self.blobs.put( + self._content_index_key(ch), data, content_type="application/json" + ) + # --- public api -------------------------------------------------------- def lookup(self, url: str) -> StoredCapture | None: """Return the latest stored capture if within the TTL, else ``None``. - A non-``None`` return means callers can skip fetching this URL. + A non-``None`` return means callers can skip fetching this URL. If ``url`` + is an alias of a previously-redirected target, the alias is followed to + the canonical capture. """ uh = url_hash(url) index = self._read_index(uh) if not index: return None + alias_of = index.get("alias_of") + if alias_of: # follow the alias to the canonical (final-URL) index + index = self._read_index(alias_of) + if not index: + return None latest_ch = index.get("latest_content_hash") captures = index.get("captures", {}) latest = captures.get(latest_ch) @@ -100,13 +169,20 @@ def lookup(self, url: str) -> StoredCapture | None: return StoredCapture.model_validate(latest) def store(self, result: CaptureResult) -> StoreResult: - """Persist a capture, deduping by content hash. Always updates the index.""" - uh = url_hash(result.url) + """Persist a capture, deduping by content hash. Always updates the index. + + The capture is keyed by the *final* URL (after redirects). If the cited + URL differs from the final one, an alias index is written so the cited + URL still resolves here, and the cited URL is recorded under the + canonical index's ``aliases``. 
+ """ + final_url = result.final_url or result.url + uh = url_hash(final_url) ch = result.content_hash now = utcnow_iso() index = self._read_index(uh) or { - "url": result.url, + "url": final_url, "url_hash": uh, "first_seen": now, "captures": {}, @@ -116,33 +192,48 @@ def store(self, result: CaptureResult) -> StoreResult: created = existing is None if existing is not None: - # Identical content already stored — skip blob writes, refresh time. + # Identical content already stored for THIS url — skip writes, touch. existing["last_seen"] = now stored = StoredCapture.model_validate(existing) else: - html_key = screenshot_key = markdown_key = None - if result.html is not None: - html_key = self._content_key(uh, ch, "html") - self.blobs.put( - html_key, result.html.encode("utf-8"), content_type="text/html" - ) - if result.markdown is not None: - markdown_key = self._content_key(uh, ch, "md") - self.blobs.put( - markdown_key, - result.markdown.encode("utf-8"), - content_type="text/markdown", - ) - if result.screenshot is not None: - ext = _IMG_EXT.get(result.screenshot_content_type or "", "png") - screenshot_key = self._content_key(uh, ch, ext) - self.blobs.put( - screenshot_key, - result.screenshot, - content_type=result.screenshot_content_type, - ) + reverse = self._read_content_index(ch) + reuse = bool( + reverse and reverse.get("canonical_url_hash") not in (None, uh) + ) + if reuse: + # Byte-identical content already stored under a DIFFERENT url — + # point at its blobs instead of writing three more (cross-URL + # content dedup); each url still keeps its own index history. 
+ bk = reverse.get("blob_keys", {}) + html_key = bk.get("html") + markdown_key = bk.get("markdown") + screenshot_key = bk.get("screenshot") + content_alias_of = reverse["canonical_url_hash"] + else: + html_key = screenshot_key = markdown_key = None + if result.html is not None: + html_key = self._content_key(uh, ch, "html") + self.blobs.put( + html_key, result.html.encode("utf-8"), content_type="text/html" + ) + if result.markdown is not None: + markdown_key = self._content_key(uh, ch, "md") + self.blobs.put( + markdown_key, + result.markdown.encode("utf-8"), + content_type="text/markdown", + ) + if result.screenshot is not None: + ext = _IMG_EXT.get(result.screenshot_content_type or "", "png") + screenshot_key = self._content_key(uh, ch, ext) + self.blobs.put( + screenshot_key, + result.screenshot, + content_type=result.screenshot_content_type, + ) + content_alias_of = None stored = StoredCapture( - url=result.url, + url=final_url, url_hash=uh, content_hash=ch, status_code=result.status_code, @@ -151,12 +242,62 @@ def store(self, result: CaptureResult) -> StoreResult: html_key=html_key, screenshot_key=screenshot_key, markdown_key=markdown_key, + content_alias_of=content_alias_of, first_seen=now, last_seen=now, ) captures[ch] = stored.model_dump() + self._register_content( + ch, + uh, + final_url, + blob_keys=( + None + if reuse + else { + "html": html_key, + "markdown": markdown_key, + "screenshot": screenshot_key, + } + ), + ) index["latest_content_hash"] = ch index["last_checked"] = now + + # If the cited URL redirected to a different final URL, record the alias. 
+ orig_uh = url_hash(result.url) + if orig_uh != uh: + aliases = index.setdefault("aliases", []) + if result.url not in aliases: + aliases.append(result.url) + self._write_index(uh, index) + + if orig_uh != uh: + self._write_alias(orig_uh, result.url, uh, now) + return StoreResult(capture=stored, created=created) + + def _write_alias( + self, orig_uh: str, orig_url: str, final_uh: str, now: str + ) -> None: + """Write/refresh a pointer from a cited URL's hash to its final capture. + + Never clobbers a canonical index (one that already holds captures), so a + URL fetched directly in the past keeps its own history. + """ + existing = self._read_index(orig_uh) + if existing and existing.get("captures"): + return + first_seen = existing.get("first_seen", now) if existing else now + self._write_index( + orig_uh, + { + "url": orig_url, + "url_hash": orig_uh, + "alias_of": final_uh, + "first_seen": first_seen, + "last_checked": now, + }, + ) diff --git a/forecasting_tools/agents_and_tools/source_archive/coverage.py b/forecasting_tools/agents_and_tools/source_archive/coverage.py new file mode 100644 index 00000000..9b812e9b --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/coverage.py @@ -0,0 +1,237 @@ +"""Coverage reports: what fraction of cited sources did we actually archive? + +Two **separate** reports, split by ingestion path — they have different +denominators and different notions of ground truth, so they must not be blurred: + +- ``trace`` — the complex/template bot's own instrumented runs (metac-ai-sdk). + Traces record *every* URL the bot touched, so this is a true archival + success-rate against ground truth. +- ``comments`` — every bot (Metaculus's own + outside bots) harvested from public + Metaculus comments. Comments are length-truncated, so the denominator is itself + incomplete: coverage here means "of the links we could *see* in comments, how + many we archived" — a weaker claim than the trace report. 
+ +For each mode: denominator = distinct canonical **page** sources cited (tool/API +calls excluded, same rule as the catalog); numerator = those with a successful +capture in the index. Misses are bucketed by site — the per-URL failure *reason* +isn't persisted yet (that needs each run's pipeline outcomes saved), so we can +say *which* sites we miss, not yet *why*. +""" + +from __future__ import annotations + +import csv +import io +from collections import defaultdict + +from pydantic import BaseModel + +from forecasting_tools.agents_and_tools.source_archive.catalog import ( + Source, + build_sources, + exclusion_reason, +) +from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig +from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import ( + BlobStore, +) + +MODES = ("trace", "comments") +_COMMENT_ORIGINS = {"metaculus_comment"} + + +def citation_mode(citation) -> str: + return "comments" if (citation.origin or "") in _COMMENT_ORIGINS else "trace" + + +class CoverageRow(BaseModel): + label: str + cited: int = 0 + captured: int = 0 + + @property + def pct(self) -> float: + return round(100 * self.captured / self.cited, 1) if self.cited else 0.0 + + +class CoverageReport(BaseModel): + mode: str + cited: int = 0 + captured: int = 0 + excluded: dict[str, int] = {} # non-source reason -> count + by_question: list[CoverageRow] = [] + by_bot: list[CoverageRow] = [] + by_tool: list[CoverageRow] = [] + missed_by_domain: list[CoverageRow] = [] + missing_urls: list[str] = [] + # Populated only when per-run outcomes (reports/) exist: + has_outcomes: bool = False + missing_never_fetched: int = 0 # the real collection gap + missing_fetch_failed: int = 0 # tried, failed (Cloudflare/PDF/…) + + @property + def pct(self) -> float: + return round(100 * self.captured / self.cited, 1) if self.cited else 0.0 + + @property + def missing(self) -> int: + return self.cited - self.captured + + def __str__(self) -> str: + title = { + "trace": 
"Trace coverage — complex/template bot (ground truth)", + "comments": "Comment coverage — all bots (truncated denominator)", + }.get(self.mode, self.mode) + excl = ( + " (excluded as non-sources: " + + ", ".join(f"{k} {v}" for k, v in sorted(self.excluded.items())) + + ")" + if self.excluded + else "" + ) + lines = [ + title, + "=" * len(title), + # Lead with the collection gap: this report exists to tell us whether + # there are sources bots are using that we are NOT yet archiving. + f"{self.missing} of {self.cited} cited page sources are NOT yet in the " + f"archive — candidates to collect ({self.captured} archived, " + f"{self.pct}%).", + excl, + ] + if self.has_outcomes: + lines.append( + f" of those {self.missing}: {self.missing_never_fetched} were " + f"never fetched (collection gap), {self.missing_fetch_failed} " + f"were fetched but failed." + ) + if self.mode == "comments": + lines.append( + " note: comments are length-truncated, so even this denominator " + "under-counts what bots actually read — the true gap is larger." 
+ ) + + def table(header: str, rows: list[CoverageRow], n: int = 8) -> None: + if not rows: + return + lines.append("") + lines.append(f"--- {header} ---") + for r in rows[:n]: + lines.append(f" {r.captured:>4}/{r.cited:<4} {r.pct:>5}% {r.label}") + if len(rows) > n: + lines.append(f" … +{len(rows) - n} more") + + table("by question (most-cited first)", self.by_question) + table("by bot", self.by_bot) + if self.mode == "trace": + table("by tool", self.by_tool) + table("biggest collection gaps by site (captured/cited)", self.missed_by_domain) + if self.missing_urls: + lines.append("") + lines.append( + f"--- {len(self.missing_urls)} source(s) to collect (first 10) ---" + ) + for u in self.missing_urls[:10]: + lines.append(f" {u}") + return "\n".join(lines) + + def to_csv(self) -> str: + buf = io.StringIO() + w = csv.writer(buf) + w.writerow(["group", "label", "cited", "captured", "pct"]) + w.writerow(["overall", self.mode, self.cited, self.captured, self.pct]) + for group, rows in ( + ("question", self.by_question), + ("bot", self.by_bot), + ("tool", self.by_tool), + ("missed_domain", self.missed_by_domain), + ): + for r in rows: + w.writerow([group, r.label, r.cited, r.captured, r.pct]) + return buf.getvalue() + + +def _grouped(scoped: list[tuple[Source, list]], key_of) -> list[CoverageRow]: + agg: dict[str, list[int]] = defaultdict(lambda: [0, 0]) + for source, cits in scoped: + keys = {k for k in (key_of(c) for c in cits) if k} or {"(none)"} + for k in keys: + agg[k][0] += 1 + if source.captured: + agg[k][1] += 1 + rows = [CoverageRow(label=k, cited=v[0], captured=v[1]) for k, v in agg.items()] + return sorted(rows, key=lambda r: (-r.cited, r.label)) + + +def coverage_from_sources( + sources: list[Source], mode: str, outcomes: dict[str, str] | None = None +) -> CoverageReport: + scoped: list[tuple[Source, list]] = [] + excluded: dict[str, int] = defaultdict(int) + for s in sources: + cits = [c for c in s.citations if citation_mode(c) == mode] + if not cits: + 
continue + reason = exclusion_reason(s.canonical_url, cits) + if reason: + excluded[reason] += 1 + continue + scoped.append((s, cits)) + + captured = sum(1 for s, _ in scoped if s.captured) + + never_fetched = failed = 0 + if outcomes is not None: + from forecasting_tools.agents_and_tools.source_archive.reports import ( + FAILED_STATUSES, + ) + + for s, _ in scoped: + if s.captured: + continue + status = outcomes.get(s.canonical_url) + if status is None: + never_fetched += 1 + elif status in FAILED_STATUSES: + failed += 1 + else: + failed += 1 + + domain_agg: dict[str, list[int]] = defaultdict(lambda: [0, 0]) + for s, _ in scoped: + domain_agg[s.domain][0] += 1 + if s.captured: + domain_agg[s.domain][1] += 1 + missed_by_domain = sorted( + ( + CoverageRow(label=d, cited=c, captured=cap) + for d, (c, cap) in domain_agg.items() + if cap < c + ), + key=lambda r: (-(r.cited - r.captured), r.label), + ) + + return CoverageReport( + mode=mode, + cited=len(scoped), + captured=captured, + excluded=dict(excluded), + by_question=_grouped(scoped, lambda c: c.question_id), + by_bot=_grouped(scoped, lambda c: c.bot), + by_tool=_grouped(scoped, lambda c: c.tool_name), + missed_by_domain=missed_by_domain, + missing_urls=sorted(s.canonical_url for s, _ in scoped if not s.captured), + has_outcomes=outcomes is not None, + missing_never_fetched=never_fetched, + missing_fetch_failed=failed, + ) + + +def build_coverage( + store: BlobStore, config: ArchiveConfig, mode: str +) -> CoverageReport: + from forecasting_tools.agents_and_tools.source_archive.reports import read_outcomes + + sources = build_sources(store, config) + outcomes = read_outcomes(store, config) or None + return coverage_from_sources(sources, mode, outcomes) diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py index 758aa87e..b136ea66 100644 --- a/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py 
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py @@ -1,19 +1,32 @@ """Fetchers turn a URL into a CaptureResult (HTML + screenshot + markdown). Most callers want :func:`build_default_fetcher`, which wires the recommended -tiered setup: self-hosted Playwright primary, Firecrawl fallback. +cost-ordered tiered setup: self-hosted Playwright primary, then CloakBrowser, +PDF, Hyperbrowser, and Firecrawl backups. """ from __future__ import annotations +import logging + from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig from forecasting_tools.agents_and_tools.source_archive.fetchers.base import ( Fetcher, FetchError, ) +from forecasting_tools.agents_and_tools.source_archive.fetchers.cloakbrowser_fetcher import ( + CloakBrowserFetcher, +) from forecasting_tools.agents_and_tools.source_archive.fetchers.firecrawl_fetcher import ( FirecrawlFetcher, ) +from forecasting_tools.agents_and_tools.source_archive.fetchers.hyperbrowser_fetcher import ( + HyperbrowserFetcher, +) +from forecasting_tools.agents_and_tools.source_archive.fetchers.pdf_fetcher import ( + PdfFetcher, + looks_like_pdf, +) from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import ( PlaywrightFetcher, ) @@ -21,12 +34,18 @@ TieredFetcher, ) +logger = logging.getLogger(__name__) + __all__ = [ "Fetcher", "FetchError", + "CloakBrowserFetcher", "FirecrawlFetcher", + "HyperbrowserFetcher", + "PdfFetcher", "PlaywrightFetcher", "TieredFetcher", + "looks_like_pdf", "build_default_fetcher", ] @@ -39,29 +58,64 @@ def build_default_fetcher(config: ArchiveConfig | None = None) -> PlaywrightFetc with build_default_fetcher(config) as fetcher: fetcher.fetch(url) - Playwright runs first; if a page fails to render or trips the quality gate - and a Firecrawl API key is configured, Firecrawl is tried as a fallback. - - The returned object is a :class:`PlaywrightFetcher` so the browser lifecycle - is managed by ``with``. 
On ``__enter__`` it transparently composes itself - with Firecrawl (when available) behind a :class:`TieredFetcher`. + Backends are tried in **cost order** — the first capture that passes the + quality gate wins, so the cheap tiers absorb most of the tail and the paid + ones only fire on what survives them: + + 1. **Self-hosted browser** (~free) — the primary; ~70% of URLs. Uses + **CloakBrowser** (patched Chromium; matches-or-beats vanilla Playwright on + anti-bot) when installed, else falls back to vanilla **Playwright**. Only + one browser is used: two live ``sync_playwright`` instances conflict in a + single process, so cloak *replaces* vanilla rather than stacking with it. + 2. **PdfFetcher** (local, free; Firecrawl OCR fallback) — captures PDFs, + which the browsers can't render. + 3. **Hyperbrowser** (managed) — consolidated anti-bot fallback. Added when + ``HYPERBROWSER_API_KEY`` is set. + 4. **Firecrawl** (managed) — cheapest stealth + native-PDF safety net. Added + when ``FIRECRAWL_API_KEY`` is set. + + The returned object is a :class:`PlaywrightFetcher` subclass so the single + browser's lifecycle is managed by ``with``. """ config = config or ArchiveConfig() return _ManagedTieredFetcher(config) class _ManagedTieredFetcher(PlaywrightFetcher): - """PlaywrightFetcher whose ``fetch`` is delegated to a tiered pipeline. - - Subclassing PlaywrightFetcher keeps the browser context-manager lifecycle - while letting us add the Firecrawl fallback once the browser is live. + """PlaywrightFetcher whose ``fetch`` is delegated to a cost-ordered tiered + pipeline. The single self-hosted browser is CloakBrowser when available + (overriding ``_launch_browser``), else vanilla Playwright; the extra backends + are composed once it is live. """ + _primary_name = "playwright" + + def _launch_browser(self): + # Prefer CloakBrowser (patched Chromium, beats vanilla on anti-bot) as + # the one self-hosted browser. 
Two live sync_playwright instances in a + # process conflict, so cloak REPLACES vanilla here rather than stacking. + try: + browser = CloakBrowserFetcher(self.config)._launch_browser() + self._primary_name = "cloakbrowser" + return browser + except FetchError as e: + logger.info("cloakbrowser unavailable, using vanilla Playwright: %s", e) + self._primary_name = "playwright" + return super()._launch_browser() + def __enter__(self) -> "_ManagedTieredFetcher": - super().__enter__() - backends: list[Fetcher] = [_PlaywrightOnly(self)] + super().__enter__() # launches the chosen browser via _launch_browser + backends: list[Fetcher] = [_PrimaryBrowser(self, self._primary_name)] + + # PDFs: free local parse (Firecrawl OCR fallback wired internally when a + # key is present). Cheap to keep in the chain unconditionally. + backends.append(PdfFetcher(self.config)) + + if self.config.hyperbrowser_api_key: + backends.append(HyperbrowserFetcher(self.config)) if self.config.firecrawl_api_key: backends.append(FirecrawlFetcher(self.config)) + self._tiered = TieredFetcher(*backends) return self @@ -69,14 +123,16 @@ def fetch(self, url: str): # type: ignore[override] return self._tiered.fetch(url) -class _PlaywrightOnly: - """Adapts a live PlaywrightFetcher to the Fetcher protocol for tiering, - calling the un-overridden ``fetch`` so we don't recurse.""" - - name = "playwright" +class _PrimaryBrowser: + """Adapts the live primary browser to the Fetcher protocol for tiering, + calling the un-overridden ``fetch`` so we don't recurse, and labelling the + capture with the actual browser used (cloakbrowser/playwright).""" - def __init__(self, owner: PlaywrightFetcher): + def __init__(self, owner: PlaywrightFetcher, name: str): self._owner = owner + self.name = name def fetch(self, url: str): - return PlaywrightFetcher.fetch(self._owner, url) + result = PlaywrightFetcher.fetch(self._owner, url) + result.fetcher = self.name + return result diff --git 
a/forecasting_tools/agents_and_tools/source_archive/fetchers/cloakbrowser_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/cloakbrowser_fetcher.py new file mode 100644 index 00000000..d4164e70 --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/cloakbrowser_fetcher.py @@ -0,0 +1,62 @@ +"""CloakBrowser fetcher — a self-hosted anti-bot upgrade to Playwright. + +CloakBrowser (``CloakHQ/CloakBrowser``) is an open-source, patched-Chromium fork +whose ``cloakbrowser.launch()`` returns a standard Playwright ``Browser`` — so +this fetcher reuses *all* of ``PlaywrightFetcher``'s capture logic (settle, +autoscroll, full-page screenshot, trafilatura→markdown) and only overrides how +the browser is launched. The fork applies source-level fingerprint patches that +get past Cloudflare Turnstile and similar challenges that plain headless Chromium +trips; in the one rigorous 2026 anti-detect benchmark it cleared more Cloudflare +targets than vanilla Playwright. + +It runs on your own compute, so the marginal service cost is ~$0/page. The +patched Chromium binary (~200MB) is downloaded automatically on first launch. + +Install separately (it is not in the ``source-archive`` extra because of the +binary): ``pip install cloakbrowser``. The package module is configurable via +``config.cloakbrowser_import`` (default ``cloakbrowser``) in case it is renamed. 
+""" + +from __future__ import annotations + +import importlib +import logging + +from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError +from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import ( + PlaywrightFetcher, +) + +logger = logging.getLogger(__name__) + + +class CloakBrowserFetcher(PlaywrightFetcher): + name = "cloakbrowser" + + def _launch_browser(self): + module = self._import_module() + launch = getattr(module, "launch", None) + if launch is None: + raise FetchError( + f"{module.__name__} has no launch(); the CloakBrowser API may " + "have changed. Expected `cloakbrowser.launch() -> Browser`." + ) + # stealth_args=True applies the fingerprint patches; the returned object + # is a Playwright Browser, so the inherited fetch() drives it unchanged. + # No separate playwright handle to stop — CloakBrowser owns its driver. + browser = launch(headless=True, stealth_args=True) + return None, browser + + def _import_module(self): + candidates = [self.config.cloakbrowser_import, "cloakbrowser"] + tried: list[str] = [] + for mod_name in dict.fromkeys(c for c in candidates if c): + try: + return importlib.import_module(mod_name) + except ImportError: + tried.append(mod_name) + raise FetchError( + "cloakbrowser is not installed. Install it with " + "`pip install cloakbrowser` (or set WEB_ARCHIVE_CLOAKBROWSER_IMPORT " + f"to the right module). Tried: {', '.join(tried)}." + ) diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py index 22aa1a55..622d51ff 100644 --- a/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py +++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py @@ -1,8 +1,14 @@ -"""Firecrawl fetcher — the FALLBACK backend. +"""Firecrawl fetcher — a managed FALLBACK backend. 
-Reserved for sites that block headless Chromium. It costs ~1 credit/page even -with a screenshot, so it only runs when the primary backend fails or its capture -fails the quality gate. +Reserved for sites that block headless Chromium. A basic scrape costs 1 credit/ +page even with a screenshot, so it only runs when the primary backend fails or +its capture fails the quality gate. + +For hardened anti-bot sites, set ``config.firecrawl_proxy`` to ``"auto"`` or +``"stealth"`` (a.k.a. "enhanced") — this routes through residential proxies and +is billed at ~5 credits/page, so it is opt-in and reserved for the Cloudflare +tier. Firecrawl also natively parses PDFs to markdown (1 credit per PDF page), +which is why it is the fallback for the tiered ``PdfFetcher``. The Firecrawl SDK is optional and imported lazily. The screenshot comes back as a hosted URL, which we download to bytes. @@ -50,10 +56,24 @@ def _get_client(self): self._client = Firecrawl(api_key=self.config.firecrawl_api_key) return self._client + def _scrape_kwargs(self, formats: list[str]) -> dict: + kwargs: dict = {"formats": formats} + # Firecrawl 4.x renamed "stealth" to the "enhanced" proxy mode but still + # accepts the legacy spelling; pass whatever the operator configured and + # let the SDK normalize. "basic" is the implicit default, so only send + # the param when something stronger is requested (keeps the call 1-credit + # unless the operator explicitly opts into the 5-credit proxy). 
+ proxy = (self.config.firecrawl_proxy or "basic").strip().lower() + if proxy and proxy != "basic": + kwargs["proxy"] = proxy + return kwargs + def fetch(self, url: str) -> CaptureResult: client = self._get_client() try: - doc = client.scrape(url, formats=["markdown", "html", "screenshot"]) + doc = client.scrape( + url, **self._scrape_kwargs(["markdown", "html", "screenshot"]) + ) except Exception as e: raise FetchError(f"firecrawl scrape failed for {url}: {e}") from e @@ -75,9 +95,23 @@ def fetch(self, url: str) -> CaptureResult: screenshot=screenshot, screenshot_content_type=content_type, fetcher=self.name, - metadata={"title": _attr(metadata, "title")}, + metadata={ + "title": _attr(metadata, "title"), + "firecrawl_proxy": self.config.firecrawl_proxy, + }, ) + def fetch_pdf_markdown(self, url: str) -> str | None: + """Scrape just the markdown for a PDF URL via Firecrawl's native PDF + parser. Used as the fallback inside :class:`PdfFetcher` when local + extraction yields thin text (e.g. a scanned PDF needing OCR).""" + client = self._get_client() + try: + doc = client.scrape(url, **self._scrape_kwargs(["markdown"])) + except Exception as e: + raise FetchError(f"firecrawl pdf scrape failed for {url}: {e}") from e + return _attr(doc, "markdown") + @staticmethod def _download(src_url: str) -> tuple[bytes | None, str | None]: try: diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/hyperbrowser_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/hyperbrowser_fetcher.py new file mode 100644 index 00000000..ce728abd --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/hyperbrowser_fetcher.py @@ -0,0 +1,149 @@ +"""Hyperbrowser fetcher — a managed FALLBACK backend. 
+ +Hyperbrowser exposes a Firecrawl-style ``scrape`` endpoint that returns +markdown + HTML + a screenshot in one call, with optional stealth, residential +proxy, and CAPTCHA-solving session options for getting past Cloudflare and other +anti-bot filters. + +Why it's here even though Firecrawl already is: forecasting-tools already uses +Hyperbrowser elsewhere (``research/computer_use.py``), so routing the anti-bot +tail through it consolidates spend onto one vendor/bill. + +Cost note: a basic scrape is 1 credit ($0.001); enabling ``use_proxy`` makes it +10 credits ($0.01) plus proxy bandwidth ($10/GB). So the proxy/stealth session +is opt-in and meant for the small hardened-Cloudflare residual, not every URL. +Hyperbrowser has no documented PDF→markdown path, so PDFs go to the dedicated +``PdfFetcher`` instead of here. + +The SDK is optional and imported lazily; a screenshot may come back as a hosted +URL (downloaded to bytes) or inline base64. +""" + +from __future__ import annotations + +import base64 +import binascii +import logging +import urllib.request + +from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig +from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError +from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult + +logger = logging.getLogger(__name__) + + +def _attr(obj, key, default=None): + if obj is None: + return default + if isinstance(obj, dict): + return obj.get(key, default) + return getattr(obj, key, default) + + +class HyperbrowserFetcher: + name = "hyperbrowser" + + def __init__(self, config: ArchiveConfig | None = None, client=None): + self.config = config or ArchiveConfig() + self._client = client + + def _get_client(self): + if self._client is not None: + return self._client + if not self.config.hyperbrowser_api_key: + raise FetchError("HYPERBROWSER_API_KEY is not set") + try: + from hyperbrowser import Hyperbrowser + except ImportError as e: + raise 
FetchError( + "hyperbrowser is not installed. Install it with " + "`pip install forecasting-tools[source-archive]`." + ) from e + self._client = Hyperbrowser(api_key=self.config.hyperbrowser_api_key) + return self._client + + def _params(self, url: str): + """Build the SDK request objects. Imported here (not at module top) so + importing this module never requires the SDK.""" + from hyperbrowser.models import ( + CreateSessionParams, + ScrapeOptions, + StartScrapeJobParams, + ) + + return StartScrapeJobParams( + url=url, + scrape_options=ScrapeOptions( + formats=["markdown", "html", "screenshot"], + only_main_content=False, + ), + session_options=CreateSessionParams( + use_proxy=self.config.hyperbrowser_use_proxy, + use_stealth=self.config.hyperbrowser_use_stealth, + solve_captchas=self.config.hyperbrowser_solve_captchas, + ), + ) + + def fetch(self, url: str) -> CaptureResult: + client = self._get_client() + try: + resp = client.scrape.start_and_wait(self._params(url)) + except Exception as e: + raise FetchError(f"hyperbrowser scrape failed for {url}: {e}") from e + + # The job wrapper carries status/error; the payload is on ``.data``. 
+ if _attr(resp, "status") == "failed": + raise FetchError( + f"hyperbrowser scrape failed for {url}: {_attr(resp, 'error')}" + ) + data = _attr(resp, "data", resp) + + metadata = _attr(data, "metadata", {}) or {} + status = _attr(metadata, "statusCode") or _attr(metadata, "status_code") + final_url = _attr(metadata, "sourceURL") or _attr(metadata, "url") or url + + screenshot, content_type = self._coerce_screenshot(_attr(data, "screenshot")) + + return CaptureResult( + url=url, + final_url=final_url, + status_code=int(status) if status is not None else None, + html=_attr(data, "html"), + markdown=_attr(data, "markdown"), + screenshot=screenshot, + screenshot_content_type=content_type, + fetcher=self.name, + metadata={ + "title": _attr(metadata, "title"), + "used_proxy": self.config.hyperbrowser_use_proxy, + }, + ) + + @classmethod + def _coerce_screenshot(cls, value) -> tuple[bytes | None, str | None]: + """A screenshot may arrive as a hosted URL, a data: URI, or raw base64.""" + if not value or not isinstance(value, str): + return None, None + if value.startswith("http://") or value.startswith("https://"): + return cls._download(value) + if value.startswith("data:"): + try: + header, b64 = value.split(",", 1) + ctype = header[5:].split(";", 1)[0] or "image/png" + return base64.b64decode(b64), ctype + except (ValueError, binascii.Error): + return None, None + try: + return base64.b64decode(value, validate=True), "image/png" + except (binascii.Error, ValueError): + return None, None + + @staticmethod + def _download(src_url: str) -> tuple[bytes | None, str | None]: + try: + with urllib.request.urlopen(src_url, timeout=30) as resp: + return resp.read(), resp.headers.get("Content-Type", "image/png") + except Exception as e: + logger.warning("failed to download hyperbrowser screenshot: %s", e) + return None, None diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/pdf_fetcher.py 
b/forecasting_tools/agents_and_tools/source_archive/fetchers/pdf_fetcher.py new file mode 100644 index 00000000..0977605c --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/pdf_fetcher.py @@ -0,0 +1,146 @@ +"""PDF fetcher — closes the gap Playwright can't. + +Headless Chromium *downloads* a PDF instead of rendering it (``page.goto`` raises +"Download is starting"), and trafilatura doesn't parse PDFs, so a cited ``.pdf`` +URL produces nothing today. This fetcher fills that hole with a two-tier strategy: + + 1. Download the PDF bytes and parse locally with **PyMuPDF4LLM** — free, fast, + CPU-only, and excellent on text-layer PDFs (most government/legal reports). + The first page is rendered to an image so the viewer still has a screenshot. + 2. If local extraction yields thin text (a scanned PDF that needs OCR), fall + back to **Firecrawl's** native PDF parser (~1 credit/page, OCR included). + +Both parsers are optional and imported lazily. Use :func:`looks_like_pdf` / +:meth:`PdfFetcher.is_pdf` to decide whether a URL should be routed here. +""" + +from __future__ import annotations + +import logging +import urllib.request + +from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig +from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError +from forecasting_tools.agents_and_tools.source_archive.fetchers.firecrawl_fetcher import ( + FirecrawlFetcher, +) +from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult +from forecasting_tools.agents_and_tools.source_archive.quality import MIN_TEXT_LEN + +logger = logging.getLogger(__name__) + +_PDF_MAGIC = b"%PDF-" + + +def looks_like_pdf(url: str) -> bool: + """Cheap URL-shape heuristic: does this look like a PDF link? 
(The fetcher + still confirms by sniffing the magic bytes before parsing.)""" + path = url.split("?", 1)[0].split("#", 1)[0].lower() + return path.endswith(".pdf") + + +class PdfFetcher: + name = "pdf" + + def __init__( + self, + config: ArchiveConfig | None = None, + *, + firecrawl: FirecrawlFetcher | None = None, + downloader=None, + ): + self.config = config or ArchiveConfig() + # Reuse the configured Firecrawl client for the OCR fallback when a key + # is present; otherwise the fallback is simply skipped. + if firecrawl is not None: + self._firecrawl = firecrawl + elif self.config.firecrawl_api_key: + self._firecrawl = FirecrawlFetcher(self.config) + else: + self._firecrawl = None + self._download = downloader or _download_bytes + + def is_pdf(self, url: str, data: bytes | None = None) -> bool: + if data is not None: + return data[:5] == _PDF_MAGIC + return looks_like_pdf(url) + + def fetch(self, url: str) -> CaptureResult: + data, final_url, status = self._download(url, self.config.nav_timeout_ms) + if not data or data[:5] != _PDF_MAGIC: + raise FetchError(f"not a PDF (no %PDF- magic) for {url}") + + markdown, screenshot, ctype, pages, engine = self._parse_local(data) + + thin = not markdown or len(markdown.strip()) < MIN_TEXT_LEN + if thin and self._firecrawl is not None: + logger.info("local PDF parse thin for %s; trying Firecrawl OCR", url) + try: + fc_md = self._firecrawl.fetch_pdf_markdown(url) + except FetchError as e: + logger.info("firecrawl PDF fallback failed for %s: %s", url, e) + else: + if fc_md and len(fc_md.strip()) >= MIN_TEXT_LEN: + markdown, engine = fc_md, "firecrawl" + + return CaptureResult( + url=url, + final_url=final_url or url, + status_code=status, + html=None, + markdown=markdown, + screenshot=screenshot, + screenshot_content_type=ctype, + fetcher=self.name, + metadata={"pdf_engine": engine, "pdf_pages": pages}, + ) + + def _parse_local( + self, data: bytes + ) -> tuple[str | None, bytes | None, str | None, int, str]: + """Return 
(markdown, screenshot_png, content_type, pages, engine).""" + try: + import pymupdf # PyMuPDF (a.k.a. fitz) + import pymupdf4llm + except ImportError: + logger.warning( + "pymupdf4llm not installed; local PDF parsing unavailable. " + "Install with `pip install forecasting-tools[source-archive]`." + ) + return None, None, None, 0, "none" + + try: + doc = pymupdf.open(stream=data, filetype="pdf") + except Exception as e: + raise FetchError(f"could not open PDF: {e}") from e + + try: + total = doc.page_count + limit = min(total, self.config.pdf_max_pages) or total + markdown = pymupdf4llm.to_markdown(doc, pages=list(range(limit))) + screenshot, ctype = self._render_first_page(doc) + return markdown or None, screenshot, ctype, total, "pymupdf4llm" + finally: + doc.close() + + @staticmethod + def _render_first_page(doc) -> tuple[bytes | None, str | None]: + try: + pix = doc[0].get_pixmap(dpi=110) + return pix.tobytes("png"), "image/png" + except Exception as e: + logger.info("could not render PDF first page: %s", e) + return None, None + + +def _download_bytes( + url: str, timeout_ms: int +) -> tuple[bytes | None, str | None, int | None]: + # A browser-ish UA avoids the cheapest 403s; the content store needs the + # bytes, not a render, so plain HTTP is fine and free. 
+ req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"}) + try: + with urllib.request.urlopen(req, timeout=max(timeout_ms / 1000, 1)) as resp: + return resp.read(), resp.geturl(), getattr(resp, "status", 200) + except Exception as e: + raise FetchError(f"could not download PDF for {url}: {e}") from e diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py index ee9900b7..efba5575 100644 --- a/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py +++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py @@ -27,6 +27,13 @@ logger = logging.getLogger(__name__) +# WebP's hard per-side pixel limit; taller captures must be cropped before encode. +_WEBP_MAX_DIM = 16383 +# Above this total pixel count, skip the screenshot rather than decode it: a +# pathological full-page render (very tall × wide) costs minutes of CPU in Pillow +# for a screenshot that's nice-to-have, not essential. +_MAX_SCREENSHOT_PIXELS = 200_000_000 + def _to_markdown(html: str, url: str) -> str | None: try: @@ -39,21 +46,57 @@ def _to_markdown(html: str, url: str) -> str | None: ) -def _encode_screenshot(png_bytes: bytes, fmt: str) -> tuple[bytes, str]: - """Re-encode a PNG screenshot to the requested format using Pillow. +# Scroll the document top-to-bottom (triggering lazy-loaded content) then back +# up, so a subsequent full-page screenshot captures the fully-rendered page. 
+_AUTOSCROLL_JS = """ +async () => { + await new Promise((resolve) => { + let y = 0; + const step = () => { + window.scrollTo(0, y); + y += 1000; + if (y < document.body.scrollHeight) setTimeout(step, 40); + else resolve(); + }; + step(); + }); + window.scrollTo(0, 0); +} +""" + + +def _encode_screenshot( + png_bytes: bytes, fmt: str, max_height: int = 0 +) -> tuple[bytes, str]: + """Crop (to ``max_height``) and re-encode a PNG screenshot using Pillow. Pillow is already a forecasting-tools dependency, so true WebP is available - here (Playwright itself only emits PNG/JPEG). + here (Playwright itself only emits PNG/JPEG). The height cap is enforced by + cropping the *full-page* render to its top ``max_height`` pixels — never via + Playwright's ``clip`` (which, without ``full_page``, is bounded by the + viewport and silently truncates tall pages to a single screen). """ fmt = fmt.lower() - if fmt == "png": - return png_bytes, "image/png" try: from PIL import Image except ImportError: + # No Pillow: can't crop or transcode; hand back the raw full-page PNG. return png_bytes, "image/png" - image = Image.open(io.BytesIO(png_bytes)) + image = Image.open(io.BytesIO(png_bytes)) # lazy: reads size, doesn't decode + if image.width * image.height > _MAX_SCREENSHOT_PIXELS: + raise ValueError( + f"screenshot too large to encode ({image.width}x{image.height}px)" + ) + # WebP cannot encode beyond 16383px on a side. Clamp the effective cap for + # webp so an over-tall page degrades to a top-crop instead of crashing the + # encoder mid-run (which would propagate out of fetch() and abort the URL). 
+ limit = max_height or 0 + if fmt == "webp": + limit = min(limit or _WEBP_MAX_DIM, _WEBP_MAX_DIM) + if limit and image.height > limit: + image = image.crop((0, 0, image.width, limit)) + out = io.BytesIO() if fmt == "webp": image.save(out, format="WEBP", quality=80, method=6) @@ -61,7 +104,8 @@ def _encode_screenshot(png_bytes: bytes, fmt: str) -> tuple[bytes, str]: if fmt in ("jpeg", "jpg"): image.convert("RGB").save(out, format="JPEG", quality=80, optimize=True) return out.getvalue(), "image/jpeg" - return png_bytes, "image/png" + image.save(out, format="PNG", optimize=True) + return out.getvalue(), "image/png" class PlaywrightFetcher: @@ -82,7 +126,12 @@ def __init__(self, config: ArchiveConfig | None = None): self._playwright = None self._browser = None - def __enter__(self) -> "PlaywrightFetcher": + def _launch_browser(self): + """Start the browser. Returns ``(playwright_or_none, browser)`` where + ``browser`` is a Playwright ``Browser``. Subclasses override this to swap + in a different stealth browser (see ``CloakBrowserFetcher``) while reusing + all of the capture logic. A backend that manages its own driver returns + ``None`` for the first element.""" try: from playwright.sync_api import sync_playwright except ImportError as e: @@ -91,8 +140,12 @@ def __enter__(self) -> "PlaywrightFetcher": "`pip install forecasting-tools[source-archive]` and then run " "`playwright install chromium`." 
) from e - self._playwright = sync_playwright().start() - self._browser = self._playwright.chromium.launch(headless=True) + playwright = sync_playwright().start() + browser = playwright.chromium.launch(headless=True) + return playwright, browser + + def __enter__(self) -> "PlaywrightFetcher": + self._playwright, self._browser = self._launch_browser() return self def __exit__(self, *exc) -> None: @@ -103,6 +156,32 @@ def __exit__(self, *exc) -> None: self._playwright.stop() self._playwright = None + def _settle(self, page) -> None: + """Best-effort: let the page finish rendering before the screenshot. + + ``page.goto`` only waits for ``domcontentloaded``, which fires before + CSS/images/lazy content have laid out — capturing then yields a short, + half-built page. Wait for the load/network to quiesce and scroll the + document to force lazy content in, so the full-page capture is complete. + Each step is bounded and swallows timeouts: rendering aids are + nice-to-have, never fatal to the capture. 
+ """ + try: + page.wait_for_load_state("load", timeout=self.config.nav_timeout_ms) + except Exception: + pass + try: + page.wait_for_load_state( + "networkidle", timeout=min(self.config.nav_timeout_ms, 10_000) + ) + except Exception: + pass + try: + page.evaluate(_AUTOSCROLL_JS) + page.wait_for_timeout(500) + except Exception: + pass + def fetch(self, url: str) -> CaptureResult: if self._browser is None: raise FetchError("PlaywrightFetcher must be used as a context manager") @@ -119,26 +198,33 @@ def fetch(self, url: str) -> CaptureResult: except Exception as e: raise FetchError(f"navigation failed for {url}: {e}") from e + self._settle(page) + status = response.status if response is not None else None html = page.content() - shot_kwargs: dict = {"type": "png"} - cap = self.config.screenshot_max_height - dims = page.evaluate( - "() => ({w: document.documentElement.scrollWidth," - " h: document.documentElement.scrollHeight})" - ) - width = max(int(dims.get("w") or 0), 1) - height = int(dims.get("h") or 0) - if cap and height > cap: - shot_kwargs["clip"] = {"x": 0, "y": 0, "width": width, "height": cap} - else: - shot_kwargs["full_page"] = True - - png = page.screenshot(**shot_kwargs) - screenshot, content_type = _encode_screenshot( - png, self.config.screenshot_format - ) + # Always capture the entire scrollable page in one shot — Playwright + # stitches it internally. The height cap is applied afterward by + # cropping in Pillow (see ``_encode_screenshot``). Fall back to a + # viewport capture only if a full-page shot fails (e.g. a page taller + # than Chromium's screenshot limit). + try: + png = page.screenshot(full_page=True) + except Exception as e: + logger.info("full-page screenshot failed for %s: %s", url, e) + png = page.screenshot() + # Encoding can fail on pathological pages (e.g. a 400M-pixel full-page + # render trips Pillow's decompression-bomb guard). A screenshot is + # nice-to-have — never lose the whole capture over it. 
+ try: + screenshot, content_type = _encode_screenshot( + png, + self.config.screenshot_format, + self.config.screenshot_max_height, + ) + except Exception as e: + logger.info("screenshot encode failed for %s: %s", url, e) + screenshot, content_type = None, None return CaptureResult( url=url, diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py b/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py index 26b54831..8b689781 100644 --- a/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py +++ b/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py @@ -4,11 +4,19 @@ from a bot's published reasoning: - :mod:`url_extraction` — pull URLs out of free text / markdown. - - :mod:`metaculus_comments` — harvest bot comments via the public Metaculus API. + - :mod:`metaculus_db` — read a bot's cited URLs from the platform database. + - :mod:`trace_extraction` — build a manifest from a traced bot run (fullest path). """ -from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_comments import ( - MetaculusCommentHarvester, +from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_db import ( + MetaculusDbHarvester, + resolve_dsn, +) +from forecasting_tools.agents_and_tools.source_archive.ingest.trace_extraction import ( + extract_records_from_events, + extract_records_from_question_dir, + extract_records_from_trace_file, + harvest_run, ) from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import ( dedupe_records, @@ -17,8 +25,13 @@ ) __all__ = [ - "MetaculusCommentHarvester", + "MetaculusDbHarvester", "dedupe_records", "extract_citation_records", + "extract_records_from_events", + "extract_records_from_question_dir", + "extract_records_from_trace_file", "extract_urls", + "harvest_run", + "resolve_dsn", ] diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py 
b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py deleted file mode 100644 index 0aff84a9..00000000 --- a/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py +++ /dev/null @@ -1,180 +0,0 @@ -"""Harvest the URLs bots cite, from their public Metaculus comments. - -Both first-party and third-party bots publish their reasoning — with the source -links they used — as comments on the questions they forecast. The public, -no-auth Metaculus API is therefore the one mechanism that works across *every* -bot on the platform, which is why this is the general ingestion path. - -Flow: - - 1. Enumerate the bots participating in a project (tournament) leaderboard. - 2. Page through each bot's comments. - 3. Extract the URLs from each comment and emit CitationRecords. - -The result is a citation manifest you can feed straight to the capture pipeline. - -Caveat: comments are length-truncated when posted, so a comment-harvested URL -list can be incomplete versus the bot's full research. For bots you control, an -instrumented trace gives a fuller list; this path is the universal baseline. -""" - -from __future__ import annotations - -import logging -import os -from collections.abc import Iterator -from typing import Any, Callable - -from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import ( - extract_citation_records, -) -from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord - -logger = logging.getLogger(__name__) - -DEFAULT_BASE_URL = "https://www.metaculus.com/api" -PAGE_LIMIT = 100 - - -def _first(d: dict, *keys, default=None): - for k in keys: - if k in d and d[k] is not None: - return d[k] - return default - - -class MetaculusCommentHarvester: - """Reads bot comments via the public Metaculus API. - - HTTP is injectable for testing: pass ``fetch_json=callable(path, params) -> - dict`` to avoid real network calls. 
- """ - - def __init__( - self, - base_url: str | None = None, - *, - session: Any = None, - timeout: int = 30, - fetch_json: Callable[[str, dict], dict] | None = None, - ): - self.base_url = ( - base_url or os.environ.get("METACULUS_API_BASE_URL") or DEFAULT_BASE_URL - ).rstrip("/") - self.web_base = ( - self.base_url[:-4] if self.base_url.endswith("/api") else self.base_url - ) - self.timeout = timeout - self._session = session - self._fetch_json = fetch_json - - # --- http -------------------------------------------------------------- - def _get(self, path: str, params: dict) -> dict: - if self._fetch_json is not None: - return self._fetch_json(path, params) - try: - import requests - except ImportError as e: # pragma: no cover - requests is a core dep - raise ImportError("requests is required for comment harvesting") from e - if self._session is None: - self._session = requests.Session() - resp = self._session.get( - f"{self.base_url}{path}", params=params, timeout=self.timeout - ) - resp.raise_for_status() - return resp.json() - - # --- bots -------------------------------------------------------------- - def enumerate_bots(self, project_id: int | str) -> list[dict]: - """Return the bot ``user`` records on a project's leaderboard.""" - data = self._get( - f"/leaderboards/project/{project_id}/", {"with_entries": "true"} - ) - entries = _first(data, "leaderboard_entries", "entries", "results", default=[]) - bots: list[dict] = [] - seen: set[Any] = set() - for entry in entries: - user = entry.get("user") if isinstance(entry, dict) else None - if not user or not user.get("is_bot"): - continue - uid = user.get("id") - if uid in seen: - continue - seen.add(uid) - bots.append(user) - return bots - - # --- comments ---------------------------------------------------------- - def iter_comments( - self, author_id: int | str, post_id: int | str | None = None - ) -> Iterator[dict]: - """Yield every comment authored by ``author_id`` (optionally on one post).""" - offset = 
0 - while True: - params = {"author": author_id, "limit": PAGE_LIMIT, "offset": offset} - if post_id is not None: - params["post"] = post_id - data = self._get("/comments/", params) - results = ( - _first(data, "results", default=[]) if isinstance(data, dict) else data - ) - if not results: - break - yield from results - if len(results) < PAGE_LIMIT: - break - offset += PAGE_LIMIT - - # --- harvesting -------------------------------------------------------- - def _records_from_comment( - self, comment: dict, *, run_id: str | None, bot: str | None - ) -> list[CitationRecord]: - post_id = _first(comment, "on_post", "post", "post_id") - post_id_str = str(post_id) if post_id is not None else None - question_url = ( - f"{self.web_base}/questions/{post_id}/" if post_id is not None else None - ) - comment_id = comment.get("id") - return extract_citation_records( - comment.get("text"), - run_id=run_id, - bot=bot, - question_id=post_id_str, - metaculus_id=post_id_str, - question_url=question_url, - trace=f"comment:{comment_id}" if comment_id is not None else None, - origin="metaculus_comment", - ) - - def harvest_author( - self, - author_id: int | str, - *, - run_id: str | None = None, - bot: str | None = None, - post_id: int | str | None = None, - ) -> list[CitationRecord]: - """All citation records from one bot's comments.""" - records: list[CitationRecord] = [] - for comment in self.iter_comments(author_id, post_id=post_id): - records.extend(self._records_from_comment(comment, run_id=run_id, bot=bot)) - return records - - def harvest_project( - self, project_id: int | str, *, run_id: str | None = None - ) -> list[CitationRecord]: - """All citation records from every bot on a project's leaderboard. - - Records are kept per-citation (duplicates across bots are preserved as - distinct provenance); the capture pipeline dedupes URLs before fetching. 
- """ - run_id = run_id or f"metaculus-comments-{project_id}" - records: list[CitationRecord] = [] - bots = self.enumerate_bots(project_id) - logger.info("project %s: %d bot(s) on leaderboard", project_id, len(bots)) - for user in bots: - bot_name = user.get("username") or str(user.get("id")) - bot_records = self.harvest_author(user["id"], run_id=run_id, bot=bot_name) - logger.info(" bot %s: %d cited URL(s)", bot_name, len(bot_records)) - records.extend(bot_records) - return records diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_db.py b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_db.py new file mode 100644 index 00000000..c0221bbf --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_db.py @@ -0,0 +1,215 @@ +"""Read a bot's cited URLs from the platform Postgres database (operator tooling). + +For operators with database access, this reads the URLs a forecasting bot cited +straight from Postgres (``comments_comment`` joined to ``users_user.is_bot``) and +emits the same :class:`CitationRecord`s as every other ingestion path, so the +catalog / coverage / capture stages downstream are unchanged. By default it reads +all of a bot's comments (``include_private=True``); pass ``include_private=False`` +for the public ones only. Only ``u.is_bot`` accounts are ever read. + +The DB call is **injected** (``query``) so the core is driver-agnostic and unit +testable; :meth:`from_dsn` wires a psycopg2 connection for real use (a libpq DSN +or a ``postgresql://…`` URL — e.g. a Neon connection string). Reads only; no +secrets are stored — the DSN comes from the caller / environment. 
+""" + +from __future__ import annotations + +import os +from typing import Any, Callable, Mapping, Sequence + +from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import ( + extract_citation_records, +) +from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord + +QueryFn = Callable[[str, Sequence[Any]], list[dict]] + +# Keychain service name the DSN is stored under (see resolve_dsn / README). +KEYCHAIN_SERVICE = "metaculus-db-dsn" +LOCAL_DEFAULT_DSN = "dbname=metaculus" + +_WEB = "https://www.metaculus.com" + +# The windowed/post-scoped comment set is computed in a MATERIALIZED CTE so +# Postgres evaluates it FIRST, then joins users_user by primary key. Without the +# CTE the planner's stale stats misjudge the date window at ~300k rows (it is +# really ~2k/day) and pick a join order that times out on the remote pooler. +_OUTER = ( + "select r.id as comment_id, r.on_post_id, r.text, " + "u.username, r.author_id " + "from recent r join users_user u on u.id = r.author_id where u.is_bot" +) + + +def _recent_cte(scope: str, include_private: bool) -> str: + """A MATERIALIZED ``recent`` CTE of link-bearing, non-deleted comments. + + ``scope`` is the row-narrowing predicate (a post id or a created_at window). + Private comments are included unless ``include_private`` is False. + + ``strpos(text,'http') > 0`` is a cheap substring pre-filter (a regex `~` scan + times out on the pooler; ``like`` would need ``%%`` escaping under psycopg2). + The real URL parsing happens in extract_citation_records, so over-matching + here just costs a few empty rows. 
+ """ + clauses = ["not c.is_soft_deleted", "strpos(c.text, 'http') > 0", scope] + if not include_private: + clauses.append("not c.is_private") + where = " and ".join(clauses) + return ( + "with recent as materialized (" + "select c.id, c.on_post_id, c.text, c.author_id, c.created_at " + f"from comments_comment c where {where}) " + ) + + +def _dsn_from_keychain(service: str = KEYCHAIN_SERVICE) -> str | None: + """Read the DSN from the macOS login Keychain, or ``None`` if unavailable. + + Uses ``security find-generic-password -w`` so the credential lives only in + the Keychain — never in ``.env``, a shell rc, or shell history. If the + Keychain item's ACL is set to confirm on access, this call raises a GUI + prompt: a human can approve it, an automated agent driving the shell cannot. + Returns ``None`` off macOS or when the item is absent / access is denied, so + callers fall through to the next source. + """ + import shutil + import subprocess + + if not shutil.which("security"): # not macOS + return None + try: + proc = subprocess.run( + ["security", "find-generic-password", "-s", service, "-w"], + capture_output=True, + text=True, + timeout=30, + ) + except (OSError, subprocess.SubprocessError): + return None + if proc.returncode != 0: + return None + return proc.stdout.strip() or None + + +def resolve_dsn( + explicit: str | None = None, + *, + env: Mapping[str, str] | None = None, + keychain_reader: Callable[[], str | None] | None = None, +) -> str: + """Resolve the DB DSN without ever persisting it to disk. + + Resolution order, first hit wins: + 1. ``explicit`` (e.g. a ``--dsn`` flag — convenient, but lands in shell + history, so prefer the Keychain for the real secret), + 2. ``$METACULUS_DB_DSN``, + 3. the macOS Keychain item ``metaculus-db-dsn`` (the private path), + 4. the local default ``dbname=metaculus``. + ``env`` / ``keychain_reader`` are injectable for tests. 
+ """ + if explicit: + return explicit + environ = env if env is not None else os.environ + from_env = environ.get("METACULUS_DB_DSN") + if from_env: + return from_env + reader = keychain_reader or _dsn_from_keychain + from_keychain = reader() + if from_keychain: + return from_keychain + return LOCAL_DEFAULT_DSN + + +class MetaculusDbHarvester: + """Reads bot comments from Postgres. ``query(sql, params) -> list[dict]``.""" + + def __init__(self, query: QueryFn): + self._query = query + + @classmethod + def from_dsn(cls, dsn: str = "dbname=metaculus") -> "MetaculusDbHarvester": + try: + import psycopg2 + import psycopg2.extras + except ImportError as e: # pragma: no cover - optional operator dep + raise ImportError( + "psycopg2 is required for DB harvesting " + "(`pip install psycopg2-binary`)." + ) from e + conn = psycopg2.connect(dsn) + + def query(sql: str, params: Sequence[Any]) -> list[dict]: + with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute(sql, params) + return [dict(r) for r in cur.fetchall()] + + return cls(query) + + def _records(self, rows: list[dict], run_id: str | None) -> list[CitationRecord]: + out: list[CitationRecord] = [] + for r in rows: + post_id = r.get("on_post_id") + pid = str(post_id) if post_id is not None else None + comment_id = r.get("comment_id") + out.extend( + extract_citation_records( + r.get("text"), + run_id=run_id, + bot=r.get("username") or str(r.get("author_id")), + question_id=pid, + metaculus_id=pid, + question_url=( + f"{_WEB}/questions/{post_id}/" if post_id is not None else None + ), + comment_id=str(comment_id) if comment_id is not None else None, + origin="metaculus_comment", + ) + ) + return out + + def harvest_post( + self, + post_id: int | str, + *, + run_id: str | None = None, + include_private: bool = True, + ) -> list[CitationRecord]: + """Every bot-cited URL in the comments on one post.""" + run_id = run_id or f"metaculus-db-post-{post_id}" + sql = ( + _recent_cte("c.on_post_id 
= %s", include_private) + + _OUTER + + " order by r.created_at" + ) + return self._records(self._query(sql, (post_id,)), run_id) + + def harvest_recent( + self, + *, + days: int = 1, + limit: int | None = None, + run_id: str | None = None, + include_private: bool = True, + ) -> list[CitationRecord]: + """Bot-cited URLs from the most recent ``days`` of comments. + + "Recent" is measured against ``max(created_at)`` in the table, not wall + clock, so a replica that lags real time by a day still returns its latest + day with ``days=1``. ``limit`` caps the row count; ``None`` (the default) + is uncapped, which is what a daily sweep wants. + """ + run_id = run_id or f"metaculus-db-recent-{days}d" + scope = ( + "c.created_at >= " + "(select max(created_at) from comments_comment) - (%s * interval '1 day')" + ) + sql = ( + _recent_cte(scope, include_private) + _OUTER + " order by r.created_at desc" + ) + params: list[Any] = [days] + if limit: + sql += " limit %s" + params.append(limit) + return self._records(self._query(sql, tuple(params)), run_id) diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/trace_extraction.py b/forecasting_tools/agents_and_tools/source_archive/ingest/trace_extraction.py new file mode 100644 index 00000000..c330eccb --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/ingest/trace_extraction.py @@ -0,0 +1,380 @@ +"""Build a citation manifest from a bot's run traces. + +When the template bot is run with tracing enabled it writes one JSONL trace per +forecast attempt, recording the agent loop step by step. Those traces are the +*fullest* record of what the bot actually looked at — richer than the reasoning +comment it posts, which is length-truncated (see :mod:`metaculus_db` for the +shallower comment path). + +This module walks those traces and pulls out every external URL the bot touched, +turning each into a :class:`CitationRecord` with provenance (which trace, which +tool, the search query that surfaced it). 
That manifest is the input to the +capture pipeline, exactly like the comment-harvested one. + +Trace layout +------------ +A traced run is a directory tree:: + + / + bot_/ + q_/ + question.json + traces_forecast_1_attempt_1.jsonl + traces_summarize.jsonl + ... + +Each ``traces_*.jsonl`` file is a stream of newline-delimited event objects. The +events that can carry external links are: + +- ``tool_call`` — the arguments the bot passed to a tool (e.g. a search query, + or a ``url`` handed to a page fetcher). Carries ``name`` and ``call_id``. +- ``tool_result`` — what the tool returned. Search tools inline their citations + here as ``[n](url)`` or as a list of result URLs. Carries ``call_id`` so the + result can be attributed back to the originating ``tool_call``. +- ``initial_prompt`` — the first prompt of a trace. Only scanned for the + ``summarize`` trace: the template bot runs research *outside* the agent loop + and pastes the research blob verbatim into the summarizer's first prompt, so + that is the one place those URLs are recoverable. Other traces' initial + prompts just echo the question text (background, resolution criteria), whose + URLs aren't research, so they're skipped. + +Search provenance (``query`` / ``tool_args``) only exists in these instrumented +traces — it is populated here from each ``tool_call`` and carried onto the URLs +that the matching ``tool_result`` returned. +""" + +from __future__ import annotations + +import glob +import json +import os +from pathlib import Path +from typing import Any + +from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import ( + extract_urls, +) +from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord + +METACULUS_QUESTION_URL_FMT = "https://www.metaculus.com/questions/{}/" + +# Event type -> the field on that event that carries the URL-bearing payload. 
_SCANNABLE_FIELDS: dict[str, str] = {
    "tool_call": "args",
    "tool_result": "content",
    "initial_prompt": "prompt",
}
# The trace whose initial prompt holds pasted-in research (see module docstring).
_SUMMARIZE_TRACE_LABEL = "summarize"
# Keys a tool's input commonly uses for the search string, best-effort.
_QUERY_KEYS = ("query", "q", "search_query", "search", "queries", "question")


def _urls_in(value: Any) -> list[str]:
    """Return the URLs found anywhere inside ``value``, in traversal order.

    ``value`` may be a string, a mapping, a sequence, a set, or any scalar.
    Every string reached (including dict keys) is passed through
    :func:`extract_urls`, so URL parsing is the same as for any other caller of
    that helper. Non-string scalars are stringified before scanning.

    URLs are de-duplicated only within each individual string (that is what
    ``extract_urls`` does); the same URL found in two different strings is
    returned twice.
    """
    urls: list[str] = []

    def walk(v: Any) -> None:
        if v is None:
            return
        if isinstance(v, str):
            urls.extend(extract_urls(v))
            return
        if isinstance(v, dict):
            for key, val in v.items():
                walk(key)
                walk(val)
            return
        if isinstance(v, (list, tuple)):
            for item in v:
                walk(item)
            return
        if isinstance(v, (set, frozenset)):
            # Sets iterate in hash order, which differs between processes for
            # strings; sort so the resulting URL order is reproducible.
            for item in sorted(v, key=repr):
                walk(item)
            return
        walk(str(v))

    walk(value)
    return urls


def _query_from_args(args: Any) -> str | None:
    """Pull the search string out of a tool's arguments, if recognisable.

    Tries each key in ``_QUERY_KEYS`` in order. A non-blank string value is
    returned as-is; a non-empty list/tuple is joined with spaces (falsy items
    dropped). Returns ``None`` when ``args`` is not a dict or no key matches.
    """
    if not isinstance(args, dict):
        return None
    for key in _QUERY_KEYS:
        val = args.get(key)
        if isinstance(val, str) and val.strip():
            return val
        if isinstance(val, (list, tuple)) and val:
            joined = " ".join(str(item) for item in val if item)
            if joined.strip():
                return joined
    return None


def trace_label(trace_path: str) -> str:
    """``traces_forecast_1_attempt_1.jsonl`` -> ``forecast_1_attempt_1``.

    Only the file name is considered; a missing ``traces_`` prefix or
    ``.jsonl`` suffix is simply left in place.
    """
    name = os.path.basename(trace_path)
    if name.startswith("traces_"):
        name = name[len("traces_") :]
    if name.endswith(".jsonl"):
        name = name[: -len(".jsonl")]
    return name


def extract_records_from_events(
    events: Any,
    *,
    trace: str | None = None,
    include_initial_prompt: bool = False,
    run_id: str | None = None,
    bot: str | None = None,
    question_id: str | None = None,
    metaculus_id: str | None = None,
    question_url: str | None = None,
) -> list[CitationRecord]:
    """Turn one trace's event stream into CitationRecords.

    ``events`` is any iterable of event dicts (already parsed from JSONL).
    Anything that is not a dict, or whose ``type`` is not a string, is skipped.
    The given provenance is stamped onto every record; per-event provenance
    (``trace``, ``tool_name``, ``origin``, ``query``, ``tool_args``,
    ``first_seen``) is filled in here.

    Set ``include_initial_prompt`` to scan ``initial_prompt`` events — callers
    should only do this for the ``summarize`` trace (see module docstring).

    Returns one record per URL occurrence, in event order.
    """
    records: list[CitationRecord] = []
    # tool_result events only carry call_id, so remember each tool_call's name
    # and arguments to attribute the result back to its originating call.
    tool_name_by_call_id: dict[str, str] = {}
    tool_args_by_call_id: dict[str, Any] = {}

    for event in events:
        if not isinstance(event, dict):
            continue
        event_type = event.get("type")
        # A malformed event may carry a non-string (even unhashable) type;
        # treat it like any other unrecognised event instead of crashing on
        # the dict lookup below.
        if not isinstance(event_type, str):
            continue

        if event_type == "tool_call":
            call_id = str(event.get("call_id") or "").strip()
            name = event.get("name") or ""
            if call_id:
                if name:
                    tool_name_by_call_id[call_id] = name
                if "args" in event:
                    tool_args_by_call_id[call_id] = event.get("args")

        field = _SCANNABLE_FIELDS.get(event_type)
        if field is None:
            continue
        if event_type == "initial_prompt" and not include_initial_prompt:
            continue

        urls = _urls_in(event.get(field))
        if not urls:
            continue

        if event_type == "tool_call":
            tool_name = event.get("name") or ""
            origin = "tool_call"
            tool_args = (
                event.get("args") if isinstance(event.get("args"), dict) else None
            )
        elif event_type == "tool_result":
            call_id = str(event.get("call_id") or "").strip()
            tool_name = tool_name_by_call_id.get(call_id, "")
            origin = "tool_result"
            originating_args = tool_args_by_call_id.get(call_id)
            tool_args = originating_args if isinstance(originating_args, dict) else None
        else:  # initial_prompt
            tool_name = ""
            origin = event_type
            tool_args = None

        query = _query_from_args(tool_args)
        timestamp = event.get("timestamp")
        for url in urls:
            record = CitationRecord(
                url=url,
                run_id=run_id,
                bot=bot,
                question_id=question_id,
                metaculus_id=metaculus_id,
                question_url=question_url,
                trace=trace,
                tool_name=tool_name,
                origin=origin,
                query=query,
                tool_args=tool_args,
            )
            # Only override the model's default first_seen when the event
            # actually carries a timestamp.
            if timestamp:
                record.first_seen = str(timestamp)
            records.append(record)

    return records


def _read_jsonl(path: str) -> list[dict]:
    """Read a JSONL file, skipping blank or unparsable lines.

    Lines that parse to something other than a dict are still returned;
    :func:`extract_records_from_events` filters those out.
    """
    events: list[dict] = []
    for raw_line in Path(path).read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line:
            continue
        try:
            events.append(json.loads(line))
        except json.JSONDecodeError:
            continue
    return events


def extract_records_from_trace_file(
    trace_path: str,
    *,
    run_id: str | None = None,
    bot: str | None = None,
    question_id: str | None = None,
    metaculus_id: str | None = None,
    question_url: str | None = None,
) -> list[CitationRecord]:
    """Extract CitationRecords from one ``traces_*.jsonl`` file.

    The trace label is derived from the file name, and ``initial_prompt``
    events are scanned only when that label is the ``summarize`` trace.
    """
    label = trace_label(trace_path)
    return extract_records_from_events(
        _read_jsonl(trace_path),
        trace=label,
        include_initial_prompt=(label == _SUMMARIZE_TRACE_LABEL),
        run_id=run_id,
        bot=bot,
        question_id=question_id,
        metaculus_id=metaculus_id,
        question_url=question_url,
    )


def _read_question_metadata(question_dir: str) -> tuple[str | None, str | None]:
    """Return ``(question_id, metaculus_id)`` from ``question.json`` in the dir.

    Read as a plain dict with flexible keys so the ingest stays decoupled from
    any particular question model. Missing, unreadable, wrongly encoded or
    unparsable metadata is non-fatal — ``(None, None)`` is returned and records
    are still emitted, just with empty question provenance.
    """
    question_path = os.path.join(question_dir, "question.json")
    try:
        data = json.loads(Path(question_path).read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError (non-UTF-8 bytes).
        return None, None
    if not isinstance(data, dict):
        return None, None

    def _str_or_none(*keys: str) -> str | None:
        # First key with a non-null value wins.
        for key in keys:
            val = data.get(key)
            if val is not None:
                return str(val)
        return None

    question_id = _str_or_none("question_id", "id", "post_id")
    metaculus_id = _str_or_none("metaculus_id", "post_id", "id")
    return question_id, metaculus_id


def extract_records_from_question_dir(
    question_dir: str,
    *,
    run_id: str | None = None,
    bot: str | None = None,
    question_id: str | None = None,
    metaculus_id: str | None = None,
    question_url: str | None = None,
) -> list[CitationRecord]:
    """Aggregate CitationRecords across every trace in one question dir.

    Question provenance is read from ``question.json`` in the dir; pass any of
    ``question_id`` / ``metaculus_id`` / ``question_url`` to override what's
    found there (or to supply it when the file is absent). When no
    ``question_url`` is given but a Metaculus id is known, the URL is built
    from ``METACULUS_QUESTION_URL_FMT``.

    Trace files (``traces_*.jsonl``) are processed in sorted path order.
    """
    found_qid, found_mid = _read_question_metadata(question_dir)
    question_id = question_id or found_qid
    metaculus_id = metaculus_id or found_mid
    if question_url is None and metaculus_id is not None:
        question_url = METACULUS_QUESTION_URL_FMT.format(metaculus_id)

    records: list[CitationRecord] = []
    for trace_path in sorted(glob.glob(os.path.join(question_dir, "traces_*.jsonl"))):
        records.extend(
            extract_records_from_trace_file(
                trace_path,
                run_id=run_id,
                bot=bot,
                question_id=question_id,
                metaculus_id=metaculus_id,
                question_url=question_url,
            )
        )
    return records


def _bot_name_from_dir(bot_dir: str) -> str:
    """``.../bot_complex`` -> ``complex``."""
    name = os.path.basename(bot_dir)
    return name[len("bot_") :] if name.startswith("bot_") else name


def _question_dirs_flat(run_dir: str) -> list[str]:
    """Question dirs directly under ``run_dir`` (no ``bot_*`` level).

    A "question dir" is any immediate subdirectory that actually contains
    ``traces_*.jsonl``. This handles flatter layouts where the ``bot_*``
    grouping is absent. Results are in sorted path order.
    """
    dirs = []
    for entry in sorted(glob.glob(os.path.join(run_dir, "*"))):
        if os.path.isdir(entry) and glob.glob(os.path.join(entry, "traces_*.jsonl")):
            dirs.append(entry)
    return dirs


def harvest_run(
    run_dir: str, *, run_id: str | None = None, bot: str | None = None
) -> list[CitationRecord]:
    """Build a citation manifest from a whole traced run directory.

    Primary layout is ``bot_*/q_*/traces_*.jsonl`` under ``run_dir``, deriving
    ``run_id`` from the run dir's name (unless given) and ``bot`` from each
    ``bot_*`` subdirectory. If no ``bot_*`` *directories* exist, falls back to
    a flat layout — question dirs directly under ``run_dir`` — attributing
    every question to a single bot (the ``bot`` argument, else the run dir's
    name). Question provenance still comes from each dir's ``question.json``.

    Returns the flat list of CitationRecords (one per URL occurrence); feed it
    through :func:`url_extraction.dedupe_records` before capture for one row
    per URL.
    """
    run_id = run_id or os.path.basename(os.path.normpath(run_dir))
    records: list[CitationRecord] = []

    # glob matches files too; a stray ``bot_*`` file must not be mistaken for
    # a bot directory, or the flat fallback below would be skipped and the run
    # would yield nothing.
    bot_dirs = [
        path
        for path in sorted(glob.glob(os.path.join(run_dir, "bot_*")))
        if os.path.isdir(path)
    ]
    if bot_dirs:
        for bot_dir in bot_dirs:
            bot_name = _bot_name_from_dir(bot_dir)
            for question_dir in sorted(glob.glob(os.path.join(bot_dir, "q_*"))):
                records.extend(
                    extract_records_from_question_dir(
                        question_dir, run_id=run_id, bot=bot_name
                    )
                )
        return records

    # Flat fallback: no bot_* grouping. One bot, question dirs directly below.
    bot_name = bot or os.path.basename(os.path.normpath(run_dir))
    for question_dir in _question_dirs_flat(run_dir):
        records.extend(
            extract_records_from_question_dir(question_dir, run_id=run_id, bot=bot_name)
        )
    return records
+_TRAILING = ".,;:!?'\"\\`" + + +def _cut_markdown_tail(url: str) -> str: + """Cut a URL at a markdown reference/link tail the bare-URL scan can swallow. + + Bots sometimes emit ``…/story?id=123)[10](https://other…)`` where ``)[10](…`` + is a markdown reference glued onto a real URL. The leading ``)`` was never + part of the URL, so cut at the first ``)[`` or ``](`` boundary. + """ + cut = len(url) + for marker in (")[", "]("): + i = url.find(marker) + if i > 0: + cut = min(cut, i) + return url[:cut] def _trim(url: str) -> str: """Strip trailing punctuation, and a closing bracket/paren only when it is unbalanced (so Wikipedia-style ``..._(disambiguation)`` URLs survive).""" + url = _cut_markdown_tail(url) while url: last = url[-1] if last in _TRAILING: @@ -45,7 +65,12 @@ def _trim(url: str) -> str: def extract_urls(text: str | None) -> list[str]: - """Return the distinct http(s) URLs in ``text``, in first-seen order.""" + """Return the distinct http(s) URLs in ``text``, in first-seen order. + + Distinctness is by *canonical* URL (see :func:`canonicalize_url`), so + ``…/x`` and ``…/x?utm_source=…`` count once; the original first-seen string + is returned. 
+ """ if not text: return [] seen: set[str] = set() @@ -53,8 +78,11 @@ def extract_urls(text: str | None) -> list[str]: for pattern in (_MD_LINK, _AUTOLINK, _BARE): for match in pattern.finditer(text): url = _trim(match.group(1)) - if url and url not in seen: - seen.add(url) + if not url: + continue + key = canonicalize_url(url) + if key not in seen: + seen.add(key) ordered.append(url) return ordered @@ -67,6 +95,7 @@ def extract_citation_records( question_id: str | None = None, metaculus_id: str | None = None, question_url: str | None = None, + comment_id: str | None = None, trace: str | None = None, tool_name: str | None = None, origin: str | None = None, @@ -81,6 +110,7 @@ def extract_citation_records( question_id=question_id, metaculus_id=metaculus_id, question_url=question_url, + comment_id=comment_id, trace=trace, tool_name=tool_name, origin=origin, @@ -90,11 +120,14 @@ def extract_citation_records( def dedupe_records(records: Iterable[CitationRecord]) -> list[CitationRecord]: - """Keep the first record per URL, preserving order.""" + """Keep the first record per *canonical* URL, preserving order.""" seen: set[str] = set() out: list[CitationRecord] = [] for r in records: - if r.url and r.url not in seen: - seen.add(r.url) + if not r.url: + continue + key = canonicalize_url(r.url) + if key not in seen: + seen.add(key) out.append(r) return out diff --git a/forecasting_tools/agents_and_tools/source_archive/manifest.py b/forecasting_tools/agents_and_tools/source_archive/manifest.py index 609c74d7..880ab161 100644 --- a/forecasting_tools/agents_and_tools/source_archive/manifest.py +++ b/forecasting_tools/agents_and_tools/source_archive/manifest.py @@ -10,6 +10,9 @@ from collections.abc import Iterable, Iterator from pathlib import Path +from forecasting_tools.agents_and_tools.source_archive.canonicalize import ( + canonicalize_url, +) from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig from 
forecasting_tools.agents_and_tools.source_archive.models import CitationRecord from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import ( @@ -31,11 +34,19 @@ def loads(text: str) -> list[CitationRecord]: def unique_urls(records: Iterable[CitationRecord]) -> Iterator[str]: - """Yield each distinct URL once, preserving first-seen order.""" + """Yield each distinct URL once, preserving first-seen order. + + Distinctness is by *canonical* URL (see :func:`canonicalize_url`), so + near-duplicate links collapse to a single fetch; the original first-seen URL + string is what's yielded, for provenance. + """ seen: set[str] = set() for r in records: - if r.url and r.url not in seen: - seen.add(r.url) + if not r.url: + continue + key = canonicalize_url(r.url) + if key not in seen: + seen.add(key) yield r.url diff --git a/forecasting_tools/agents_and_tools/source_archive/models.py b/forecasting_tools/agents_and_tools/source_archive/models.py index 8caad9ac..08c63cd6 100644 --- a/forecasting_tools/agents_and_tools/source_archive/models.py +++ b/forecasting_tools/agents_and_tools/source_archive/models.py @@ -8,14 +8,24 @@ from pydantic import BaseModel, Field +from forecasting_tools.agents_and_tools.source_archive.canonicalize import ( + canonicalize_url, +) + def utcnow_iso() -> str: return datetime.now(timezone.utc).isoformat() def url_hash(url: str) -> str: - """Stable key for a URL — groups every capture of that URL together.""" - return hashlib.sha256(url.encode("utf-8")).hexdigest() + """Stable key for a URL — groups every capture of that URL together. + + The URL is canonicalized first (see :func:`canonicalize_url`) so trivially + different links — tracking params, a trailing slash, a ``#fragment``, + query-param order, host case — collapse onto one key instead of being + stored and counted as separate sources. 
+ """ + return hashlib.sha256(canonicalize_url(url).encode("utf-8")).hexdigest() def content_hash(html: str | bytes) -> str: @@ -56,6 +66,9 @@ class StoredCapture(BaseModel): html_key: str | None = None screenshot_key: str | None = None markdown_key: str | None = None + # Set when this capture reuses another URL's blobs because the fetched + # content was byte-identical (cross-URL content dedup); holds that URL's hash. + content_alias_of: str | None = None first_seen: str = Field(default_factory=utcnow_iso) last_seen: str = Field(default_factory=utcnow_iso) @@ -74,7 +87,11 @@ class CitationRecord(BaseModel): question_id: str | None = None metaculus_id: str | None = None question_url: str | None = None + comment_id: str | None = None # Metaculus comment the URL was cited in trace: str | None = None tool_name: str | None = None origin: str | None = None + # Search provenance (populated by instrumented trace ingest, not comments): + query: str | None = None # the search query the bot ran, if known + tool_args: dict[str, Any] | None = None # full tool input (query + filters…) first_seen: str = Field(default_factory=utcnow_iso) diff --git a/forecasting_tools/agents_and_tools/source_archive/pipeline.py b/forecasting_tools/agents_and_tools/source_archive/pipeline.py index 1855f039..67f5817b 100644 --- a/forecasting_tools/agents_and_tools/source_archive/pipeline.py +++ b/forecasting_tools/agents_and_tools/source_archive/pipeline.py @@ -11,6 +11,7 @@ from __future__ import annotations import logging +import threading from collections.abc import Iterable from pydantic import BaseModel @@ -71,6 +72,9 @@ def capture_url(self, url: str) -> CaptureOutcome: except FetchError as e: logger.info("fetch error for %s: %s", url, e) return CaptureOutcome(url=url, status="error", reason=str(e)) + except Exception as e: # never let one bad URL abort the whole run + logger.warning("unexpected error capturing %s: %s", url, e) + return CaptureOutcome(url=url, status="error", 
# An outcome whose error reason contains one of these means the browser itself
# died (crash, OOM, or the machine slept and severed the CDP pipe) — not a
# problem with the URL. Without recovery, every later URL in that worker's shard
# would error against the dead browser, so we rebuild the browser and retry.
_DEAD_BROWSER_MARKERS = (
    "has been closed",
    "Target page, context or browser",
    "Browser.new_context",
    "Connection closed",
    "browser has been closed",
)


def _browser_died(reason: str | None) -> bool:
    """Return True when ``reason`` contains any dead-browser marker."""
    return any(m in (reason or "") for m in _DEAD_BROWSER_MARKERS)


def _close_quietly(cm, timeout_s: float = 15.0) -> None:
    """Tear down a fetcher context manager, but never block on it.

    A wedged browser's ``close()`` can itself hang, so ``cm.__exit__`` runs in
    a daemon thread and this function gives up waiting after ``timeout_s``
    (the leftover process is reaped at the end of the run). Exceptions raised
    by the teardown are deliberately discarded: this is best-effort cleanup.
    """
    done = threading.Event()

    def _close() -> None:
        try:
            cm.__exit__(None, None, None)
        except Exception:
            # Best-effort: the browser may already be dead.
            pass
        finally:
            done.set()

    threading.Thread(target=_close, daemon=True).start()
    done.wait(timeout_s)


def _reap_browser_descendants() -> None:
    """Best-effort: kill automation Chromium descending from this process.

    Used both to recover a wedged worker (kill its browser so the blocked sync
    call errors out) and to sweep leftovers at end of run. Any descendant
    process whose name contains ``chrom`` is killed. No-op without psutil so it
    never becomes a hard dependency; all errors are swallowed.
    """
    try:
        import os

        import psutil
    except Exception:
        return
    try:
        for child in psutil.Process(os.getpid()).children(recursive=True):
            try:
                if "chrom" in (child.name() or "").lower():
                    child.kill()
            except Exception:
                # Process may have exited or be inaccessible; keep sweeping.
                pass
    except Exception:
        pass


def capture_urls_concurrent(
    urls: Iterable[str],
    store: ContentStore,
    config,
    fetcher_factory,
    per_url_timeout: float | None = None,
    reaper=_reap_browser_descendants,
) -> PipelineSummary:
    """Capture ``urls`` across up to ``config.concurrency`` worker threads.

    Headless Chromium's sync API is **thread-affine** — a browser must be used
    on the thread that created it — so each worker opens its **own** browser
    via ``fetcher_factory(config)`` and runs all captures inline on its own
    thread. The content store is shared. Order of outcomes is not preserved.

    No more workers are started than there are URLs (an idle worker would
    still launch a browser), and an empty ``urls`` returns an empty summary
    without launching anything.

    Hang protection runs *out of band*: a supervisor thread watches each
    worker's heartbeat and, if one is stuck on a single URL past
    ``per_url_timeout``, calls ``reaper`` (by default: kill the browser
    processes). That is an OS-level action, so the blocked call errors out and
    the worker rebuilds via the dead-browser retry path — no single stuck
    worker can freeze the whole run.

    Args:
        urls: URLs to capture.
        store: Shared content store handed to every worker's pipeline.
        config: Read for ``concurrency`` (default 1) and ``nav_timeout_ms``
            (default 30000).
        fetcher_factory: ``fetcher_factory(config)`` must return a context
            manager whose ``__enter__`` yields the fetcher.
        per_url_timeout: Seconds a worker may spend on one URL before the
            supervisor intervenes; defaults to ``max(90, 4 * nav timeout)``.
        reaper: Zero-argument callable run on a stall and once at the end.

    Returns:
        A ``PipelineSummary`` holding every worker's outcomes.
    """
    import time
    from concurrent.futures import ThreadPoolExecutor

    url_list = list(urls)
    if not url_list:
        return PipelineSummary()

    workers = max(1, int(getattr(config, "concurrency", 1) or 1))
    # Every worker launches a browser up front; don't start ones that would
    # receive an empty shard.
    workers = min(workers, len(url_list))
    if per_url_timeout is None:
        nav_s = float(getattr(config, "nav_timeout_ms", 30000)) / 1000.0
        per_url_timeout = max(90.0, nav_s * 4)

    # worker index -> monotonic start of its current URL (None when between
    # URLs). Fully populated before the supervisor starts so the supervisor
    # never iterates a dict that is still growing.
    heartbeats: dict[int, float | None] = {i: None for i in range(workers)}
    hb_lock = threading.Lock()
    stop = threading.Event()

    def supervisor() -> None:
        interval = max(0.5, min(per_url_timeout / 2, 30.0))
        while not stop.wait(interval):
            now = time.monotonic()
            with hb_lock:
                stalled = [
                    w
                    for w, t in heartbeats.items()
                    if t is not None and now - t > per_url_timeout
                ]
            if stalled:
                logger.warning(
                    "worker(s) %s stuck > %.0fs on one URL; killing browsers to recover",
                    stalled,
                    per_url_timeout,
                )
                reaper()
                with hb_lock:  # grace: don't reap again before workers rebuild
                    for w in list(heartbeats):
                        if heartbeats[w] is not None:
                            heartbeats[w] = now

    def work(idx: int, shard: list[str]) -> list[CaptureOutcome]:
        outcomes: list[CaptureOutcome] = []
        cm = fetcher_factory(config)
        pipeline = CapturePipeline(cm.__enter__(), store)
        try:
            for url in shard:
                with hb_lock:
                    heartbeats[idx] = time.monotonic()
                outcome = pipeline.capture_url(url)
                if outcome.status == "error" and _browser_died(outcome.reason):
                    logger.warning(
                        "browser died; rebuilding worker %d, retrying %s", idx, url
                    )
                    _close_quietly(cm)
                    cm = fetcher_factory(config)
                    pipeline = CapturePipeline(cm.__enter__(), store)
                    with hb_lock:
                        heartbeats[idx] = time.monotonic()
                    outcome = pipeline.capture_url(url)  # one retry on a fresh browser
                outcomes.append(outcome)
                with hb_lock:
                    heartbeats[idx] = None
        finally:
            # Clear the heartbeat even when leaving via an exception (e.g. the
            # rebuild itself failed); a stale timestamp would make the
            # supervisor keep killing the other workers' healthy browsers.
            with hb_lock:
                heartbeats[idx] = None
            _close_quietly(cm)
        return outcomes

    supervisor_thread = threading.Thread(target=supervisor, daemon=True)
    supervisor_thread.start()
    try:
        if workers == 1:
            # Single worker runs inline on the calling thread.
            return PipelineSummary(outcomes=work(0, url_list))

        shards = [url_list[i::workers] for i in range(workers)]
        summary = PipelineSummary()
        with ThreadPoolExecutor(max_workers=workers) as pool:
            futures = [pool.submit(work, i, shards[i]) for i in range(workers)]
            for future in futures:
                summary.outcomes.extend(future.result())
        return summary
    finally:
        stop.set()
        reaper()
class Cluster(BaseModel):
    """A group of URLs that share ``key`` under one of the audit's lenses."""

    key: str
    urls: list[str]


class AnalysisReport(BaseModel):
    """Result of :func:`analyze`: counts plus the clusters behind them."""

    total_url_indexes: int = 0
    alias_indexes: int = 0  # already-collapsed redirects (Phase B)
    canonical_captures: int = 0  # distinct stored URLs with content
    distinct_after_canonicalization: int = 0
    distinct_after_content_dedup: int = 0
    canonicalization_clusters: list[Cluster] = []  # raw URLs that now share a key
    content_clusters: list[Cluster] = []  # different URLs, identical content
    near_dup_clusters: list[Cluster] = []  # same host+path, differing query

    def __str__(self) -> str:
        """Render the report as a plain-text summary with the top clusters."""
        merged_a = sum(len(c.urls) - 1 for c in self.canonicalization_clusters)
        merged_c = sum(len(c.urls) - 1 for c in self.content_clusters)
        lines = [
            "Source-archive dedup audit",
            "=" * 40,
            f"URL indexes scanned : {self.total_url_indexes}",
            f" of which alias (redirect) : {self.alias_indexes}",
            f" of which canonical capture : {self.canonical_captures}",
            "",
            f"Distinct URLs (raw) : {self.canonical_captures}",
            f"After canonicalization (A) : {self.distinct_after_canonicalization}"
            f" (−{merged_a} merged)",
            f"After content dedup (C) : {self.distinct_after_content_dedup}"
            f" (−{merged_c} byte-identical)",
            "",
            f"Canonicalization clusters : {len(self.canonicalization_clusters)}",
            f"Identical-content clusters : {len(self.content_clusters)}",
            f"Near-dup candidates (D) : {len(self.near_dup_clusters)}"
            " (same host+path, differing query)",
        ]

        def _show(title: str, clusters: list[Cluster], limit: int = 5) -> None:
            # Append the ``limit`` largest clusters, four URLs each at most.
            if not clusters:
                return
            lines.append("")
            lines.append(f"--- top {title} ---")
            for c in sorted(clusters, key=lambda x: len(x.urls), reverse=True)[:limit]:
                lines.append(f" [{len(c.urls)}] {c.key}")
                for u in c.urls[:4]:
                    lines.append(f" {u}")
                if len(c.urls) > 4:
                    lines.append(f" … +{len(c.urls) - 4} more")

        _show("canonicalization clusters", self.canonicalization_clusters)
        _show("identical-content clusters", self.content_clusters)
        _show("near-dup candidates (Phase D signal)", self.near_dup_clusters)
        return "\n".join(lines)


def _host_path(url: str) -> str:
    """Return ``scheme://host/path`` of the canonicalized URL (query dropped)."""
    parts = urlsplit(canonicalize_url(url))
    return f"{parts.scheme}://{parts.netloc}{parts.path}"


def iter_url_indexes(store: BlobStore, prefix: str):
    """Yield ``(key, index_dict)`` for each per-URL index under ``prefix``.

    Skips the reverse content index under ``index/by-content/``, keys that do
    not end in ``.json``, blobs that fail to decode or parse, and JSON values
    that are not objects (callers rely on ``dict.get``).
    """
    index_prefix = f"{prefix.rstrip('/')}/index/"
    content_sub = f"{index_prefix}by-content/"
    for key in store.list_keys(index_prefix):
        if not key.endswith(".json") or key.startswith(content_sub):
            continue
        try:
            index = json.loads(store.get(key).decode("utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError):
            continue
        if not isinstance(index, dict):
            continue
        yield key, index


def analyze(store: BlobStore, config: ArchiveConfig) -> AnalysisReport:
    """Audit the per-URL indexes in ``store`` without modifying anything.

    Alias indexes are counted and otherwise ignored; indexes without a URL or
    without captures are skipped. The remaining URLs are examined three ways:

    - canonicalization: raw URLs grouped by :func:`canonicalize_url`;
    - content: *canonical* URLs grouped by ``latest_content_hash``, so raw
      variants already merged by canonicalization are not counted a second
      time as content merges;
    - near-dup signal: canonical URLs grouped by ``scheme://host/path``.
    """
    report = AnalysisReport()
    by_canonical: dict[str, list[str]] = defaultdict(list)
    by_content: dict[str, set[str]] = defaultdict(set)

    for _key, index in iter_url_indexes(store, config.s3_prefix):
        report.total_url_indexes += 1
        if index.get("alias_of"):
            report.alias_indexes += 1
            continue
        url = index.get("url")
        if not url or not index.get("captures"):
            continue
        report.canonical_captures += 1
        canonical = canonicalize_url(url)
        by_canonical[canonical].append(url)
        ch = index.get("latest_content_hash")
        if ch:
            by_content[ch].add(canonical)

    report.distinct_after_canonicalization = len(by_canonical)
    report.canonicalization_clusters = [
        Cluster(key=k, urls=sorted(set(v)))
        for k, v in by_canonical.items()
        if len(set(v)) > 1
    ]

    # Content dedup operates on the canonicalized URL set.
    content_groups = {k: sorted(v) for k, v in by_content.items()}
    report.content_clusters = [
        Cluster(key=k, urls=v) for k, v in content_groups.items() if len(v) > 1
    ]
    # distinct pages after content dedup = canonical URLs minus those merged away
    merged_by_content = sum(len(v) - 1 for v in content_groups.values() if len(v) > 1)
    report.distinct_after_content_dedup = max(
        0, report.distinct_after_canonicalization - merged_by_content
    )

    # Phase D signal: among canonical URLs, same host+path but differing query.
    by_host_path: dict[str, set[str]] = defaultdict(set)
    for canonical in by_canonical:
        by_host_path[_host_path(canonical)].add(canonical)
    report.near_dup_clusters = [
        Cluster(key=k, urls=sorted(v)) for k, v in by_host_path.items() if len(v) > 1
    ]
    return report


def rebuild_content_index(
    store: BlobStore, config: ArchiveConfig, *, apply: bool
) -> int:
    """(Re)build ``index/by-content/`` from existing captures.

    Groups non-alias indexes that have captures by ``latest_content_hash``.
    With ``apply=False`` nothing is written. With ``apply=True`` every member
    of each group is registered through ``ContentStore._register_content``;
    the first member seen becomes the owner and is the only one registered
    with blob keys.

    Returns the number of content groups (that would be) written.
    """
    cstore = ContentStore(store, config)
    groups: dict[str, list[tuple[str, str]]] = defaultdict(list)
    for _key, index in iter_url_indexes(store, config.s3_prefix):
        if index.get("alias_of") or not index.get("captures"):
            continue
        uh = index.get("url_hash")
        url = index.get("url")
        ch = index.get("latest_content_hash")
        if uh and url and ch:
            groups[ch].append((uh, url))

    written = 0
    for ch, members in groups.items():
        written += 1
        if not apply:
            continue
        owner_uh = members[0][0]
        # Re-register every member; the first becomes canonical owner.
        for uh, url in members:
            blob_keys = (
                index_blob_keys(store, config, owner_uh, ch) if uh == owner_uh else None
            )
            cstore._register_content(ch, uh, url, blob_keys)
    return written


def index_blob_keys(
    store: BlobStore, config: ArchiveConfig, uh: str, ch: str
) -> dict | None:
    """Return the blob keys recorded for capture ``ch`` of URL hash ``uh``.

    The result maps ``html`` / ``markdown`` / ``screenshot`` to the stored key
    (or ``None``). Returns ``None`` when the index or that capture is missing.
    """
    cstore = ContentStore(store, config)
    index = cstore._read_index(uh)
    if not index:
        return None
    cap = (index.get("captures") or {}).get(ch)
    if not cap:
        return None
    return {
        "html": cap.get("html_key"),
        "markdown": cap.get("markdown_key"),
        "screenshot": cap.get("screenshot_key"),
    }


def _build_store(local_dir: str | None, bucket: str | None, config: ArchiveConfig):
    """Return a local store for ``local_dir``, else an S3 store.

    The bucket is ``bucket`` or, failing that, ``config.s3_bucket``; the
    process exits with a message when neither is set. Storage backends are
    imported lazily, only for the branch actually taken.
    """
    if local_dir:
        from forecasting_tools.agents_and_tools.source_archive.storage import (
            LocalBlobStore,
        )

        return LocalBlobStore(local_dir)
    bucket = bucket or config.s3_bucket
    if not bucket:
        sys.exit(
            "No S3 bucket configured. Set WEB_ARCHIVE_S3_BUCKET (or pass --bucket), "
            "or use --local DIR."
        )
    from forecasting_tools.agents_and_tools.source_archive.storage import S3BlobStore

    return S3BlobStore(bucket, config=config)


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: print the audit, optionally rebuild the content index.

    Returns the process exit code (always 0 when it returns normally).
    """
    parser = argparse.ArgumentParser(
        prog="source-archive-reindex",
        description="Audit (and optionally rebuild) dedup structures for an "
        "existing archive.",
    )
    parser.add_argument("--local", metavar="DIR", help="audit a local capture dir")
    parser.add_argument("--bucket", help="override WEB_ARCHIVE_S3_BUCKET")
    parser.add_argument(
        "--apply",
        action="store_true",
        help="rebuild index/by-content/ for existing captures (additive)",
    )
    parser.add_argument("--json", action="store_true", help="emit the report as JSON")
    args = parser.parse_args(argv)

    config = ArchiveConfig.from_env()
    store = _build_store(args.local, args.bucket, config)

    report = analyze(store, config)
    if args.json:
        print(report.model_dump_json(indent=2))
    else:
        print(report)

    if args.apply:
        n = rebuild_content_index(store, config, apply=True)
        print(f"\nRebuilt index/by-content/ for {n} content group(s).")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
+ +Without persisted run outcomes the two are indistinguishable. Writing each run's +outcomes here lets coverage tell them apart. +""" + +from __future__ import annotations + +import json + +from forecasting_tools.agents_and_tools.source_archive.canonicalize import ( + canonicalize_url, +) +from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig +from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import ( + BlobStore, +) + +CAPTURED_STATUSES = {"stored", "deduped", "cache_hit"} +FAILED_STATUSES = {"quality_failed", "error"} + + +def report_key(run_id: str, config: ArchiveConfig) -> str: + return f"{config.s3_prefix.rstrip('/')}/reports/{run_id}.json" + + +def write_run_report( + store: BlobStore, run_id: str, summary, config: ArchiveConfig +) -> str: + """Persist a run's per-URL outcomes; ``summary`` is a ``PipelineSummary``.""" + rows = [ + {"url": o.url, "status": o.status, "reason": getattr(o, "reason", "")} + for o in summary.outcomes + ] + key = report_key(run_id, config) + store.put( + key, json.dumps(rows, indent=2).encode("utf-8"), content_type="application/json" + ) + return key + + +def read_outcomes(store: BlobStore, config: ArchiveConfig) -> dict[str, str]: + """Map canonical URL -> last known capture status across all run reports. + + A captured status wins over a failed one (if we ever succeeded, that's the + truth). Returns ``{}`` if no reports exist yet. 
+ """ + prefix = config.s3_prefix.rstrip("/") + out: dict[str, str] = {} + for key in store.list_keys(f"{prefix}/reports/"): + if not key.endswith(".json"): + continue + try: + rows = json.loads(store.get(key).decode("utf-8")) + except (UnicodeDecodeError, ValueError): + continue + for r in rows: + url = canonicalize_url(r.get("url", "")) + status = r.get("status", "") + if not url: + continue + if url not in out or status in CAPTURED_STATUSES: + out[url] = status + return out diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py index c70d676f..7553c972 100644 --- a/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py +++ b/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py @@ -6,6 +6,7 @@ from __future__ import annotations +from collections.abc import Iterable from typing import Protocol, runtime_checkable @@ -18,3 +19,7 @@ def put( def get(self, key: str) -> bytes: ... def exists(self, key: str) -> bool: ... + + def list_keys(self, prefix: str = "") -> Iterable[str]: + """Yield every stored key beginning with ``prefix`` (for reindex/audit).""" + ... 
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py index 429333ab..d85b0b0b 100644 --- a/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py +++ b/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py @@ -22,3 +22,13 @@ def get(self, key: str) -> bytes: def exists(self, key: str) -> bool: return self._path(key).exists() + + def list_keys(self, prefix: str = "") -> list[str]: + if not self.root.exists(): + return [] + keys = [ + p.relative_to(self.root).as_posix() + for p in self.root.rglob("*") + if p.is_file() + ] + return sorted(k for k in keys if k.startswith(prefix)) diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py index 0d4822b0..10914b94 100644 --- a/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py +++ b/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py @@ -58,3 +58,9 @@ def exists(self, key: str) -> bool: if code in ("404", "NoSuchKey", "NotFound"): return False raise + + def list_keys(self, prefix: str = ""): + paginator = self._get_client().get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=self.bucket, Prefix=prefix): + for obj in page.get("Contents", []): + yield obj["Key"] diff --git a/forecasting_tools/agents_and_tools/source_archive/viewer.py b/forecasting_tools/agents_and_tools/source_archive/viewer.py new file mode 100644 index 00000000..fe604a2b --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/viewer.py @@ -0,0 +1,409 @@ +"""Streamlit viewer for the source archive. + +Browse what the capture pipeline stored in S3: pick a captured URL and see its +**screenshot, markdown, and HTML** side by side, with the question/bot it came +from. 
Reads provenance from the run manifests and resolves each URL's latest +capture through its per-URL index — no local file wrangling. + +Run it:: + + # uses the same env as the rest of the archive (WEB_ARCHIVE_S3_BUCKET, etc.) + AWS_PROFILE=default WEB_ARCHIVE_S3_BUCKET=metaculus-web-archive \\ + streamlit run forecasting_tools/agents_and_tools/source_archive/viewer.py + +Nothing here is deployment-specific: bucket/prefix/profile come from +``ArchiveConfig.from_env()``. +""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path + +# `streamlit run ` puts only the script's own directory on sys.path, not +# the repo root — so make `import forecasting_tools` work whether the package is +# pip-installed or just checked out. (viewer.py -> source_archive -> agents_and_tools +# -> forecasting_tools -> .) +_REPO_ROOT = Path(__file__).resolve().parents[3] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +import pandas as pd # noqa: E402 +import streamlit as st # noqa: E402 + +from forecasting_tools.agents_and_tools.source_archive.config import ( # noqa: E402 + ArchiveConfig, +) +from forecasting_tools.agents_and_tools.source_archive.models import ( # noqa: E402 + url_hash, +) + +# --- S3 access (cached) ---------------------------------------------------- + + +@st.cache_resource(show_spinner=False) +def _client(profile: str | None, region: str | None): + import boto3 + + return boto3.Session( + profile_name=profile or None, region_name=region or None + ).client("s3") + + +def _cfg() -> ArchiveConfig: + return ArchiveConfig.from_env() + + +@st.cache_data(show_spinner=False) +def _list_keys(bucket: str, prefix: str) -> list[str]: + cfg = _cfg() + if cfg.local_dir: # filesystem-backed archive — list matching files + root = Path(cfg.local_dir) + if not root.exists(): + return [] + return [ + p.relative_to(root).as_posix() + for p in root.rglob("*") + if p.is_file() and 
p.relative_to(root).as_posix().startswith(prefix) + ] + s3 = _client(cfg.aws_profile, cfg.aws_region) + keys: list[str] = [] + token = None + while True: + kw = {"Bucket": bucket, "Prefix": prefix} + if token: + kw["ContinuationToken"] = token + resp = s3.list_objects_v2(**kw) + keys.extend(o["Key"] for o in resp.get("Contents", [])) + if not resp.get("IsTruncated"): + break + token = resp.get("NextContinuationToken") + return keys + + +@st.cache_data(show_spinner=False) +def _get_bytes(bucket: str, key: str) -> bytes | None: + cfg = _cfg() + if cfg.local_dir: + p = Path(cfg.local_dir) / key + return p.read_bytes() if p.exists() else None + s3 = _client(cfg.aws_profile, cfg.aws_region) + try: + return s3.get_object(Bucket=bucket, Key=key)["Body"].read() + except Exception: + return None + + +# Metaculus question id -> review URL. Derived at display time (not stored) so +# there's no redundant, drift-prone URL column in S3. +_METACULUS_QUESTION_BASE = "https://www.metaculus.com/questions/" + + +def _metaculus_url(metaculus_id) -> str: + if metaculus_id in (None, "", "null"): + return "" + return f"{_METACULUS_QUESTION_BASE}{metaculus_id}/" + + +def _comment_url(metaculus_id, comment_id) -> str: + """Deep-link to the specific comment the URL was cited in.""" + base = _metaculus_url(metaculus_id) + if not base or comment_id in (None, "", "null"): + return "" + return f"{base}#comment-{comment_id}" + + +@st.cache_data(show_spinner="Loading manifests…") +def _manifest_rows(bucket: str, prefix: str) -> pd.DataFrame: + """Every (question, bot, url) the bots cited, from the run manifests.""" + rows = [] + for key in _list_keys(bucket, f"{prefix}/manifests/"): + body = _get_bytes(bucket, key) + if not body: + continue + for line in body.decode("utf-8").splitlines(): + line = line.strip() + if not line: + continue + r = json.loads(line) + rows.append( + { + "question": r.get("question_id") or "(none)", + "bot": r.get("bot") or "(none)", + "run_id": r.get("run_id") or "", + 
"origin": r.get("origin") or "", + "query": r.get("query") or "", + "metaculus": _metaculus_url(r.get("metaculus_id")), + "comment": _comment_url(r.get("metaculus_id"), r.get("comment_id")), + "url": r.get("url", ""), + "question_url": r.get("question_url") or "", + "tool_args": r.get("tool_args"), + } + ) + df = pd.DataFrame(rows) + if not df.empty: + # Keep distinct provenance (a URL cited via two origins/runs = two rows). + df = df.drop_duplicates( + subset=["question", "bot", "run_id", "origin", "url"] + ).reset_index(drop=True) + return df + + +def _scrape_report(bucket: str, prefix: str, view: pd.DataFrame): + """Per-question scraping cost: which backend captured each URL. + + Self-hosted Playwright is free; Firecrawl (the fallback) costs ~1 credit per + page and is what actually accrues spend once a key is configured. We classify + each *stored* capture by ``fetcher`` and count Firecrawl pages per question. + + Caveat: only successful captures are recorded in the index, so a Firecrawl + attempt that failed its quality gate isn't counted here — billed attempts + aren't yet persisted (see the note in the UI). 
+ """ + per_q: dict[str, dict] = {} + for _, row in view.iterrows(): + cap = _index(bucket, prefix, row["url"]) + q = row["question"] + agg = per_q.setdefault( + q, + { + "question": q, + "urls": 0, + "captured": 0, + "playwright": 0, + "firecrawl": 0, + "other": 0, + }, + ) + agg["urls"] += 1 + if not cap: + continue + agg["captured"] += 1 + fetcher = (cap.get("fetcher") or "").lower() + if fetcher in ("playwright", "firecrawl"): + agg[fetcher] += 1 + else: + agg["other"] += 1 + return per_q + + +@st.cache_data(show_spinner=False) +def _index(bucket: str, prefix: str, url: str) -> dict | None: + """Latest stored capture for a URL (keys + metadata), or None if uncaptured.""" + body = _get_bytes(bucket, f"{prefix}/index/{url_hash(url)}.json") + if not body: + return None + idx = json.loads(body.decode("utf-8")) + ch = idx.get("latest_content_hash") + cap = (idx.get("captures") or {}).get(ch) + return cap + + +# --- UI -------------------------------------------------------------------- + + +def main() -> None: + st.set_page_config(page_title="Source Archive Viewer", layout="wide") + cfg = _cfg() + st.title("📚 Source Archive Viewer") + + location = cfg.local_dir or cfg.s3_bucket + if not location: + st.error( + "No archive configured. Set WEB_ARCHIVE_LOCAL_DIR (a local capture " + "directory) or WEB_ARCHIVE_S3_BUCKET (S3), then reload." + ) + st.stop() + if cfg.local_dir: + st.caption(f"📂 local: {cfg.local_dir}/{cfg.s3_prefix}") + else: + st.caption( + f"s3://{cfg.s3_bucket}/{cfg.s3_prefix} · " + f"profile={cfg.aws_profile or 'default'}" + ) + + with st.sidebar: + st.header("Filters") + if st.button("🔄 Refresh"): + st.cache_data.clear() + st.rerun() + + df = _manifest_rows(location, cfg.s3_prefix) + if df.empty: + st.warning("No manifests found under this prefix yet. 
Run a capture first.") + st.stop() + + with st.sidebar: + bots = sorted(df["bot"].unique()) + qs = sorted(df["question"].unique()) + sel_bots = st.multiselect("Bot", bots, default=bots) + sel_qs = st.multiselect("Question", qs, default=qs) + search = st.text_input("URL contains") + + view = df[df["bot"].isin(sel_bots) & df["question"].isin(sel_qs)] + if search: + view = view[view["url"].str.contains(search, case=False, na=False)] + view = view.reset_index(drop=True) + + st.subheader(f"{len(view)} cited URL(s)") + + # Resolve capture status for the filtered rows (cached per-URL). + if len(view) > 300: + st.info( + "Showing 300 of %d — narrow with the filters for capture details." + % len(view) + ) + table = [] + for _, row in view.head(300).iterrows(): + cap = _index(location, cfg.s3_prefix, row["url"]) + table.append( + { + "question": row["question"], + "bot": row["bot"], + "run_id": row["run_id"], + "origin": row["origin"], + "captured": "✅" if cap else "—", + "fetcher": (cap or {}).get("fetcher", ""), + "captured_at": (cap or {}).get("captured_at", "")[:19], + "metaculus": row["metaculus"], + "comment": row["comment"], + "url": row["url"], + } + ) + st.dataframe( + pd.DataFrame(table), + use_container_width=True, + hide_index=True, + column_config={ + # Show the full link address as the clickable text (not a label). + "url": st.column_config.LinkColumn("url"), + "metaculus": st.column_config.LinkColumn( + "metaculus", display_text="question ↗" + ), + "comment": st.column_config.LinkColumn("comment", display_text="comment ↗"), + }, + ) + + if st.sidebar.checkbox("💸 Show scraping cost"): + st.subheader("💸 Scraping cost (filtered set)") + rate = st.number_input( + "Firecrawl cost per page ($)", + min_value=0.0, + value=0.001, + step=0.0005, + format="%.4f", + help="Self-hosted Playwright is free; this prices the Firecrawl " + "fallback. 
Adjust to your plan's credit rate.", + ) + per_q = _scrape_report(location, cfg.s3_prefix, view.head(300)) + rows, t_fc, t_pw, t_cap, t_url = [], 0, 0, 0, 0 + for agg in sorted(per_q.values(), key=lambda a: a["question"]): + rows.append( + { + "question": agg["question"], + "urls": agg["urls"], + "captured": agg["captured"], + "playwright (free)": agg["playwright"], + "firecrawl (paid)": agg["firecrawl"], + "firecrawl $": round(agg["firecrawl"] * rate, 4), + } + ) + t_fc += agg["firecrawl"] + t_pw += agg["playwright"] + t_cap += agg["captured"] + t_url += agg["urls"] + st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True) + a, b, c = st.columns(3) + a.metric("Captured", f"{t_cap}/{t_url}") + b.metric("Firecrawl pages", t_fc, help="Playwright pages are free") + c.metric("Est. Firecrawl cost", f"${t_fc * rate:.4f}") + st.caption( + f"Playwright (free): {t_pw} · Firecrawl (paid): {t_fc}. " + "⚠️ Only **successful** captures carry a fetcher in the index, so " + "Firecrawl attempts that failed the quality gate aren't counted — " + "billed-attempt tracking needs the pipeline to persist fetch attempts." 
+ ) + + st.divider() + st.subheader("Inspect a capture") + labels = [f"[{r['question']}] {r['url']}" for _, r in view.iterrows()] + if not labels: + st.stop() + choice = st.selectbox("URL", range(len(labels)), format_func=lambda i: labels[i]) + row = view.iloc[choice] + url = row["url"] + cap = _index(location, cfg.s3_prefix, url) + + c1, c2 = st.columns([3, 2]) + with c1: + st.markdown(f"**URL:** [{url}]({url})") + st.markdown( + f"**Question:** `{row['question']}` · **Bot:** `{row['bot']}` · " + f"**Origin:** `{row['origin'] or '—'}`" + ) + st.markdown(f"**Run:** `{row['run_id'] or '—'}`") + review = row["metaculus"] or row["question_url"] + if review: + st.markdown(f"**Metaculus question:** [{review}]({review})") + if row["comment"]: + st.markdown(f"**Cited in comment:** [{row['comment']}]({row['comment']})") + if row["query"]: + st.markdown(f"**Search query:** `{row['query']}`") + if row.get("tool_args"): + st.markdown(f"**Tool args:** `{row['tool_args']}`") + with c2: + if cap: + st.markdown( + f"**Captured:** {cap.get('captured_at','')[:19]} · " + f"**Fetcher:** {cap.get('fetcher','')} · " + f"**HTTP:** {cap.get('status_code','?')}" + ) + + if not cap: + st.warning( + "No stored capture for this URL — it failed the quality gate / errored, " + "or hasn't been captured yet." 
+ ) + st.stop() + + tab_shot, tab_md, tab_html = st.tabs(["🖼 Screenshot", "📝 Markdown", "🌐 HTML"]) + + with tab_shot: + key = cap.get("screenshot_key") + data = _get_bytes(location, key) if key else None + if data: + st.download_button("Download .webp", data, file_name="screenshot.webp") + st.image(data, use_container_width=True) + else: + st.info("No screenshot stored.") + + with tab_md: + key = cap.get("markdown_key") + data = _get_bytes(location, key) if key else None + if data: + text = data.decode("utf-8", "replace") + st.download_button("Download .md", data, file_name="page.md") + st.caption(f"{len(text):,} chars") + st.markdown(text) + else: + st.info("No markdown stored.") + + with tab_html: + key = cap.get("html_key") + data = _get_bytes(location, key) if key else None + if data: + html = data.decode("utf-8", "replace") + st.download_button("Download .html", data, file_name="page.html") + st.caption( + f"{len(html):,} chars · rendered below (CSS/images load from the " + "original site and may not all resolve — the screenshot is the " + "faithful visual record)." + ) + st.components.v1.html(html, height=800, scrolling=True) + else: + st.info("No HTML stored.") + + +if __name__ == "__main__": + main() diff --git a/poetry.lock b/poetry.lock index c0fcff5e..f741fa95 100644 --- a/poetry.lock +++ b/poetry.lock @@ -867,6 +867,29 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = "cloakbrowser" +version = "0.3.32" +description = "Stealth Chromium that passes every bot detection test. Drop-in Playwright replacement with source-level fingerprint patches." 
+optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "cloakbrowser-0.3.32-py3-none-any.whl", hash = "sha256:5a993ee019bfcd00d545d7d6d51837646bcb1e8226545acdf0b543b38a8883df"}, + {file = "cloakbrowser-0.3.32.tar.gz", hash = "sha256:7361e2f5e366f651b5d54aad3ac13e145462110e0956b538ae3686916c36535a"}, +] + +[package.dependencies] +httpx = ">=0.24" +playwright = ">=1.40" + +[package.extras] +dev = ["pytest (>=7.0)", "pytest-asyncio (>=0.23)"] +geoip = ["geoip2 (>=4.0)", "socksio (>=1.0)"] +patchright = ["patchright (>=1.40)"] +serve = ["aiohttp (>=3.9)", "websockets (>=12.0)"] + [[package]] name = "colorama" version = "0.4.6" @@ -5371,6 +5394,46 @@ dev = ["coverage[toml] (==7.10.7)", "cryptography (>=3.4.0)", "pre-commit", "pyt docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] tests = ["coverage[toml] (==7.10.7)", "pytest (>=8.4.2,<9.0.0)"] +[[package]] +name = "pymupdf" +version = "1.27.2.3" +description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents." 
+optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "pymupdf-1.27.2.3-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fc1bc3cae6e9e150b0dbb0a9221bdfd411d65f0db2fe359eaa22467d7cc2a05f"}, + {file = "pymupdf-1.27.2.3-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:660d93cb6da5bbddf11d3982ae27745dd3a9902d9f24cdb69adab83962294b5a"}, + {file = "pymupdf-1.27.2.3-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1dd460a3ae4597a755f00a3bd9771f5ebf1531dc111f6a36bf05dd00a6b84425"}, + {file = "pymupdf-1.27.2.3-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:857842b4888827bd6155a1131341b2822a7ebe9a8c15a975fd7d490d7a64a30c"}, + {file = "pymupdf-1.27.2.3-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:580983849c64a08d08344ca3d1580e87c01f046a8392421797bc850efd72a5b6"}, + {file = "pymupdf-1.27.2.3-cp310-abi3-win32.whl", hash = "sha256:a5c1088a87189891a4946ab314a14b7934ac4c5b6077f7e74ebee956f8906d0e"}, + {file = "pymupdf-1.27.2.3-cp310-abi3-win_amd64.whl", hash = "sha256:d20f68ef15195e073071dbc4ae7455257c7889af7584e39df490c0a92728526e"}, + {file = "pymupdf-1.27.2.3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:77691604c5d1d0233827139bbcdea61fd57879c84712b8e49b1f45520f7ab9c2"}, + {file = "pymupdf-1.27.2.3.tar.gz", hash = "sha256:7a92faa25129e8bbec5e50eeb9214f187665428c31b05c4ef6e36c58c0b1c6d2"}, +] + +[[package]] +name = "pymupdf4llm" +version = "0.3.4" +description = "PyMuPDF Utilities for LLM/RAG" +optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "pymupdf4llm-0.3.4-py3-none-any.whl", hash = "sha256:0517492f82af978541162ade20fc54649cdca52acd478e33b97cb6171d69956f"}, + {file = "pymupdf4llm-0.3.4.tar.gz", hash = "sha256:48d396a5fb3c14351493c7f1dd25b2a843efdbdc4526e489ee100643a2cebec1"}, +] + +[package.dependencies] +pymupdf = ">=1.27.1" +tabulate = "*" + +[package.extras] +layout = ["pymupdf-layout 
(>=1.27.1)"] + [[package]] name = "pyparsing" version = "3.3.2" @@ -6659,6 +6722,22 @@ typepy = ">=1.2.0,<3" logging = ["loguru (>=0.4.1,<1)"] test = ["pytablewriter (>=0.46)", "pytest"] +[[package]] +name = "tabulate" +version = "0.10.0" +description = "Pretty-print tabular data" +optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "tabulate-0.10.0-py3-none-any.whl", hash = "sha256:f0b0622e567335c8fabaaa659f1b33bcb6ddfe2e496071b743aa113f8774f2d3"}, + {file = "tabulate-0.10.0.tar.gz", hash = "sha256:e2cfde8f79420f6deeffdeda9aaec3b6bc5abce947655d17ac662b126e48a60d"}, +] + +[package.extras] +widechars = ["wcwidth"] + [[package]] name = "tcolorpy" version = "0.1.7" @@ -7818,9 +7897,9 @@ test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more_it type = ["pytest-mypy (>=1.0.1) ; platform_python_implementation != \"PyPy\""] [extras] -source-archive = ["boto3", "firecrawl-py", "playwright", "trafilatura"] +source-archive = ["boto3", "cloakbrowser", "firecrawl-py", "playwright", "pymupdf4llm", "trafilatura"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "2c075213be57a94057cbb6ba934e4b0ea8b0df91d052739d2313f6d893a50c0e" +content-hash = "d9abd6c9194bdd4769704c8c60f48f438f9d77370b35ee739555d3b9fd3e5e22" diff --git a/pyproject.toml b/pyproject.toml index d15ad580..c8b322f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,9 +57,16 @@ boto3 = {version = ">=1.34,<2.0.0", optional = true} playwright = {version = ">=1.44,<2.0.0", optional = true} firecrawl-py = {version = ">=4.0,<5.0.0", optional = true} trafilatura = {version = ">=1.9,<3.0.0", optional = true} +pymupdf4llm = {version = ">=0.0.17,<1.0.0", optional = true} +# Self-hosted anti-bot backend (CloakBrowser). Pinned tight to 0.3.x: it's a +# young, fast-moving 0.x package whose launch() API changed recently, so bump +# the minor deliberately. 
The pip wheel is light (httpx + playwright); the +# ~200MB patched Chromium downloads at first launch, not at install. +cloakbrowser = {version = ">=0.3.31,<0.4.0", optional = true} [tool.poetry.extras] -source-archive = ["boto3", "playwright", "firecrawl-py", "trafilatura"] +# hyperbrowser is already a core dep (used elsewhere too). +source-archive = ["boto3", "playwright", "firecrawl-py", "trafilatura", "pymupdf4llm", "cloakbrowser"] [tool.poetry.scripts] source-archive = "forecasting_tools.agents_and_tools.source_archive.cli:main"