From 2d202c340709d07382ce8ce0a4b082f80426b96a Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Thu, 25 Jun 2026 17:01:36 +0000 Subject: [PATCH 1/3] Add 6 public-baseline forecasting bots Adds agentic PydanticAI bots that estimate what a sampled group of people would forecast (public sentiment, expert opinion, credible news outlets, left, center, and right), as public baselines to compare against the Community Prediction. Each bot is wrapped in the ForecastBot interface, searches for evidence of its group's views, and reports the sampled sources and their implied forecasts in the final comment. Wires the bots into run_bots.py, bot_lists.py, and the AIB tournament workflow, and adds pydantic-ai-slim as a dependency plus unit tests. Co-authored-by: Cursor --- .github/workflows/run-bot-aib-tournament.yaml | 89 +++ .../test_population_baseline_bot.py | 149 +++++ forecasting_tools/forecast_bots/bot_lists.py | 24 + .../public_baselines/__init__.py | 44 ++ .../public_baselines/center_leaning_bot.py | 39 ++ .../public_baselines/credible_news_bot.py | 44 ++ .../public_baselines/expert_opinion_bot.py | 43 ++ .../public_baselines/left_leaning_bot.py | 40 ++ .../population_baseline_bot.py | 577 ++++++++++++++++++ .../public_baselines/public_sentiment_bot.py | 44 ++ .../public_baselines/right_leaning_bot.py | 41 ++ poetry.lock | 180 +++++- pyproject.toml | 1 + run_bots.py | 92 +++ 14 files changed, 1405 insertions(+), 2 deletions(-) create mode 100644 code_tests/unit_tests/test_forecast_bots/test_population_baseline_bot.py create mode 100644 forecasting_tools/forecast_bots/public_baselines/__init__.py create mode 100644 forecasting_tools/forecast_bots/public_baselines/center_leaning_bot.py create mode 100644 forecasting_tools/forecast_bots/public_baselines/credible_news_bot.py create mode 100644 forecasting_tools/forecast_bots/public_baselines/expert_opinion_bot.py create mode 100644 forecasting_tools/forecast_bots/public_baselines/left_leaning_bot.py create mode 100644 forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py create mode 100644 forecasting_tools/forecast_bots/public_baselines/public_sentiment_bot.py create mode 100644 forecasting_tools/forecast_bots/public_baselines/right_leaning_bot.py diff --git a/.github/workflows/run-bot-aib-tournament.yaml b/.github/workflows/run-bot-aib-tournament.yaml index f688810c..b5c40322 100644 --- a/.github/workflows/run-bot-aib-tournament.yaml +++ b/.github/workflows/run-bot-aib-tournament.yaml @@ -86,6 +86,95 @@ jobs: # NOTE: don't remove any of the open source models, since these are the best option for a long term baseline (other models get deprecated) + #################################### Public-baseline bots #################################### + # These agentic bots estimate what a sampled group of people would forecast + # (public, experts, credible news, left, center, right), as public baselines + # to compare against the Community Prediction. + + bot_public_sentiment_baseline: + needs: precache_asknews + uses: ./.github/workflows/run-bot-launcher.yaml + with: + bot_name: "METAC_PUBLIC_SENTIMENT_BASELINE" + metac_name: "metac-public-sentiment-baseline" + cache_key: asknews-cache-${{ github.run_id }} + secrets: + INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} + INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} + INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} + INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} + + bot_expert_opinion_baseline: + needs: precache_asknews + uses: ./.github/workflows/run-bot-launcher.yaml + with: + bot_name: "METAC_EXPERT_OPINION_BASELINE" + metac_name: "metac-expert-opinion-baseline" + cache_key: asknews-cache-${{ github.run_id }} + secrets: + INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} + INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} + INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} + INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} + + bot_credible_news_baseline: + needs: precache_asknews + uses: ./.github/workflows/run-bot-launcher.yaml + with: + bot_name: "METAC_CREDIBLE_NEWS_BASELINE" + metac_name: "metac-credible-news-baseline" + cache_key: asknews-cache-${{ github.run_id }} + secrets: + INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} + INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} + INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} + INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} + + bot_left_leaning_baseline: + needs: precache_asknews + uses: ./.github/workflows/run-bot-launcher.yaml + with: + bot_name: "METAC_LEFT_LEANING_BASELINE" + metac_name: "metac-left-leaning-baseline" + cache_key: asknews-cache-${{ github.run_id }} + secrets: + INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} + INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} + INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} + INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} + + bot_center_leaning_baseline: + needs: precache_asknews + uses: ./.github/workflows/run-bot-launcher.yaml + with: + bot_name: "METAC_CENTER_LEANING_BASELINE" + metac_name: "metac-center-leaning-baseline" + cache_key: asknews-cache-${{ github.run_id }} + secrets: + INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} + INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} + INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} + INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} + + bot_right_leaning_baseline: + needs: precache_asknews + uses: ./.github/workflows/run-bot-launcher.yaml + with: + bot_name: "METAC_RIGHT_LEANING_BASELINE" + metac_name: "metac-right-leaning-baseline" + cache_key: asknews-cache-${{ github.run_id }} + secrets: + INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} + INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} + INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} + INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} + #################################### No-research one-shot bots #################################### bot_gpt_5_5_no_research_one_shot: diff --git a/code_tests/unit_tests/test_forecast_bots/test_population_baseline_bot.py b/code_tests/unit_tests/test_forecast_bots/test_population_baseline_bot.py new file mode 100644 index 00000000..13c49372 --- /dev/null +++ b/code_tests/unit_tests/test_forecast_bots/test_population_baseline_bot.py @@ -0,0 +1,149 @@ +from types import SimpleNamespace + +from forecasting_tools.data_models.multiple_choice_report import PredictedOptionList +from forecasting_tools.data_models.numeric_report import NumericDistribution +from forecasting_tools.data_models.questions import ( + BinaryQuestion, + MultipleChoiceQuestion, + NumericQuestion, +) +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + BinaryPopulationForecast, + DiscoveredSource, + MultipleChoicePopulationForecast, + NumericPercentile, + NumericPopulationForecast, + OptionProbability, +) +from forecasting_tools.forecast_bots.public_baselines.public_sentiment_bot import ( + PublicSentimentBaselineBot, +) + + +class _FakeAgent: + def __init__(self, output: object) -> None: + self._output = output + + async def run(self, prompt: str, usage_limits: object = None) -> SimpleNamespace: + return SimpleNamespace(output=self._output) + + +def _make_bot_returning(output: object) -> PublicSentimentBaselineBot: + bot = PublicSentimentBaselineBot() + bot._build_agent = lambda output_type: _FakeAgent(output) # type: ignore + return bot + + +def _sample_sources() -> list[DiscoveredSource]: + return [ + DiscoveredSource( + name="YouGov poll (Jun 2026)", + represents="US adults", + url="https://example.com/poll", + implied_forecast="~60% expect yes", + confidence="medium", + note="Topline support translated to probability.", + ) + ] + + +async def test_binary_forecast_converts_and_clamps() -> None: + forecast = BinaryPopulationForecast( + population_summary="The public leans yes.", + sources=_sample_sources(), + aggregate_probability=0.995, + aggregate_rationale="Weighted toward the poll.", + ) + bot = _make_bot_returning(forecast) + question = BinaryQuestion(question_text="Will it happen?") + + prediction = await bot._run_forecast_on_binary(question, "") + + assert prediction.prediction_value == 0.99 + assert "YouGov poll" in prediction.reasoning + assert "Sources sampled" in prediction.reasoning + + +async def test_multiple_choice_maps_and_normalizes_options() -> None: + forecast = MultipleChoicePopulationForecast( + population_summary="Split opinion.", + sources=_sample_sources(), + option_probabilities=[ + OptionProbability(option_name="Option A", probability=0.6), + OptionProbability(option_name="Option B", probability=0.6), + ], + aggregate_rationale="Most lean A or B.", + ) + bot = _make_bot_returning(forecast) + question = MultipleChoiceQuestion( + question_text="Which option?", + options=["Option A", "Option B", "Option C"], + ) + + prediction = await bot._run_forecast_on_multiple_choice(question, "") + + assert isinstance(prediction.prediction_value, PredictedOptionList) + option_names = [ + option.option_name for option in prediction.prediction_value.predicted_options + ] + assert option_names == ["Option A", "Option B", "Option C"] + total = sum( + option.probability for option in prediction.prediction_value.predicted_options + ) + assert abs(total - 1.0) < 0.01 + + +async def test_numeric_forecast_builds_distribution() -> None: + forecast = NumericPopulationForecast( + population_summary="Public expects a mid value.", + sources=_sample_sources(), + percentiles=[ + NumericPercentile(percentile=0.1, value=10), + NumericPercentile(percentile=0.2, value=20), + NumericPercentile(percentile=0.4, value=40), + NumericPercentile(percentile=0.6, value=60), + NumericPercentile(percentile=0.8, value=80), + NumericPercentile(percentile=0.9, value=90), + ], + aggregate_rationale="Spread across plausible values.", + ) + bot = _make_bot_returning(forecast) + question = NumericQuestion( + question_text="How many?", + upper_bound=100, + lower_bound=0, + open_upper_bound=True, + open_lower_bound=True, + ) + + prediction = await bot._run_forecast_on_numeric(question, "") + + assert isinstance(prediction.prediction_value, NumericDistribution) + assert len(prediction.prediction_value.declared_percentiles) == 6 + + +def test_comment_includes_population_framing_and_sources() -> None: + from forecasting_tools.data_models.forecast_report import ( + ReasonedPrediction, + ResearchWithPredictions, + ) + + bot = PublicSentimentBaselineBot() + question = BinaryQuestion(question_text="Will it happen?") + collection = ResearchWithPredictions( + research_report="", + summary_report="", + errors=[], + predictions=[ + ReasonedPrediction( + prediction_value=0.6, + reasoning="| 1 | YouGov poll | US adults | 60% | medium | note |", + ) + ], + ) + + comment = bot._create_comment(question, [collection], 0.6, 0.0, 0.0) + + assert "BASELINE FORECAST" in comment + assert "general public" in comment + assert "YouGov poll" in comment diff --git a/forecasting_tools/forecast_bots/bot_lists.py b/forecasting_tools/forecast_bots/bot_lists.py index a598f2ed..226e8518 100644 --- a/forecasting_tools/forecast_bots/bot_lists.py +++ b/forecasting_tools/forecast_bots/bot_lists.py @@ -41,6 +41,24 @@ from forecasting_tools.forecast_bots.official_bots.uniform_probability_bot import ( UniformProbabilityBot, ) +from forecasting_tools.forecast_bots.public_baselines.center_leaning_bot import ( + CenterLeaningBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.credible_news_bot import ( + CredibleNewsBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.expert_opinion_bot import ( + ExpertOpinionBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.left_leaning_bot import ( + LeftLeaningBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.public_sentiment_bot import ( + PublicSentimentBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.right_leaning_bot import ( + RightLeaningBaselineBot, +) from forecasting_tools.forecast_bots.template_bot import TemplateBot @@ -64,6 +82,12 @@ def get_all_important_bot_classes() -> list[type[ForecastBot]]: SummerTemplateBot2026, GPT41OptimizedBot, NoResearchOneShotBot, + PublicSentimentBaselineBot, + ExpertOpinionBaselineBot, + CredibleNewsBaselineBot, + LeftLeaningBaselineBot, + CenterLeaningBaselineBot, + RightLeaningBaselineBot, ] diff --git a/forecasting_tools/forecast_bots/public_baselines/__init__.py b/forecasting_tools/forecast_bots/public_baselines/__init__.py new file mode 100644 index 00000000..528fd735 --- /dev/null +++ b/forecasting_tools/forecast_bots/public_baselines/__init__.py @@ -0,0 +1,44 @@ +"""Public-baseline forecasting bots. + +These bots do NOT try to give the most accurate forecast. Instead, each one +estimates the forecast that a particular *group of people* would collectively +give if a randomized, representative sample of that group were asked the +question. They are meant to be cheap, agentic stand-ins for "what the public / +experts / news / left / center / right would predict", so the Metaculus +Community Prediction can be benchmarked against these public baselines. + +All bots are built on PydanticAI agents and wrapped in the ``ForecastBot`` +interface so they run identically to the other open-source bots. +""" + +from forecasting_tools.forecast_bots.public_baselines.center_leaning_bot import ( + CenterLeaningBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.credible_news_bot import ( + CredibleNewsBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.expert_opinion_bot import ( + ExpertOpinionBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.left_leaning_bot import ( + LeftLeaningBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + PopulationBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.public_sentiment_bot import ( + PublicSentimentBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.right_leaning_bot import ( + RightLeaningBaselineBot, +) + +__all__ = [ + "PopulationBaselineBot", + "PublicSentimentBaselineBot", + "ExpertOpinionBaselineBot", + "CredibleNewsBaselineBot", + "LeftLeaningBaselineBot", + "CenterLeaningBaselineBot", + "RightLeaningBaselineBot", +] diff --git a/forecasting_tools/forecast_bots/public_baselines/center_leaning_bot.py b/forecasting_tools/forecast_bots/public_baselines/center_leaning_bot.py new file mode 100644 index 00000000..7faaebea --- /dev/null +++ b/forecasting_tools/forecast_bots/public_baselines/center_leaning_bot.py @@ -0,0 +1,39 @@ +"""Baseline bot estimating what the political center would forecast.""" + +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + PopulationBaselineBot, + PopulationSpec, +) + + +class CenterLeaningBaselineBot(PopulationBaselineBot): + """Estimates the forecast of a sample of centrist figures and outlets.""" + + population_spec = PopulationSpec( + name="the political center", + short_name="center", + target_description=( + "centrist / moderate public figures, commentators, and media outlets " + "(e.g. Reuters, AP, the news pages of major papers, moderate and " + "independent voices, centrist think tanks), excluding strongly partisan " + "left or right sources" + ), + sampling_method=( + "Imagine sampling a representative set of moderate, non-partisan-leaning " + "voices and asking each what they would predict. Centrists tend to weight " + "mainstream expert consensus, official data, and 'both sides' framings, and " + "to avoid the strong directional priors of either wing. Sample across " + "center-left-of-neutral to center-right-of-neutral moderates." + ), + source_guidance=( + "Prioritise wire services, straight news, centrist columnists, and " + "non-partisan analysts and think tanks. When using a source, note why it " + "qualifies as centrist rather than partisan." + ), + interpretation_guidance=( + "Reflect the centrist tendency to anchor on consensus and official " + "forecasts, to split the difference between partisan narratives, and to be " + "cautious about extreme outcomes. Represent this measured framing rather " + "than either wing's view." + ), + ) diff --git a/forecasting_tools/forecast_bots/public_baselines/credible_news_bot.py b/forecasting_tools/forecast_bots/public_baselines/credible_news_bot.py new file mode 100644 index 00000000..3cf8c225 --- /dev/null +++ b/forecasting_tools/forecast_bots/public_baselines/credible_news_bot.py @@ -0,0 +1,44 @@ +"""Baseline bot estimating the forecast implied by credible news outlets.""" + +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + PopulationBaselineBot, + PopulationSpec, +) + + +class CredibleNewsBaselineBot(PopulationBaselineBot): + """Estimates the forecast implied by a sample of credible news outlets.""" + + population_spec = PopulationSpec( + name="credible news outlets", + short_name="credible news", + target_description=( + "established, fact-checked news organisations with strong reputations for " + "accuracy and editorial standards (e.g. Reuters, AP, BBC, The New York " + "Times, The Wall Street Journal, The Economist, Financial Times, Bloomberg, " + "Nature/Science news), sampled across outlets rather than relying on any " + "single one" + ), + sampling_method=( + "Imagine collecting the most recent reporting on this question from a " + "basket of credible, mainstream-to-high-quality outlets and asking what " + "forecast their coverage collectively implies. News outlets rarely state " + "explicit probabilities, so infer the implied forecast from how they frame " + "the situation: which outcome is treated as the default/expected one, the " + "hedging language used ('likely', 'unlikely', 'on track', 'in doubt'), and " + "which scenarios are given the most weight. Prefer reporting and analysis " + "over opinion columns." + ), + source_guidance=( + "Prioritise straight news reporting and data journalism from reputable " + "outlets and wire services. Use the publication dates to weight toward the " + "most current framing. Avoid low-credibility or partisan tabloid sources." + ), + interpretation_guidance=( + "Map editorial framing onto a concrete forecast: e.g. coverage describing " + "an outcome as 'widely expected' implies a high probability, 'facing " + "long odds' implies a low one. Account for newsroom tendencies toward " + "drama, novelty, and balance ('both sides') that can distort the implied " + "forecast." + ), + ) diff --git a/forecasting_tools/forecast_bots/public_baselines/expert_opinion_bot.py b/forecasting_tools/forecast_bots/public_baselines/expert_opinion_bot.py new file mode 100644 index 00000000..4d4f3cc0 --- /dev/null +++ b/forecasting_tools/forecast_bots/public_baselines/expert_opinion_bot.py @@ -0,0 +1,43 @@ +"""Baseline bot estimating what topic experts would forecast.""" + +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + PopulationBaselineBot, + PopulationSpec, +) + + +class ExpertOpinionBaselineBot(PopulationBaselineBot): + """Estimates the forecast of a representative sample of relevant experts.""" + + population_spec = PopulationSpec( + name="subject-matter experts on the question's topic", + short_name="topic experts", + target_description=( + "credentialed specialists whose professional field is directly relevant to " + "the question (e.g. epidemiologists for a disease question, central-bank " + "economists for a rates question, election scientists for an election " + "question), sampled across institutions and viewpoints rather than a single " + "school of thought" + ), + sampling_method=( + "First identify which 1-3 fields of expertise are most relevant to the " + "question. Then imagine polling a representative cross-section of recognised " + "experts in those fields. Experts reason from domain models, base rates, " + "and current data, and tend to be better calibrated than the public, but " + "they also have characteristic blind spots (over-reliance on existing " + "models, herding around consensus, and slowness to update on regime " + "changes). Sample across competing expert camps where the field is divided." + ), + source_guidance=( + "Prioritise peer-reviewed research, expert surveys and elicitations, " + "official forecasts from expert bodies (IMF, IPCC, CBO, central banks, " + "WHO, etc.), analyst consensus, and named expert commentary. Down-weight " + "non-expert punditry." + ), + interpretation_guidance=( + "Translate technical findings, model outputs, and expert statements into a " + "concrete forecast for this question's resolution criteria. Where experts " + "disagree, represent the spread of expert opinion rather than collapsing to " + "a single view, and note the consensus position separately from outliers." + ), + ) diff --git a/forecasting_tools/forecast_bots/public_baselines/left_leaning_bot.py b/forecasting_tools/forecast_bots/public_baselines/left_leaning_bot.py new file mode 100644 index 00000000..d7099d7c --- /dev/null +++ b/forecasting_tools/forecast_bots/public_baselines/left_leaning_bot.py @@ -0,0 +1,40 @@ +"""Baseline bot estimating what the political left would forecast.""" + +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + PopulationBaselineBot, + PopulationSpec, +) + + +class LeftLeaningBaselineBot(PopulationBaselineBot): + """Estimates the forecast of a sample of left-leaning figures and outlets.""" + + population_spec = PopulationSpec( + name="the political left", + short_name="left", + target_description=( + "left-leaning / progressive public figures, commentators, and media " + "outlets (e.g. MSNBC, The Guardian, The Nation, Vox, Mother Jones, " + "prominent progressive politicians and writers), sampled across the " + "center-left to the further left" + ), + sampling_method=( + "Imagine sampling a representative set of left-leaning voices and asking " + "each what they would predict. Their forecasts are shaped by progressive " + "priors and the issues their side emphasises. Weight across the spectrum " + "from the establishment center-left to the activist left, and base the " + "leaning on the question's relevant country/context." + ), + source_guidance=( + "Prioritise reporting, op-eds, and commentary from left-leaning outlets and " + "figures, and left-leaning framing of polls and events. Identify the " + "source's lean explicitly when recording it." + ), + interpretation_guidance=( + "Reflect how this side's worldview shapes its predictions: motivated " + "reasoning toward outcomes it favours or fears, distinct trusted sources, " + "and characteristic framings of risk and blame. Faithfully represent the " + "left's expected forecast even where it diverges from neutral analysis, and " + "note internal disagreement between the center-left and the further left." + ), + ) diff --git a/forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py b/forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py new file mode 100644 index 00000000..b6118994 --- /dev/null +++ b/forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py @@ -0,0 +1,577 @@ +"""Base class for the public-baseline (population) forecasting bots. + +A ``PopulationBaselineBot`` estimates the forecast that a *specific group of +people* would collectively produce if a randomized, representative sample of +that group were polled on the question. It is deliberately NOT trying to be +accurate about the world; it is trying to faithfully reproduce a group's +collective belief (including that group's biases). + +Each concrete bot (public sentiment, expert opinion, credible news outlets, +left, center, right) only needs to define a ``PopulationSpec`` describing who is +being sampled and how to find evidence of their views. All of the agentic +machinery, prediction conversion, and comment formatting lives here. + +The bots are built on a PydanticAI agent that is given a single web/news search +tool and asked to return a structured ``*PopulationForecast`` object containing +the individual sources it sampled, each source's implied forecast, and an +aggregate implied forecast. That structured object is then converted into the +prediction types the ``ForecastBot`` framework expects, and rendered into the +Metaculus comment so readers can see exactly which sources produced which +implied forecasts. +""" + +import logging +import os +from datetime import datetime, timezone + +import pendulum +from pydantic import BaseModel, Field +from pydantic_ai import Agent, UsageLimits +from pydantic_ai.models.openai import OpenAIChatModel +from pydantic_ai.providers.openrouter import OpenRouterProvider + +from forecasting_tools.ai_models.general_llm import GeneralLlm +from forecasting_tools.data_models.data_organizer import DataOrganizer +from forecasting_tools.data_models.forecast_report import ( + ReasonedPrediction, + ResearchWithPredictions, +) +from forecasting_tools.data_models.multiple_choice_report import ( + PredictedOption, + PredictedOptionList, +) +from forecasting_tools.data_models.numeric_report import NumericDistribution, Percentile +from forecasting_tools.data_models.questions import ( + BinaryQuestion, + DateQuestion, + MetaculusQuestion, + MultipleChoiceQuestion, + NumericQuestion, +) +from forecasting_tools.forecast_bots.official_bots.template_bot_2026_summer import ( + SummerTemplateBot2026, +) +from forecasting_tools.helpers.asknews_searcher import AskNewsSearcher +from forecasting_tools.util.misc import clean_indents + +logger = logging.getLogger(__name__) + + +class PopulationSpec(BaseModel): + """Description of the group of people a baseline bot is sampling.""" + + name: str + short_name: str + target_description: str + sampling_method: str + source_guidance: str + interpretation_guidance: str + + +class DiscoveredSource(BaseModel): + """A single source used to approximate one slice of the sampled group.""" + + name: str = Field( + description="Short name of the source, e.g. 'YouGov poll (Jun 2026)', " + "'Dr. Jane Smith, virologist', or 'The Economist editorial'." + ) + represents: str = Field( + description="Which slice of the sampled group this source stands in for." + ) + url: str | None = Field( + default=None, description="A link to the source if one is available." + ) + implied_forecast: str = Field( + description="The forecast this source implies for THIS exact question, " + "stated concretely (a probability, an outcome, or a number/range)." + ) + confidence: str = Field( + description="How strongly the source implies this forecast: " + "'low', 'medium', or 'high'." + ) + note: str = Field( + description="One sentence on how this source's view was translated into " + "the implied forecast." + ) + + +class BinaryPopulationForecast(BaseModel): + population_summary: str = Field( + description="2-4 sentences on what the sampled group collectively believes " + "about this question and why." + ) + sources: list[DiscoveredSource] + aggregate_probability: float = Field( + ge=0, + le=1, + description="The group's aggregate implied probability that the question " + "resolves YES (between 0 and 1).", + ) + aggregate_rationale: str = Field( + description="How the individual sources were weighted and combined." + ) + + +class OptionProbability(BaseModel): + option_name: str + probability: float = Field(ge=0, le=1) + + +class MultipleChoicePopulationForecast(BaseModel): + population_summary: str + sources: list[DiscoveredSource] + option_probabilities: list[OptionProbability] = Field( + description="The group's aggregate implied probability for each option. " + "Use the exact option names provided and make the probabilities sum to ~1." + ) + aggregate_rationale: str + + +class NumericPercentile(BaseModel): + percentile: float = Field( + ge=0, + le=1, + description="A cumulative probability between 0 and 1 (e.g. 0.1 for the " + "10th percentile).", + ) + value: float = Field( + description="The value at this percentile, in the question's units." + ) + + +class NumericPopulationForecast(BaseModel): + population_summary: str + sources: list[DiscoveredSource] + percentiles: list[NumericPercentile] = Field( + description="An increasing list of percentile/value pairs describing the " + "group's aggregate distribution. Include at least the 0.1, 0.2, 0.4, 0.6, " + "0.8 and 0.9 percentiles with wide intervals." + ) + aggregate_rationale: str + + +class DatePercentilePoint(BaseModel): + percentile: float = Field(ge=0, le=1) + iso_date: str = Field( + description="The date at this percentile in ISO format (YYYY-MM-DD)." + ) + + +class DatePopulationForecast(BaseModel): + population_summary: str + sources: list[DiscoveredSource] + percentiles: list[DatePercentilePoint] = Field( + description="An increasing list of percentile/date pairs describing the " + "group's aggregate distribution. Include at least the 0.1, 0.2, 0.4, 0.6, " + "0.8 and 0.9 percentiles with wide intervals." + ) + aggregate_rationale: str + + +class PopulationBaselineBot(SummerTemplateBot2026): + """Estimates what a sampled group of people would collectively forecast. + + Subclasses provide ``population_spec``. The bot uses a PydanticAI agent with + a single search tool to gather evidence of the group's views, returns a + structured per-source breakdown, and converts the aggregate into the + prediction type the framework expects. + """ + + population_spec: PopulationSpec + _request_limit_per_question: int = 12 + + @classmethod + def _llm_config_defaults(cls) -> dict[str, str | GeneralLlm | None]: + config_dict = super()._llm_config_defaults() + config_dict["summarizer"] = None + return config_dict + + async def run_research(self, question: MetaculusQuestion) -> str: + return "" + + def _get_agent_model(self) -> OpenAIChatModel: + default_llm = self.get_llm("default") + model_name = ( + default_llm.model if isinstance(default_llm, GeneralLlm) else default_llm + ) + model_name = model_name.removeprefix("openrouter/") + provider = OpenRouterProvider(api_key=os.getenv("OPENROUTER_API_KEY")) + return OpenAIChatModel(model_name, provider=provider) + + async def _search_the_web(self, query: str) -> str: + try: + searcher = AskNewsSearcher() + return await searcher.get_formatted_news_async(query) + except Exception as news_error: + logger.warning( + f"AskNews search failed ({news_error}); falling back to researcher llm." + ) + try: + researcher = self.get_llm("researcher", "llm") + return await researcher.invoke(query) + except Exception as fallback_error: + logger.warning(f"Fallback search failed: {fallback_error}") + return f"No search results available for '{query}'." + + def _build_agent(self, output_type: type[BaseModel]) -> Agent: + async def search_the_web(query: str) -> str: + """Search recent news and the web for evidence of what the target group thinks. + + Use focused queries (polls, surveys, expert statements, op-eds, + articles, social media sentiment, etc.). Returns formatted snippets + with their sources and URLs. + """ + return await self._search_the_web(query) + + return Agent( + self._get_agent_model(), + output_type=output_type, + system_prompt=self._system_prompt(), + tools=[search_the_web], + retries=2, + ) + + def _system_prompt(self) -> str: + spec = self.population_spec + return clean_indents( + f""" + You are a careful research analyst. Your job is to estimate the forecast + that {spec.name} would collectively give for a specific question. You are + NOT estimating what is most likely to actually happen, and you are NOT + giving your own opinion. + + Concretely, approximate the result of taking a RANDOMIZED, REPRESENTATIVE + sample of {spec.target_description}, asking each member to forecast the + question, and aggregating their answers. + + How to sample this group: + {spec.sampling_method} + + Finding evidence: + Use the `search_the_web` tool (2-5 focused searches) to find concrete + evidence of what this group currently believes about the question or its + close neighbors. {spec.source_guidance} + + For every source you use, record: its name, which slice of the group it + represents, its URL (if any), and the forecast it IMPLIES for THIS exact + question. Translate vague sentiment, narratives, polling, or commentary + into a concrete implied forecast that matches the question's resolution + criteria. {spec.interpretation_guidance} + + Then aggregate across the sources you sampled, weighting each by how + representative it is of {spec.target_description}, to get the group's + aggregate implied forecast. + + Rules: + - Faithfully represent this group, including its biases and blind spots, + even if you personally believe they are wrong. + - Ground your estimate in evidence you actually find. If evidence is thin, + sample more broadly and reason explicitly about how this group tends to + think about questions like this. + - Try to include 3-8 distinct sources. + - Keep it efficient: a few targeted searches, then answer. + """ + ) + + def _question_block(self, question: MetaculusQuestion) -> str: + today = datetime.now(timezone.utc).strftime("%Y-%m-%d") + return clean_indents( + f""" + Question: {question.question_text} + + Background: + {question.background_info} + + Resolution criteria: + {question.resolution_criteria} + + {question.fine_print} + + Today's date: {today} + """ + ) + + async def _run_forecast_on_binary( + self, question: BinaryQuestion, research: str + ) -> ReasonedPrediction[float]: + agent = self._build_agent(BinaryPopulationForecast) + prompt = clean_indents( + f""" + {self._question_block(question)} + + Estimate what {self.population_spec.name} would forecast as the probability + that this question resolves YES. + """ + ) + result = await agent.run( + prompt, + usage_limits=UsageLimits(request_limit=self._request_limit_per_question), + ) + forecast: BinaryPopulationForecast = result.output + probability = max(0.01, min(0.99, forecast.aggregate_probability)) + reasoning = self._format_reasoning( + forecast.population_summary, + forecast.sources, + forecast.aggregate_rationale, + f"Aggregate implied probability of YES: {probability:.1%}", + ) + return ReasonedPrediction(prediction_value=probability, reasoning=reasoning) + + async def _run_forecast_on_multiple_choice( + self, question: MultipleChoiceQuestion, research: str + ) -> ReasonedPrediction[PredictedOptionList]: + agent = self._build_agent(MultipleChoicePopulationForecast) + options_str = ", ".join(f'"{option}"' for option in question.options) + prompt = clean_indents( + f""" + {self._question_block(question)} + + The allowed options are: [{options_str}] + + Estimate what {self.population_spec.name} would forecast as the probability + of each option. Use the exact option names above and make the + probabilities sum to approximately 1. + """ + ) + result = await agent.run( + prompt, + usage_limits=UsageLimits(request_limit=self._request_limit_per_question), + ) + forecast: MultipleChoicePopulationForecast = result.output + predicted_options = self._build_option_list( + question, forecast.option_probabilities + ) + final_line = "Aggregate implied probabilities: " + ", ".join( + f"{option.option_name}: {option.probability:.1%}" + for option in predicted_options.predicted_options + ) + reasoning = self._format_reasoning( + forecast.population_summary, + forecast.sources, + forecast.aggregate_rationale, + final_line, + ) + return ReasonedPrediction( + prediction_value=predicted_options, reasoning=reasoning + ) + + @staticmethod + def _build_option_list( + question: MultipleChoiceQuestion, + option_probabilities: list[OptionProbability], + ) -> PredictedOptionList: + lookup = { + option.option_name.strip().lower(): option.probability + for option in option_probabilities + } + raw_probabilities = [ + max(0.0, lookup.get(option.strip().lower(), 0.0)) + for option in question.options + ] + if sum(raw_probabilities) <= 0: + raw_probabilities = [1.0 for _ in question.options] + total = sum(raw_probabilities) + normalized = [probability / total for probability in raw_probabilities] + return PredictedOptionList( + predicted_options=[ + PredictedOption(option_name=option, probability=probability) + for option, probability in zip(question.options, normalized) + ] + ) + + async def _run_forecast_on_numeric( + self, question: NumericQuestion, research: str + ) -> ReasonedPrediction[NumericDistribution]: + upper_bound_message, lower_bound_message = ( + self._create_upper_and_lower_bound_messages(question) + ) + agent = self._build_agent(NumericPopulationForecast) + prompt = clean_indents( + f""" + {self._question_block(question)} + + Units for the answer: {question.unit_of_measure if question.unit_of_measure else "infer the appropriate units"} + {lower_bound_message} + {upper_bound_message} + + Estimate what {self.population_spec.name} would forecast for this number. + Provide an increasing list of percentile/value pairs (at least the 0.1, + 0.2, 0.4, 0.6, 0.8 and 0.9 percentiles) with wide intervals to reflect + the group's uncertainty. + """ + ) + result = await agent.run( + prompt, + usage_limits=UsageLimits(request_limit=self._request_limit_per_question), + ) + forecast: NumericPopulationForecast = result.output + percentiles = self._build_percentiles(forecast.percentiles) + distribution = NumericDistribution.from_question(percentiles, question) + final_line = "Aggregate implied distribution (percentile: value): " + ", ".join( + f"{int(percentile.percentile * 100)}%: {percentile.value:g}" + for percentile in percentiles + ) + reasoning = self._format_reasoning( + forecast.population_summary, + forecast.sources, + forecast.aggregate_rationale, + final_line, + ) + return ReasonedPrediction(prediction_value=distribution, reasoning=reasoning) + + async def _run_forecast_on_date( + self, question: DateQuestion, research: str + ) -> ReasonedPrediction[NumericDistribution]: + upper_bound_message, lower_bound_message = ( + self._create_upper_and_lower_bound_messages(question) + ) + agent = self._build_agent(DatePopulationForecast) + prompt = clean_indents( + f""" + {self._question_block(question)} + + {lower_bound_message} + {upper_bound_message} + + Estimate what {self.population_spec.name} would forecast for this date. + Provide an increasing list of percentile/date pairs (at least the 0.1, + 0.2, 0.4, 0.6, 0.8 and 0.9 percentiles), dates in YYYY-MM-DD format, with + wide intervals to reflect the group's uncertainty. + """ + ) + result = await agent.run( + prompt, + usage_limits=UsageLimits(request_limit=self._request_limit_per_question), + ) + forecast: DatePopulationForecast = result.output + percentiles = self._build_date_percentiles(forecast.percentiles) + distribution = NumericDistribution.from_question(percentiles, question) + final_line = "Aggregate implied dates (percentile: date): " + ", ".join( + f"{int(percentile.percentile * 100)}%: " + f"{datetime.fromtimestamp(percentile.value, tz=timezone.utc).date().isoformat()}" + for percentile in percentiles + ) + reasoning = self._format_reasoning( + forecast.population_summary, + forecast.sources, + forecast.aggregate_rationale, + final_line, + ) + return ReasonedPrediction(prediction_value=distribution, reasoning=reasoning) + + @staticmethod + def _build_percentiles(points: list[NumericPercentile]) -> list[Percentile]: + percentiles = [ + Percentile( + percentile=( + point.percentile + if point.percentile <= 1 + else point.percentile / 100 + ), + value=point.value, + ) + for point in points + ] + return sorted(percentiles, key=lambda percentile: percentile.percentile) + + @staticmethod + def _build_date_percentiles(points: list[DatePercentilePoint]) -> list[Percentile]: + percentiles = [ + Percentile( + percentile=( + point.percentile + if point.percentile <= 1 + else point.percentile / 100 + ), + value=pendulum.parse(point.iso_date).timestamp(), + ) + for point in points + ] + return sorted(percentiles, key=lambda percentile: percentile.percentile) + + def _format_reasoning( + self, + population_summary: str, + sources: list[DiscoveredSource], + aggregate_rationale: str, + final_line: str, + ) -> str: + spec = self.population_spec + table_rows = [] + for index, source in enumerate(sources, start=1): + source_cell = ( + f"[{source.name}]({source.url})" if source.url else source.name + ) + table_rows.append( + f"| {index} | {source_cell} | {source.represents} | " + f"{source.implied_forecast} | {source.confidence} | {source.note} |" + ) + table = ( + "\n".join(table_rows) + if table_rows + else "| - | (no sources found) | - | - | - | - |" + ) + return clean_indents( + f""" + ## What {spec.name} appears to forecast + + {population_summary} + + ### Sources sampled and their implied forecasts + | # | Source | Represents | Implied forecast | Confidence | Note | + |---|--------|-----------|------------------|-----------|------| + {table} + + ### Aggregation + {aggregate_rationale} + + **{final_line}** + """ + ) + + def _create_comment( + self, + question: MetaculusQuestion, + research_prediction_collections: list[ResearchWithPredictions], + aggregated_prediction, + final_cost: float, + time_spent_in_minutes: float, + ) -> str: + report_type = DataOrganizer.get_report_type_for_question_type(type(question)) + readable_prediction = report_type.make_readable_prediction( + aggregated_prediction + ) + spec = self.population_spec + breakdowns = [] + for collection in research_prediction_collections: + for prediction in collection.predictions: + breakdowns.append(prediction.reasoning) + combined_breakdowns = "\n\n".join(breakdowns) + comment = clean_indents( + f""" + # {spec.name.upper()} BASELINE FORECAST + *Question*: {question.question_text} + *Estimated forecast of the {spec.short_name}*: {readable_prediction} + *What this estimates*: the forecast a randomized, representative sample of + {spec.target_description} would give if asked this question. This is a + public-baseline proxy, not a best-guess of the true outcome. + *Bot Name*: {self.__class__.__name__} + + The breakdown below lists the sources that were sampled and the forecast + each one implies for this question, followed by how they were aggregated. + + {combined_breakdowns} + + --- + *Note*: This is an experimental, low-cost agentic baseline bot built on + PydanticAI. Cost/time metadata is not tracked for the agent's calls and is + therefore omitted. + """ + ) + max_comment_size = 150000 + if len(comment) > max_comment_size: + comment = ( + comment[:2000] + + "\n\n---\n\n The comment size exceeded max size and has been truncated" + ) + return comment diff --git a/forecasting_tools/forecast_bots/public_baselines/public_sentiment_bot.py b/forecasting_tools/forecast_bots/public_baselines/public_sentiment_bot.py new file mode 100644 index 00000000..777efc4c --- /dev/null +++ b/forecasting_tools/forecast_bots/public_baselines/public_sentiment_bot.py @@ -0,0 +1,44 @@ +"""Baseline bot estimating what the general public would forecast.""" + +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + PopulationBaselineBot, + PopulationSpec, +) + + +class PublicSentimentBaselineBot(PopulationBaselineBot): + """Estimates the forecast of a representative sample of the general public.""" + + population_spec = PopulationSpec( + name="the general public", + short_name="general public", + target_description=( + "ordinary members of the general public (not specialists), spread across " + "ages, regions, education levels, and political affiliations, weighted " + "toward the population most relevant to the question (e.g. the relevant " + "country's adults for a national question, or a global cross-section for a " + "global one)" + ), + sampling_method=( + "Imagine handing this question to a demographically representative panel " + "of everyday people and recording their gut predictions. Most members are " + "not closely following the topic, so their answers are driven by general " + "impressions, recent headlines they happened to see, vibes, hope and fear, " + "and simple heuristics rather than careful base-rate analysis. Weight by " + "how the broad population actually skews, not by how the most engaged or " + "online subgroups skew." + ), + source_guidance=( + "Prioritise opinion polls, surveys, prediction-poll/'wisdom of the crowd' " + "results, Google Trends, and broadly representative public-sentiment data. " + "Treat viral social-media reactions as evidence of mood but down-weight " + "them since they over-represent the highly engaged." + ), + interpretation_guidance=( + "Remember well-documented lay-forecasting tendencies: anchoring on the " + "current vivid narrative, optimism/pessimism and wishful thinking, poor " + "calibration (over-confidence on familiar topics, excess uncertainty on " + "unfamiliar ones), scope insensitivity, and recency bias. Reflect these in " + "the implied forecast rather than correcting them." + ), + ) diff --git a/forecasting_tools/forecast_bots/public_baselines/right_leaning_bot.py b/forecasting_tools/forecast_bots/public_baselines/right_leaning_bot.py new file mode 100644 index 00000000..f9ac54c9 --- /dev/null +++ b/forecasting_tools/forecast_bots/public_baselines/right_leaning_bot.py @@ -0,0 +1,41 @@ +"""Baseline bot estimating what the political right would forecast.""" + +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + PopulationBaselineBot, + PopulationSpec, +) + + +class RightLeaningBaselineBot(PopulationBaselineBot): + """Estimates the forecast of a sample of right-leaning figures and outlets.""" + + population_spec = PopulationSpec( + name="the political right", + short_name="right", + target_description=( + "right-leaning / conservative public figures, commentators, and media " + "outlets (e.g. Fox News, The Wall Street Journal opinion page, National " + "Review, The Telegraph, The Free Press, prominent conservative politicians " + "and writers), sampled across the center-right to the further right" + ), + sampling_method=( + "Imagine sampling a representative set of right-leaning voices and asking " + "each what they would predict. Their forecasts are shaped by conservative " + "priors and the issues their side emphasises. Weight across the spectrum " + "from the establishment center-right to the populist/further right, and " + "base the leaning on the question's relevant country/context." + ), + source_guidance=( + "Prioritise reporting, op-eds, and commentary from right-leaning outlets " + "and figures, and right-leaning framing of polls and events. Identify the " + "source's lean explicitly when recording it." + ), + interpretation_guidance=( + "Reflect how this side's worldview shapes its predictions: motivated " + "reasoning toward outcomes it favours or fears, distinct trusted sources, " + "and characteristic framings of risk and blame. Faithfully represent the " + "right's expected forecast even where it diverges from neutral analysis, " + "and note internal disagreement between the center-right and the further " + "right." + ), + ) diff --git a/poetry.lock b/poetry.lock index c0fcff5e..84170739 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.4.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.4.0 and should not be changed by hand. [[package]] name = "aiofiles" @@ -1756,6 +1756,25 @@ test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask[dataframe,test]", "moto test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "backports-zstd ; python_version < \"3.14\"", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas (<3.0.0)", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard ; python_version < \"3.14\""] tqdm = ["tqdm"] +[[package]] +name = "genai-prices" +version = "0.0.67" +description = "Calculate prices for calling LLM inference APIs." +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "genai_prices-0.0.67-py3-none-any.whl", hash = "sha256:08977f1e83b4132abcfc60dabf21ff13c2d25958afb9199e59c4407bf5c9ed3f"}, + {file = "genai_prices-0.0.67.tar.gz", hash = "sha256:54e07eb6541fda377187a471c5dba21a81b439c57f8dc44d89db3103c29ca343"}, +] + +[package.dependencies] +httpx2 = ">=2.0" +pydantic = ">=2.10" + +[package.extras] +cli = ["pydantic-settings (>=2.11)", "rich (>=14.3.2)", "rich-argparse (>=1.7.2)"] + [[package]] name = "gitdb" version = "4.0.12" @@ -1997,6 +2016,28 @@ http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] trio = ["trio (>=0.22.0,<1.0)"] +[[package]] +name = "httpcore2" +version = "2.3.0" +description = "A minimal low-level HTTP client." +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "httpcore2-2.3.0-py3-none-any.whl", hash = "sha256:477e9e334f74e5240dcac002e890580f36a57d40ff0fb14cc9655731d23b8415"}, + {file = "httpcore2-2.3.0.tar.gz", hash = "sha256:07327e251560960eea8e969d92d4c6a325feb13cca39e25340731336c3baf924"}, +] + +[package.dependencies] +h11 = ">=0.16" +truststore = ">=0.10" + +[package.extras] +asyncio = ["anyio (>=4.5.0,<5.0)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +trio = ["trio (>=0.22.0,<1.0)"] + [[package]] name = "httptools" version = "0.7.1" @@ -2087,6 +2128,31 @@ files = [ {file = "httpx_sse-0.4.3.tar.gz", hash = "sha256:9b1ed0127459a66014aec3c56bebd93da3c1bc8bb6618c8082039a44889a755d"}, ] +[[package]] +name = "httpx2" +version = "2.3.0" +description = "The next generation HTTP client." +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "httpx2-2.3.0-py3-none-any.whl", hash = "sha256:6f393663bdf6dbe7fe90118e3eb5b2bd024a675cae0390ac08cec9198812d8b7"}, + {file = "httpx2-2.3.0.tar.gz", hash = "sha256:227e7c41d95a76d4077a52640564132777215fc3394e07b66a3116c33d668fa9"}, +] + +[package.dependencies] +anyio = "*" +httpcore2 = "2.3.0" +idna = "*" +truststore = ">=0.10" + +[package.extras] +brotli = ["brotli ; platform_python_implementation == \"CPython\"", "brotlicffi ; platform_python_implementation != \"CPython\""] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<15)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +zstd = ["zstandard (>=0.18.0) ; python_version <= \"3.13\""] + [[package]] name = "huggingface-hub" version = "1.15.0" @@ -3091,6 +3157,18 @@ semantic-router = ["aurelio-sdk (==0.0.19) ; python_full_version < \"3.14.0\"", stt-nvidia-riva = ["audioread (>=3.0.1)", "numpy (>=1.26.0)", "nvidia-riva-client (>=2.15.0)", "soundfile (>=0.12.1)"] utils = ["numpydoc (==1.8.0)"] +[[package]] +name = "logfire-api" +version = "4.37.0" +description = "Shim for the Logfire SDK which does nothing unless Logfire is installed" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "logfire_api-4.37.0-py3-none-any.whl", hash = "sha256:1d756f8ba23aa56d438e0ba2c0f529a00fcac975b8785c561b058267f9465088"}, + {file = "logfire_api-4.37.0.tar.gz", hash = "sha256:0f62debd6ed593d51307277bd6d5636b57bda07935b5604b96db10fe64441af4"}, +] + [[package]] name = "lxml" version = "6.1.1" @@ -4071,6 +4149,21 @@ vercel = ["vercel (>=0.5.6,<0.6)"] viz = ["graphviz (>=0.17)"] voice = ["numpy (>=2.2.0,<3) ; python_version >= \"3.10\"", "websockets (>=15.0,<17)"] +[[package]] +name = "opentelemetry-api" +version = "1.43.0" +description = "OpenTelemetry Python API" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "opentelemetry_api-1.43.0-py3-none-any.whl", hash = "sha256:20acf45e9b21851926835292e4045d290acade1edd2ff3de86d2f069687ba1fd"}, + {file = "opentelemetry_api-1.43.0.tar.gz", hash = "sha256:107d0d03857ea8fc7c5fcbbbd83f800c281f0d560553d61c1d675fccfd1761c1"}, +] + +[package.dependencies] +typing-extensions = ">=4.5.0" + [[package]] name = "orjson" version = "3.11.9" @@ -5003,6 +5096,59 @@ typing-inspection = ">=0.4.2" email = ["email-validator (>=2.0.0)"] timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows\""] +[[package]] +name = "pydantic-ai-slim" +version = "2.0.0" +description = "Agent Framework / shim to use Pydantic with LLMs, slim package" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "pydantic_ai_slim-2.0.0-py3-none-any.whl", hash = "sha256:39979b459a7bc73ae5294c071a7e474f123858a9f7922e9cbb662018d6431198"}, + {file = "pydantic_ai_slim-2.0.0.tar.gz", hash = "sha256:056ea466d67b47a832736ac0f33172264b09da2110dbcce09d26b82772173218"}, +] + +[package.dependencies] +genai-prices = ">=0.0.62" +griffelib = ">=2.0" +httpx = ">=0.27" +openai = {version = ">=2.29.0", optional = true, markers = "extra == \"openai\""} +opentelemetry-api = ">=1.28.0" +pydantic = ">=2.12" +pydantic-graph = "2.0.0" +tiktoken = {version = ">=0.12.0", optional = true, markers = "extra == \"openai\""} +typing-inspection = ">=0.4.0" + +[package.extras] +ag-ui = ["ag-ui-protocol (>=0.1.10)", "starlette (>=0.46.2)"] +anthropic = ["anthropic (>=0.108.0)"] +bedrock = ["boto3 (>=1.42.63)"] +cli = ["argcomplete (>=3.5.0)", "prompt-toolkit (>=3)", "pyperclip (>=1.9.0)", "pyyaml (>=6.0.2)", "rich (>=13)"] +cohere = ["cohere (>=5.20.6) ; platform_system != \"Emscripten\""] +dbos = ["dbos (>=2.10.0)"] +duckduckgo = ["ddgs (>=9.0.0)"] +evals = ["pydantic-evals (==2.0.0)"] +exa = ["exa-py (>=2.0.0)"] +google = ["google-genai (>=1.70.0)"] +groq = ["groq (>=0.25.0)"] +huggingface = ["hf-xet (<1.5.0) ; platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\"", "huggingface-hub (>=1.3.4,<2.0.0)"] +logfire = ["logfire[httpx] (>=4.16.0)"] +mcp = ["fastmcp-slim[client] (>=3.3.0)"] +mistral = ["mistralai (>=2.0.0,!=2.4.6)"] +openai = ["openai (>=2.29.0)", "tiktoken (>=0.12.0)"] +openrouter = ["openai (>=2.8.0)"] +prefect = ["prefect (>=3.6.13)"] +retries = ["tenacity (>=8.2.3)"] +sentence-transformers = ["sentence-transformers (>=5.2.0) ; python_version < \"3.14\""] +spec = ["pydantic-handlebars (>=0.1.0)", "pyyaml (>=6.0.2)"] +tavily = ["tavily-python (>=0.5.0)"] +temporal = ["temporalio (>=1.24.0)"] +ui = ["starlette (>=0.46.2)"] +voyageai = ["voyageai (>=0.3.7) ; python_version < \"3.14\""] +web = ["httpx (>=0.27.0)", "starlette (>=0.46.2)", "uvicorn (>=0.38.0)"] +web-fetch = ["markdownify (>=1.2)"] +xai = ["xai-sdk (>=1.14.0)"] + [[package]] name = "pydantic-core" version = "2.41.5" @@ -5272,6 +5418,24 @@ files = [ [package.dependencies] typing-extensions = ">=4.14.1" +[[package]] +name = "pydantic-graph" +version = "2.0.0" +description = "Graph and state machine library" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "pydantic_graph-2.0.0-py3-none-any.whl", hash = "sha256:36d69fa01cd316be8584b90eef58bd21675c11c0a081b500a5c4ebe9b68310a5"}, + {file = "pydantic_graph-2.0.0.tar.gz", hash = "sha256:f0bffe84a46a5118bce0824de63d08f3f32ba4dfc1064674f449b07e15128287"}, +] + +[package.dependencies] +httpx = ">=0.27" +logfire-api = ">=3.14.1" +pydantic = ">=2.12" +typing-inspection = ">=0.4.0" + [[package]] name = "pydantic-settings" version = "2.14.1" @@ -7248,6 +7412,18 @@ files = [ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "mypy (>=1.7.0,<1.19) ; platform_python_implementation == \"PyPy\"", "pre-commit", "pytest (>=7.0,<8.2)", "pytest-mock", "pytest-mypy-testing"] +[[package]] +name = "truststore" +version = "0.10.4" +description = "Verify certificates using native system trust stores" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "truststore-0.10.4-py3-none-any.whl", hash = "sha256:adaeaecf1cbb5f4de3b1959b42d41f6fab57b2b1666adb59e89cb0b53361d981"}, + {file = "truststore-0.10.4.tar.gz", hash = "sha256:9d91bd436463ad5e4ee4aba766628dd6cd7010cf3e2461756b3303710eebc301"}, +] + [[package]] name = "typeguard" version = "4.5.2" @@ -7823,4 +7999,4 @@ source-archive = ["boto3", "firecrawl-py", "playwright", "trafilatura"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "2c075213be57a94057cbb6ba934e4b0ea8b0df91d052739d2313f6d893a50c0e" +content-hash = "f05b22045cf5780b8de13e6cca605667e5478dcbc45d8c4083e62af9faff1587" diff --git a/pyproject.toml b/pyproject.toml index d15ad580..4dcfecad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ boto3 = {version = ">=1.34,<2.0.0", optional = true} playwright = {version = ">=1.44,<2.0.0", optional = true} firecrawl-py = {version = ">=4.0,<5.0.0", optional = true} trafilatura = {version = ">=1.9,<3.0.0", optional = true} +pydantic-ai-slim = {extras = ["openai"], version = "^2.0.0"} [tool.poetry.extras] source-archive = ["boto3", "playwright", "firecrawl-py", "trafilatura"] diff --git a/run_bots.py b/run_bots.py index b33f1012..cd6735c0 100644 --- a/run_bots.py +++ b/run_bots.py @@ -32,6 +32,27 @@ from forecasting_tools.forecast_bots.official_bots.uniform_probability_bot import ( UniformProbabilityBot, ) +from forecasting_tools.forecast_bots.public_baselines.center_leaning_bot import ( + CenterLeaningBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.credible_news_bot import ( + CredibleNewsBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.expert_opinion_bot import ( + ExpertOpinionBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.left_leaning_bot import ( + LeftLeaningBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + PopulationBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.public_sentiment_bot import ( + PublicSentimentBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.right_leaning_bot import ( + RightLeaningBaselineBot, +) from forecasting_tools.forecast_bots.template_bot import TemplateBot from forecasting_tools.helpers.metaculus_api import ApiFilter from forecasting_tools.helpers.metaculus_client import MetaculusClient @@ -442,6 +463,27 @@ def create_bot( return default_bot +def create_population_baseline_bot( + bot_class: type[PopulationBaselineBot], + agent_llm: GeneralLlm, +) -> PopulationBaselineBot: + return bot_class( + research_reports_per_question=1, + predictions_per_research_report=1, + use_research_summary_to_forecast=False, + publish_reports_to_metaculus=default_for_publish_to_metaculus, + skip_previously_forecasted_questions=default_for_skipping_questions, + enable_summarize_research=False, + llms={ + "default": agent_llm, + "summarizer": None, + "researcher": "asknews/news-summaries", + "parser": structure_output_model, + }, + extra_metadata_in_explanation=True, + ) + + def make_claude_thinking_settings(thinking_tokens: int, max_tokens: int) -> dict: return { "temperature": 1, @@ -617,7 +659,57 @@ def get_default_bot_dict() -> dict[str, RunBotConfig]: # NOSONAR ), } + population_baseline_agent_llm = GeneralLlm( + model="openrouter/openai/gpt-4o-mini", + temperature=0.3, + timeout=5 * 60, + ) + roughly_population_baseline_cost = roughly_gpt_4o_mini_cost * 4 + mode_base_bot_mapping = { + ############################ Public-baseline bots (June 2026) ############################ + "METAC_PUBLIC_SENTIMENT_BASELINE": { + "estimated_cost_per_question": roughly_population_baseline_cost, + "bot": create_population_baseline_bot( + PublicSentimentBaselineBot, population_baseline_agent_llm + ), + "tournaments": TournConfig.aib_and_site, + }, + "METAC_EXPERT_OPINION_BASELINE": { + "estimated_cost_per_question": roughly_population_baseline_cost, + "bot": create_population_baseline_bot( + ExpertOpinionBaselineBot, population_baseline_agent_llm + ), + "tournaments": TournConfig.aib_and_site, + }, + "METAC_CREDIBLE_NEWS_BASELINE": { + "estimated_cost_per_question": roughly_population_baseline_cost, + "bot": create_population_baseline_bot( + CredibleNewsBaselineBot, population_baseline_agent_llm + ), + "tournaments": TournConfig.aib_and_site, + }, + "METAC_LEFT_LEANING_BASELINE": { + "estimated_cost_per_question": roughly_population_baseline_cost, + "bot": create_population_baseline_bot( + LeftLeaningBaselineBot, population_baseline_agent_llm + ), + "tournaments": TournConfig.aib_and_site, + }, + "METAC_CENTER_LEANING_BASELINE": { + "estimated_cost_per_question": roughly_population_baseline_cost, + "bot": create_population_baseline_bot( + CenterLeaningBaselineBot, population_baseline_agent_llm + ), + "tournaments": TournConfig.aib_and_site, + }, + "METAC_RIGHT_LEANING_BASELINE": { + "estimated_cost_per_question": roughly_population_baseline_cost, + "bot": create_population_baseline_bot( + RightLeaningBaselineBot, population_baseline_agent_llm + ), + "tournaments": TournConfig.aib_and_site, + }, ############################ No-research one-shot bots ############################ "METAC_GPT_5_5_NO_RESEARCH_ONE_SHOT": { "estimated_cost_per_question": roughly_gpt_5_cost, From d59258389472f77e83a6177c8373ce00e9f68c85 Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Thu, 25 Jun 2026 18:08:51 +0000 Subject: [PATCH 2/3] Refine public-baseline bots: Exa quote tool, multi-model branches, scratchpad - Add ExaQuoteSearcher tool returning article summaries + verbatim highlight quotes (with AskNews fallback) so each bot can cite the sources it sampled. - Extend ExaSearcher/ExaSource with optional summary support. - Forecast each question with 3 independent model branches (Claude Sonnet 4.5, Grok 4.20, GLM 5.1 via OpenRouter), round-robined per prediction and aggregated by the framework; add a reasoning scratchpad per branch. - Surface per-model source breakdowns and implied forecasts in the comment. - Wire branch LLMs through run_bots.py and pass EXA_API_KEY to the 6 baseline workflow jobs. Co-authored-by: Cursor --- .github/workflows/run-bot-aib-tournament.yaml | 6 + .../test_population_baseline_bot.py | 53 ++- .../research/exa_quote_searcher.py | 103 ++++++ forecasting_tools/ai_models/exa_searcher.py | 7 + .../population_baseline_bot.py | 320 ++++++++++++++---- run_bots.py | 45 ++- 6 files changed, 451 insertions(+), 83 deletions(-) create mode 100644 forecasting_tools/agents_and_tools/research/exa_quote_searcher.py diff --git a/.github/workflows/run-bot-aib-tournament.yaml b/.github/workflows/run-bot-aib-tournament.yaml index b5c40322..cade18bd 100644 --- a/.github/workflows/run-bot-aib-tournament.yaml +++ b/.github/workflows/run-bot-aib-tournament.yaml @@ -102,6 +102,7 @@ jobs: INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_EXA_API_KEY: ${{ secrets.EXA_API_KEY }} INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} @@ -116,6 +117,7 @@ jobs: INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_EXA_API_KEY: ${{ secrets.EXA_API_KEY }} INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} @@ -130,6 +132,7 @@ jobs: INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_EXA_API_KEY: ${{ secrets.EXA_API_KEY }} INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} @@ -144,6 +147,7 @@ jobs: INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_EXA_API_KEY: ${{ secrets.EXA_API_KEY }} INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} @@ -158,6 +162,7 @@ jobs: INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_EXA_API_KEY: ${{ secrets.EXA_API_KEY }} INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} @@ -172,6 +177,7 @@ jobs: INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_EXA_API_KEY: ${{ secrets.EXA_API_KEY }} INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} diff --git a/code_tests/unit_tests/test_forecast_bots/test_population_baseline_bot.py b/code_tests/unit_tests/test_forecast_bots/test_population_baseline_bot.py index 13c49372..7dbc7cbc 100644 --- a/code_tests/unit_tests/test_forecast_bots/test_population_baseline_bot.py +++ b/code_tests/unit_tests/test_forecast_bots/test_population_baseline_bot.py @@ -1,5 +1,10 @@ from types import SimpleNamespace +from forecasting_tools.agents_and_tools.research.exa_quote_searcher import ( + ExaQuoteSearcher, +) +from forecasting_tools.ai_models.exa_searcher import ExaSource +from forecasting_tools.data_models.forecast_report import ReasonedPrediction from forecasting_tools.data_models.multiple_choice_report import PredictedOptionList from forecasting_tools.data_models.numeric_report import NumericDistribution from forecasting_tools.data_models.questions import ( @@ -30,7 +35,7 @@ async def run(self, prompt: str, usage_limits: object = None) -> SimpleNamespace def _make_bot_returning(output: object) -> PublicSentimentBaselineBot: bot = PublicSentimentBaselineBot() - bot._build_agent = lambda output_type: _FakeAgent(output) # type: ignore + bot._build_agent = lambda output_type, branch_llm=None: _FakeAgent(output) # type: ignore return bot @@ -122,6 +127,52 @@ async def test_numeric_forecast_builds_distribution() -> None: assert len(prediction.prediction_value.declared_percentiles) == 6 +async def test_make_prediction_round_robins_branch_models() -> None: + bot = PublicSentimentBaselineBot() + question = BinaryQuestion(question_text="Will it happen?") + notepad = await bot._initialize_notepad(question) + bot._note_pads.append(notepad) + + used_models: list[str] = [] + + async def fake_binary( + question: BinaryQuestion, research: str, branch_llm: object = None + ) -> ReasonedPrediction[float]: + used_models.append(branch_llm.model) # type: ignore[union-attr] + return ReasonedPrediction(prediction_value=0.5, reasoning="x") + + bot._run_forecast_on_binary = fake_binary # type: ignore[assignment] + + for _ in range(len(bot.branch_llms)): + await bot._make_prediction(question, "") + + assert used_models == [branch.model for branch in bot.branch_llms] + + +def test_exa_quote_searcher_formats_summary_and_top_quotes() -> None: + searcher = ExaQuoteSearcher(num_quotes_per_source=2) + source = ExaSource( + original_query="poll", + auto_prompt_string=None, + title="Poll shows majority support", + url="https://example.com/poll", + text=None, + author="Jane Doe", + published_date=None, + score=0.9, + highlights=["a low-scoring quote", "a high-scoring quote"], + highlight_scores=[0.1, 0.9], + summary="A representative poll about the topic.", + ) + + formatted = searcher._format_sources("poll", [source]) + + assert "Poll shows majority support" in formatted + assert "https://example.com/poll" in formatted + assert "A representative poll about the topic." in formatted + assert "a high-scoring quote" in formatted + + def test_comment_includes_population_framing_and_sources() -> None: from forecasting_tools.data_models.forecast_report import ( ReasonedPrediction, diff --git a/forecasting_tools/agents_and_tools/research/exa_quote_searcher.py b/forecasting_tools/agents_and_tools/research/exa_quote_searcher.py new file mode 100644 index 00000000..6002cc70 --- /dev/null +++ b/forecasting_tools/agents_and_tools/research/exa_quote_searcher.py @@ -0,0 +1,103 @@ +"""A lightweight Exa search tool that returns directly quotable evidence. + +Unlike ``SmartSearcher`` (which synthesises an answer from search results), this +tool returns the raw building blocks an agent needs to *quote and attribute* +sources itself: for each result it returns the title, URL, author, publish date, +a short article summary, and the most relevant highlight quotes. This is meant to +be handed to an agent as a single tool so the agent can read what sources say and +record the forecast each source implies, with verbatim quotes. +""" + +import logging + +from forecasting_tools.ai_models.exa_searcher import ExaSearcher, ExaSource, SearchInput + +logger = logging.getLogger(__name__) + + +class ExaQuoteSearcher: + """Searches the web with Exa and returns quotable highlights + summaries.""" + + def __init__( + self, + num_results: int = 6, + num_quotes_per_source: int = 4, + max_summary_chars: int = 600, + ) -> None: + self.num_results = num_results + self.num_quotes_per_source = num_quotes_per_source + self.max_summary_chars = max_summary_chars + self.exa_searcher = ExaSearcher( + include_text=False, + include_highlights=True, + include_summary=True, + num_results=num_results, + ) + + async def search_for_quotes( + self, + query: str, + include_domains: list[str] | None = None, + exclude_domains: list[str] | None = None, + ) -> str: + search_input = SearchInput( + web_search_query=query, + highlight_query=query, + include_domains=include_domains or [], + exclude_domains=exclude_domains or [], + include_text=None, + start_published_date=None, + end_published_date=None, + ) + try: + sources = await self.exa_searcher.invoke(search_input) + except Exception as error: + logger.warning(f"Exa quote search failed for '{query}': {error}") + return f"No Exa search results available for '{query}' (error: {error})." + if not sources: + return f"No Exa search results found for '{query}'." + return self._format_sources(query, sources) + + def _format_sources(self, query: str, sources: list[ExaSource]) -> str: + blocks: list[str] = [f'Exa results for "{query}":'] + for index, source in enumerate(sources, start=1): + blocks.append(self._format_single_source(index, source)) + return "\n\n".join(blocks) + + def _format_single_source(self, index: int, source: ExaSource) -> str: + title = source.title or "(untitled)" + author = f" by {source.author}" if source.author else "" + header = ( + f"[{index}] {title}{author} — {source.readable_publish_date}\n" + f"URL: {source.url or 'unknown'}" + ) + summary = self._summary_text(source) + quotes = self._top_quotes(source) + quote_block = ( + "\n".join(f' - "{quote}"' for quote in quotes) + if quotes + else " - (no highlight quotes returned)" + ) + return f"{header}\nSummary: {summary}\nQuotes:\n{quote_block}" + + def _summary_text(self, source: ExaSource) -> str: + if not source.summary: + return "(no summary returned)" + summary = source.summary.strip() + if len(summary) > self.max_summary_chars: + summary = summary[: self.max_summary_chars].rstrip() + "…" + return summary + + def _top_quotes(self, source: ExaSource) -> list[str]: + scores = source.highlight_scores or [1.0] * len(source.highlights) + scored_quotes = sorted( + zip(source.highlights, scores), + key=lambda pair: pair[1], + reverse=True, + ) + top_quotes = [ + quote.strip() + for quote, _ in scored_quotes[: self.num_quotes_per_source] + if quote.strip() + ] + return top_quotes diff --git a/forecasting_tools/ai_models/exa_searcher.py b/forecasting_tools/ai_models/exa_searcher.py index d696a2d5..0a131b48 100644 --- a/forecasting_tools/ai_models/exa_searcher.py +++ b/forecasting_tools/ai_models/exa_searcher.py @@ -34,6 +34,7 @@ class ExaSource(BaseModel, Jsonable): score: float | None highlights: list[str] highlight_scores: list[float] + summary: str | None = None @property def readable_publish_date(self) -> str: @@ -119,18 +120,21 @@ class ExaSearcher(RequestLimitedModel, RetryableModel, TimeLimitedModel, IncursC COST_PER_REQUEST = 0.005 COST_PER_HIGHLIGHT = 0.001 COST_PER_TEXT = 0.001 + COST_PER_SUMMARY = 0.001 def __init__( self, *args, include_text: bool = False, include_highlights: bool = True, + include_summary: bool = False, num_results: int = 5, **kwargs, ) -> None: super().__init__(*args, **kwargs) self.include_text = include_text self.include_highlights = include_highlights + self.include_summary = include_summary self.num_highlights_per_url = 10 self.num_sentences_per_highlight = 4 self.num_results = num_results @@ -217,6 +221,7 @@ def _prepare_request_data(self, search: SearchInput) -> tuple[str, dict, dict]: if self.include_highlights else False ), + "summary": (True if self.include_summary else False), }, } @@ -283,6 +288,7 @@ def _process_response( score=result.get("score"), highlights=result.get("highlights", []), highlight_scores=result.get("highlightScores", []), + summary=result.get("summary"), ) exa_sources.append(exa_source) return exa_sources @@ -298,6 +304,7 @@ def _calculate_cost_for_request(self, results: list[ExaSource]) -> float: cost = self.COST_PER_REQUEST cost += self.COST_PER_TEXT * len(results) if self.include_text else 0 cost += self.COST_PER_HIGHLIGHT * len(results) if self.include_highlights else 0 + cost += self.COST_PER_SUMMARY * len(results) if self.include_summary else 0 return cost async def _track_cost_in_manager_using_model_response( diff --git a/forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py b/forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py index b6118994..56748641 100644 --- a/forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py +++ b/forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py @@ -11,15 +11,20 @@ being sampled and how to find evidence of their views. All of the agentic machinery, prediction conversion, and comment formatting lives here. -The bots are built on a PydanticAI agent that is given a single web/news search -tool and asked to return a structured ``*PopulationForecast`` object containing -the individual sources it sampled, each source's implied forecast, and an -aggregate implied forecast. That structured object is then converted into the -prediction types the ``ForecastBot`` framework expects, and rendered into the -Metaculus comment so readers can see exactly which sources produced which -implied forecasts. +The bots are built on PydanticAI agents that are each given a single Exa-based +quote-search tool and asked to return a structured ``*PopulationForecast`` object +containing the individual sources they sampled, each source's implied forecast, +and an aggregate implied forecast. + +Each question is forecast by several independent agent runs ("branches"), one per +underlying model, mirroring the research-only bot's multi-sample aggregation but +with a different model per branch. The framework then aggregates the branch +forecasts (e.g. median) into the final prediction. Every branch's structured +object is rendered into the Metaculus comment so readers can see exactly which +sources produced which implied forecasts, per model. """ +import asyncio import logging import os from datetime import datetime, timezone @@ -27,9 +32,12 @@ import pendulum from pydantic import BaseModel, Field from pydantic_ai import Agent, UsageLimits -from pydantic_ai.models.openai import OpenAIChatModel +from pydantic_ai.models.openrouter import OpenRouterModel from pydantic_ai.providers.openrouter import OpenRouterProvider +from forecasting_tools.agents_and_tools.research.exa_quote_searcher import ( + ExaQuoteSearcher, +) from forecasting_tools.ai_models.general_llm import GeneralLlm from forecasting_tools.data_models.data_organizer import DataOrganizer from forecasting_tools.data_models.forecast_report import ( @@ -43,6 +51,7 @@ from forecasting_tools.data_models.numeric_report import NumericDistribution, Percentile from forecasting_tools.data_models.questions import ( BinaryQuestion, + ConditionalQuestion, DateQuestion, MetaculusQuestion, MultipleChoiceQuestion, @@ -52,6 +61,7 @@ SummerTemplateBot2026, ) from forecasting_tools.helpers.asknews_searcher import AskNewsSearcher +from forecasting_tools.helpers.metaculus_client import MetaculusClient from forecasting_tools.util.misc import clean_indents logger = logging.getLogger(__name__) @@ -96,6 +106,12 @@ class DiscoveredSource(BaseModel): class BinaryPopulationForecast(BaseModel): + scratchpad: str = Field( + default="", + description="Your private working notes: lay out how you sampled the " + "group, what the evidence shows, and how you reasoned toward the " + "group's implied probability BEFORE committing to the number.", + ) population_summary: str = Field( description="2-4 sentences on what the sampled group collectively believes " "about this question and why." @@ -118,6 +134,11 @@ class OptionProbability(BaseModel): class MultipleChoicePopulationForecast(BaseModel): + scratchpad: str = Field( + default="", + description="Your private working notes reasoning toward the group's " + "implied option probabilities BEFORE committing to numbers.", + ) population_summary: str sources: list[DiscoveredSource] option_probabilities: list[OptionProbability] = Field( @@ -140,6 +161,11 @@ class NumericPercentile(BaseModel): class NumericPopulationForecast(BaseModel): + scratchpad: str = Field( + default="", + description="Your private working notes reasoning toward the group's " + "implied distribution BEFORE committing to percentile values.", + ) population_summary: str sources: list[DiscoveredSource] percentiles: list[NumericPercentile] = Field( @@ -158,6 +184,11 @@ class DatePercentilePoint(BaseModel): class DatePopulationForecast(BaseModel): + scratchpad: str = Field( + default="", + description="Your private working notes reasoning toward the group's " + "implied date distribution BEFORE committing to percentile dates.", + ) population_summary: str sources: list[DiscoveredSource] percentiles: list[DatePercentilePoint] = Field( @@ -178,7 +209,74 @@ class PopulationBaselineBot(SummerTemplateBot2026): """ population_spec: PopulationSpec - _request_limit_per_question: int = 12 + # Max model round-trips per branch (a few searches + the final answer). + _request_limit_per_question: int = 8 + # Hard ceiling on a single branch so a stalled OpenRouter/Exa/AskNews call + # cannot hang the whole forecast (GeneralLlm.timeout is not wired into the + # PydanticAI agent, so we enforce it ourselves). + _agent_run_timeout_seconds: float = 240 + # Cap simultaneous agent runs (each fans out to OpenRouter + Exa) so bursts + # of branches/questions don't trip OpenRouter "error" responses or Exa rate + # limits. Shared across all questions and branches of the bot. + _max_concurrent_agent_runs: int = 2 + _agent_run_limiter: asyncio.Semaphore = asyncio.Semaphore(2) + + def __init__( + self, + *, + branch_llms: list[GeneralLlm] | None = None, + research_reports_per_question: int = 1, + use_research_summary_to_forecast: bool = False, + publish_reports_to_metaculus: bool = False, + folder_to_save_reports_to: str | None = None, + skip_previously_forecasted_questions: bool = False, + llms: dict[str, str | GeneralLlm | None] | None = None, + enable_summarize_research: bool = False, + parameters_to_exclude_from_config_dict: list[str] | None = None, + extra_metadata_in_explanation: bool = False, + required_successful_predictions: float = 0.5, + metaculus_client: MetaculusClient | None = None, + ) -> None: + self.branch_llms = branch_llms or self._default_branch_llms() + assert len(self.branch_llms) > 0, "Need at least one branch model" + super().__init__( + research_reports_per_question=research_reports_per_question, + predictions_per_research_report=len(self.branch_llms), + use_research_summary_to_forecast=use_research_summary_to_forecast, + publish_reports_to_metaculus=publish_reports_to_metaculus, + folder_to_save_reports_to=folder_to_save_reports_to, + skip_previously_forecasted_questions=skip_previously_forecasted_questions, + llms=llms, + enable_summarize_research=enable_summarize_research, + parameters_to_exclude_from_config_dict=parameters_to_exclude_from_config_dict, + extra_metadata_in_explanation=extra_metadata_in_explanation, + required_successful_predictions=required_successful_predictions, + metaculus_client=metaculus_client, + ) + self._exa_quote_searcher = ExaQuoteSearcher() + + @staticmethod + def _default_branch_llms() -> list[GeneralLlm]: + agent_timeout = 5 * 60 + return [ + GeneralLlm( + model="openrouter/anthropic/claude-sonnet-4.5", + temperature=0.3, + timeout=agent_timeout, + ), + GeneralLlm( + # grok-4.1-fast is no longer served on OpenRouter; 4.20 is the + # current cheap/fast grok generation and the closest replacement. + model="openrouter/x-ai/grok-4.20", + temperature=0.3, + timeout=agent_timeout, + ), + GeneralLlm( + model="openrouter/z-ai/glm-5.1", + temperature=0.3, + timeout=agent_timeout, + ), + ] @classmethod def _llm_config_defaults(cls) -> dict[str, str | GeneralLlm | None]: @@ -189,48 +287,102 @@ def _llm_config_defaults(cls) -> dict[str, str | GeneralLlm | None]: async def run_research(self, question: MetaculusQuestion) -> str: return "" - def _get_agent_model(self) -> OpenAIChatModel: - default_llm = self.get_llm("default") - model_name = ( - default_llm.model if isinstance(default_llm, GeneralLlm) else default_llm - ) - model_name = model_name.removeprefix("openrouter/") + @staticmethod + def _branch_label(branch_llm: GeneralLlm) -> str: + return branch_llm.model.removeprefix("openrouter/") + + def _get_agent_model(self, branch_llm: GeneralLlm) -> OpenRouterModel: + model_name = branch_llm.model.removeprefix("openrouter/") provider = OpenRouterProvider(api_key=os.getenv("OPENROUTER_API_KEY")) - return OpenAIChatModel(model_name, provider=provider) + return OpenRouterModel(model_name, provider=provider) + + async def _run_agent(self, agent: Agent, prompt: str) -> object: + async with self._agent_run_limiter: + return await asyncio.wait_for( + agent.run( + prompt, + usage_limits=UsageLimits( + request_limit=self._request_limit_per_question + ), + ), + timeout=self._agent_run_timeout_seconds, + ) - async def _search_the_web(self, query: str) -> str: + async def _search_for_quotes( + self, + query: str, + include_domains: list[str] | None = None, + exclude_domains: list[str] | None = None, + ) -> str: try: - searcher = AskNewsSearcher() - return await searcher.get_formatted_news_async(query) - except Exception as news_error: + result = await self._exa_quote_searcher.search_for_quotes( + query, include_domains, exclude_domains + ) + if result and "No Exa search results" not in result: + return result + logger.info(f"Exa returned nothing for '{query}'; trying AskNews.") + except Exception as exa_error: logger.warning( - f"AskNews search failed ({news_error}); falling back to researcher llm." + f"Exa quote search failed ({exa_error}); falling back to AskNews." ) - try: - researcher = self.get_llm("researcher", "llm") - return await researcher.invoke(query) - except Exception as fallback_error: - logger.warning(f"Fallback search failed: {fallback_error}") - return f"No search results available for '{query}'." - - def _build_agent(self, output_type: type[BaseModel]) -> Agent: - async def search_the_web(query: str) -> str: - """Search recent news and the web for evidence of what the target group thinks. + try: + return await AskNewsSearcher().get_formatted_news_async(query) + except Exception as fallback_error: + logger.warning(f"AskNews fallback failed: {fallback_error}") + return f"No search results available for '{query}'." + + def _build_agent( + self, output_type: type[BaseModel], branch_llm: GeneralLlm + ) -> Agent: + async def search_for_quotes( + query: str, + include_domains: list[str] | None = None, + exclude_domains: list[str] | None = None, + ) -> str: + """Search the web for evidence of what the target group thinks, returning quotable highlights. Use focused queries (polls, surveys, expert statements, op-eds, - articles, social media sentiment, etc.). Returns formatted snippets - with their sources and URLs. + articles, social-media sentiment, etc.). For each result you get the + title, URL, date, an article summary, and the most relevant highlight + quotes you can cite verbatim. Optionally pass include_domains / + exclude_domains (e.g. specific outlets) to target a slice of the group. """ - return await self._search_the_web(query) + return await self._search_for_quotes( + query, include_domains, exclude_domains + ) return Agent( - self._get_agent_model(), + self._get_agent_model(branch_llm), output_type=output_type, system_prompt=self._system_prompt(), - tools=[search_the_web], + tools=[search_for_quotes], retries=2, ) + async def _make_prediction( + self, question: MetaculusQuestion, research: str + ) -> ReasonedPrediction[object]: + notepad = await self._get_notepad(question) + async with self._note_pad_lock: + branch_index = notepad.note_entries.get("branch_counter", 0) + notepad.note_entries["branch_counter"] = branch_index + 1 + notepad.total_predictions_attempted += 1 + branch_llm = self.branch_llms[branch_index % len(self.branch_llms)] + + if isinstance(question, BinaryQuestion): + return await self._run_forecast_on_binary(question, research, branch_llm) + if isinstance(question, MultipleChoiceQuestion): + return await self._run_forecast_on_multiple_choice( + question, research, branch_llm + ) + if isinstance(question, NumericQuestion): + return await self._run_forecast_on_numeric(question, research, branch_llm) + if isinstance(question, DateQuestion): + return await self._run_forecast_on_date(question, research, branch_llm) + if isinstance(question, ConditionalQuestion): + return await self._run_forecast_on_conditional(question, research) + raise ValueError(f"Unknown question type: {type(question)}") + def _system_prompt(self) -> str: spec = self.population_spec return clean_indents( @@ -248,7 +400,7 @@ def _system_prompt(self) -> str: {spec.sampling_method} Finding evidence: - Use the `search_the_web` tool (2-5 focused searches) to find concrete + Use the `search_for_quotes` tool (2-5 focused searches) to find concrete evidence of what this group currently believes about the question or its close neighbors. {spec.source_guidance} @@ -292,9 +444,13 @@ def _question_block(self, question: MetaculusQuestion) -> str: ) async def _run_forecast_on_binary( - self, question: BinaryQuestion, research: str + self, + question: BinaryQuestion, + research: str, + branch_llm: GeneralLlm | None = None, ) -> ReasonedPrediction[float]: - agent = self._build_agent(BinaryPopulationForecast) + branch_llm = branch_llm or self.branch_llms[0] + agent = self._build_agent(BinaryPopulationForecast, branch_llm) prompt = clean_indents( f""" {self._question_block(question)} @@ -303,13 +459,12 @@ async def _run_forecast_on_binary( that this question resolves YES. """ ) - result = await agent.run( - prompt, - usage_limits=UsageLimits(request_limit=self._request_limit_per_question), - ) + result = await self._run_agent(agent, prompt) forecast: BinaryPopulationForecast = result.output probability = max(0.01, min(0.99, forecast.aggregate_probability)) reasoning = self._format_reasoning( + branch_llm, + forecast.scratchpad, forecast.population_summary, forecast.sources, forecast.aggregate_rationale, @@ -318,9 +473,13 @@ async def _run_forecast_on_binary( return ReasonedPrediction(prediction_value=probability, reasoning=reasoning) async def _run_forecast_on_multiple_choice( - self, question: MultipleChoiceQuestion, research: str + self, + question: MultipleChoiceQuestion, + research: str, + branch_llm: GeneralLlm | None = None, ) -> ReasonedPrediction[PredictedOptionList]: - agent = self._build_agent(MultipleChoicePopulationForecast) + branch_llm = branch_llm or self.branch_llms[0] + agent = self._build_agent(MultipleChoicePopulationForecast, branch_llm) options_str = ", ".join(f'"{option}"' for option in question.options) prompt = clean_indents( f""" @@ -333,10 +492,7 @@ async def _run_forecast_on_multiple_choice( probabilities sum to approximately 1. """ ) - result = await agent.run( - prompt, - usage_limits=UsageLimits(request_limit=self._request_limit_per_question), - ) + result = await self._run_agent(agent, prompt) forecast: MultipleChoicePopulationForecast = result.output predicted_options = self._build_option_list( question, forecast.option_probabilities @@ -346,6 +502,8 @@ async def _run_forecast_on_multiple_choice( for option in predicted_options.predicted_options ) reasoning = self._format_reasoning( + branch_llm, + forecast.scratchpad, forecast.population_summary, forecast.sources, forecast.aggregate_rationale, @@ -380,12 +538,16 @@ def _build_option_list( ) async def _run_forecast_on_numeric( - self, question: NumericQuestion, research: str + self, + question: NumericQuestion, + research: str, + branch_llm: GeneralLlm | None = None, ) -> ReasonedPrediction[NumericDistribution]: + branch_llm = branch_llm or self.branch_llms[0] upper_bound_message, lower_bound_message = ( self._create_upper_and_lower_bound_messages(question) ) - agent = self._build_agent(NumericPopulationForecast) + agent = self._build_agent(NumericPopulationForecast, branch_llm) prompt = clean_indents( f""" {self._question_block(question)} @@ -400,10 +562,7 @@ async def _run_forecast_on_numeric( the group's uncertainty. """ ) - result = await agent.run( - prompt, - usage_limits=UsageLimits(request_limit=self._request_limit_per_question), - ) + result = await self._run_agent(agent, prompt) forecast: NumericPopulationForecast = result.output percentiles = self._build_percentiles(forecast.percentiles) distribution = NumericDistribution.from_question(percentiles, question) @@ -412,6 +571,8 @@ async def _run_forecast_on_numeric( for percentile in percentiles ) reasoning = self._format_reasoning( + branch_llm, + forecast.scratchpad, forecast.population_summary, forecast.sources, forecast.aggregate_rationale, @@ -420,12 +581,16 @@ async def _run_forecast_on_numeric( return ReasonedPrediction(prediction_value=distribution, reasoning=reasoning) async def _run_forecast_on_date( - self, question: DateQuestion, research: str + self, + question: DateQuestion, + research: str, + branch_llm: GeneralLlm | None = None, ) -> ReasonedPrediction[NumericDistribution]: + branch_llm = branch_llm or self.branch_llms[0] upper_bound_message, lower_bound_message = ( self._create_upper_and_lower_bound_messages(question) ) - agent = self._build_agent(DatePopulationForecast) + agent = self._build_agent(DatePopulationForecast, branch_llm) prompt = clean_indents( f""" {self._question_block(question)} @@ -439,10 +604,7 @@ async def _run_forecast_on_date( wide intervals to reflect the group's uncertainty. """ ) - result = await agent.run( - prompt, - usage_limits=UsageLimits(request_limit=self._request_limit_per_question), - ) + result = await self._run_agent(agent, prompt) forecast: DatePopulationForecast = result.output percentiles = self._build_date_percentiles(forecast.percentiles) distribution = NumericDistribution.from_question(percentiles, question) @@ -452,6 +614,8 @@ async def _run_forecast_on_date( for percentile in percentiles ) reasoning = self._format_reasoning( + branch_llm, + forecast.scratchpad, forecast.population_summary, forecast.sources, forecast.aggregate_rationale, @@ -491,12 +655,15 @@ def _build_date_percentiles(points: list[DatePercentilePoint]) -> list[Percentil def _format_reasoning( self, + branch_llm: GeneralLlm, + scratchpad: str, population_summary: str, sources: list[DiscoveredSource], aggregate_rationale: str, final_line: str, ) -> str: spec = self.population_spec + model_label = self._branch_label(branch_llm) table_rows = [] for index, source in enumerate(sources, start=1): source_cell = ( @@ -511,13 +678,24 @@ def _format_reasoning( if table_rows else "| - | (no sources found) | - | - | - | - |" ) + max_scratchpad_chars = 2000 + trimmed_scratchpad = (scratchpad or "").strip() + if len(trimmed_scratchpad) > max_scratchpad_chars: + trimmed_scratchpad = ( + trimmed_scratchpad[:max_scratchpad_chars].rstrip() + "…" + ) + scratchpad_section = ( + f"### Reasoning scratchpad\n{trimmed_scratchpad}\n\n" + if trimmed_scratchpad + else "" + ) return clean_indents( f""" - ## What {spec.name} appears to forecast + ## {model_label} — what {spec.name} appears to forecast {population_summary} - ### Sources sampled and their implied forecasts + {scratchpad_section}### Sources sampled and their implied forecasts | # | Source | Represents | Implied forecast | Confidence | Note | |---|--------|-----------|------------------|-----------|------| {table} @@ -546,7 +724,10 @@ def _create_comment( for collection in research_prediction_collections: for prediction in collection.predictions: breakdowns.append(prediction.reasoning) - combined_breakdowns = "\n\n".join(breakdowns) + combined_breakdowns = "\n\n---\n\n".join(breakdowns) + branch_models = ", ".join( + self._branch_label(branch_llm) for branch_llm in self.branch_llms + ) comment = clean_indents( f""" # {spec.name.upper()} BASELINE FORECAST @@ -556,9 +737,14 @@ def _create_comment( {spec.target_description} would give if asked this question. This is a public-baseline proxy, not a best-guess of the true outcome. *Bot Name*: {self.__class__.__name__} - - The breakdown below lists the sources that were sampled and the forecast - each one implies for this question, followed by how they were aggregated. + *Model branches (aggregated)*: {branch_models} + + This forecast is the aggregate of {len(self.branch_llms)} independent agent + runs, one per model above. Each run searched for evidence of what + {spec.short_name} believes (via Exa quote search) and recorded the sources + it sampled and the forecast each implies. The per-model breakdowns below + show those sources and implied forecasts; the headline figure is the + aggregate across all branches. {combined_breakdowns} diff --git a/run_bots.py b/run_bots.py index cd6735c0..bd2abc7c 100644 --- a/run_bots.py +++ b/run_bots.py @@ -465,19 +465,19 @@ def create_bot( def create_population_baseline_bot( bot_class: type[PopulationBaselineBot], - agent_llm: GeneralLlm, + branch_llms: list[GeneralLlm], ) -> PopulationBaselineBot: return bot_class( research_reports_per_question=1, - predictions_per_research_report=1, + branch_llms=branch_llms, use_research_summary_to_forecast=False, publish_reports_to_metaculus=default_for_publish_to_metaculus, skip_previously_forecasted_questions=default_for_skipping_questions, enable_summarize_research=False, llms={ - "default": agent_llm, + "default": branch_llms[0], "summarizer": None, - "researcher": "asknews/news-summaries", + "researcher": "no_research", "parser": structure_output_model, }, extra_metadata_in_explanation=True, @@ -659,54 +659,69 @@ def get_default_bot_dict() -> dict[str, RunBotConfig]: # NOSONAR ), } - population_baseline_agent_llm = GeneralLlm( - model="openrouter/openai/gpt-4o-mini", - temperature=0.3, - timeout=5 * 60, + population_baseline_agent_timeout = 5 * 60 + population_baseline_branch_llms = [ + GeneralLlm( + model="openrouter/anthropic/claude-sonnet-4.5", + temperature=0.3, + timeout=population_baseline_agent_timeout, + ), + GeneralLlm( + model="openrouter/x-ai/grok-4.20", + temperature=0.3, + timeout=population_baseline_agent_timeout, + ), + GeneralLlm( + model="openrouter/z-ai/glm-5.1", + temperature=0.3, + timeout=population_baseline_agent_timeout, + ), + ] + roughly_population_baseline_cost = ( + roughly_sonnet_4_cost + 2 * roughly_deepseek_r1_cost ) - roughly_population_baseline_cost = roughly_gpt_4o_mini_cost * 4 mode_base_bot_mapping = { ############################ Public-baseline bots (June 2026) ############################ "METAC_PUBLIC_SENTIMENT_BASELINE": { "estimated_cost_per_question": roughly_population_baseline_cost, "bot": create_population_baseline_bot( - PublicSentimentBaselineBot, population_baseline_agent_llm + PublicSentimentBaselineBot, population_baseline_branch_llms ), "tournaments": TournConfig.aib_and_site, }, "METAC_EXPERT_OPINION_BASELINE": { "estimated_cost_per_question": roughly_population_baseline_cost, "bot": create_population_baseline_bot( - ExpertOpinionBaselineBot, population_baseline_agent_llm + ExpertOpinionBaselineBot, population_baseline_branch_llms ), "tournaments": TournConfig.aib_and_site, }, "METAC_CREDIBLE_NEWS_BASELINE": { "estimated_cost_per_question": roughly_population_baseline_cost, "bot": create_population_baseline_bot( - CredibleNewsBaselineBot, population_baseline_agent_llm + CredibleNewsBaselineBot, population_baseline_branch_llms ), "tournaments": TournConfig.aib_and_site, }, "METAC_LEFT_LEANING_BASELINE": { "estimated_cost_per_question": roughly_population_baseline_cost, "bot": create_population_baseline_bot( - LeftLeaningBaselineBot, population_baseline_agent_llm + LeftLeaningBaselineBot, population_baseline_branch_llms ), "tournaments": TournConfig.aib_and_site, }, "METAC_CENTER_LEANING_BASELINE": { "estimated_cost_per_question": roughly_population_baseline_cost, "bot": create_population_baseline_bot( - CenterLeaningBaselineBot, population_baseline_agent_llm + CenterLeaningBaselineBot, population_baseline_branch_llms ), "tournaments": TournConfig.aib_and_site, }, "METAC_RIGHT_LEANING_BASELINE": { "estimated_cost_per_question": roughly_population_baseline_cost, "bot": create_population_baseline_bot( - RightLeaningBaselineBot, population_baseline_agent_llm + RightLeaningBaselineBot, population_baseline_branch_llms ), "tournaments": TournConfig.aib_and_site, }, From 511c9fc6edaf2ada1199929ce4e12c69a18c0e97 Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Fri, 26 Jun 2026 03:35:17 +0000 Subject: [PATCH 3/3] Updates --- .../forecast_bots/public_baselines/population_baseline_bot.py | 4 +--- run_bots.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py b/forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py index 56748641..d437d45d 100644 --- a/forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py +++ b/forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py @@ -265,9 +265,7 @@ def _default_branch_llms() -> list[GeneralLlm]: timeout=agent_timeout, ), GeneralLlm( - # grok-4.1-fast is no longer served on OpenRouter; 4.20 is the - # current cheap/fast grok generation and the closest replacement. - model="openrouter/x-ai/grok-4.20", + model="openrouter/x-ai/grok-4.3", temperature=0.3, timeout=agent_timeout, ), diff --git a/run_bots.py b/run_bots.py index bd2abc7c..a7ad04ac 100644 --- a/run_bots.py +++ b/run_bots.py @@ -667,7 +667,7 @@ def get_default_bot_dict() -> dict[str, RunBotConfig]: # NOSONAR timeout=population_baseline_agent_timeout, ), GeneralLlm( - model="openrouter/x-ai/grok-4.20", + model="openrouter/x-ai/grok-4.3", temperature=0.3, timeout=population_baseline_agent_timeout, ),