diff --git a/.github/workflows/run-bot-aib-tournament.yaml b/.github/workflows/run-bot-aib-tournament.yaml index f688810c..cade18bd 100644 --- a/.github/workflows/run-bot-aib-tournament.yaml +++ b/.github/workflows/run-bot-aib-tournament.yaml @@ -86,6 +86,101 @@ jobs: # NOTE: don't remove any of the open source models, since these are the best option for a long term baseline (other models get deprecated) + #################################### Public-baseline bots #################################### + # These agentic bots estimate what a sampled group of people would forecast + # (public, experts, credible news, left, center, right), as public baselines + # to compare against the Community Prediction. + + bot_public_sentiment_baseline: + needs: precache_asknews + uses: ./.github/workflows/run-bot-launcher.yaml + with: + bot_name: "METAC_PUBLIC_SENTIMENT_BASELINE" + metac_name: "metac-public-sentiment-baseline" + cache_key: asknews-cache-${{ github.run_id }} + secrets: + INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} + INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} + INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_EXA_API_KEY: ${{ secrets.EXA_API_KEY }} + INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} + INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} + + bot_expert_opinion_baseline: + needs: precache_asknews + uses: ./.github/workflows/run-bot-launcher.yaml + with: + bot_name: "METAC_EXPERT_OPINION_BASELINE" + metac_name: "metac-expert-opinion-baseline" + cache_key: asknews-cache-${{ github.run_id }} + secrets: + INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} + INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} + INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_EXA_API_KEY: ${{ secrets.EXA_API_KEY }} + INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} + INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} + + bot_credible_news_baseline: + needs: 
precache_asknews + uses: ./.github/workflows/run-bot-launcher.yaml + with: + bot_name: "METAC_CREDIBLE_NEWS_BASELINE" + metac_name: "metac-credible-news-baseline" + cache_key: asknews-cache-${{ github.run_id }} + secrets: + INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} + INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} + INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_EXA_API_KEY: ${{ secrets.EXA_API_KEY }} + INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} + INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} + + bot_left_leaning_baseline: + needs: precache_asknews + uses: ./.github/workflows/run-bot-launcher.yaml + with: + bot_name: "METAC_LEFT_LEANING_BASELINE" + metac_name: "metac-left-leaning-baseline" + cache_key: asknews-cache-${{ github.run_id }} + secrets: + INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} + INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} + INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_EXA_API_KEY: ${{ secrets.EXA_API_KEY }} + INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} + INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} + + bot_center_leaning_baseline: + needs: precache_asknews + uses: ./.github/workflows/run-bot-launcher.yaml + with: + bot_name: "METAC_CENTER_LEANING_BASELINE" + metac_name: "metac-center-leaning-baseline" + cache_key: asknews-cache-${{ github.run_id }} + secrets: + INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} + INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} + INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_EXA_API_KEY: ${{ secrets.EXA_API_KEY }} + INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} + INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} + + bot_right_leaning_baseline: + needs: precache_asknews + uses: ./.github/workflows/run-bot-launcher.yaml + with: + bot_name: "METAC_RIGHT_LEANING_BASELINE" + metac_name: 
"metac-right-leaning-baseline" + cache_key: asknews-cache-${{ github.run_id }} + secrets: + INPUT_METACULUS_TOKENS: ${{ secrets.METACULUS_TOKENS }} + INPUT_METACULUS_API_BASE_URL: ${{ secrets.METACULUS_API_BASE_URL }} + INPUT_OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + INPUT_EXA_API_KEY: ${{ secrets.EXA_API_KEY }} + INPUT_ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} + INPUT_ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} + #################################### No-research one-shot bots #################################### bot_gpt_5_5_no_research_one_shot: diff --git a/code_tests/unit_tests/test_forecast_bots/test_population_baseline_bot.py b/code_tests/unit_tests/test_forecast_bots/test_population_baseline_bot.py new file mode 100644 index 00000000..7dbc7cbc --- /dev/null +++ b/code_tests/unit_tests/test_forecast_bots/test_population_baseline_bot.py @@ -0,0 +1,200 @@ +from types import SimpleNamespace + +from forecasting_tools.agents_and_tools.research.exa_quote_searcher import ( + ExaQuoteSearcher, +) +from forecasting_tools.ai_models.exa_searcher import ExaSource +from forecasting_tools.data_models.forecast_report import ReasonedPrediction +from forecasting_tools.data_models.multiple_choice_report import PredictedOptionList +from forecasting_tools.data_models.numeric_report import NumericDistribution +from forecasting_tools.data_models.questions import ( + BinaryQuestion, + MultipleChoiceQuestion, + NumericQuestion, +) +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + BinaryPopulationForecast, + DiscoveredSource, + MultipleChoicePopulationForecast, + NumericPercentile, + NumericPopulationForecast, + OptionProbability, +) +from forecasting_tools.forecast_bots.public_baselines.public_sentiment_bot import ( + PublicSentimentBaselineBot, +) + + +class _FakeAgent: + def __init__(self, output: object) -> None: + self._output = output + + async def run(self, prompt: str, usage_limits: object = None) -> 
SimpleNamespace: + return SimpleNamespace(output=self._output) + + +def _make_bot_returning(output: object) -> PublicSentimentBaselineBot: + bot = PublicSentimentBaselineBot() + bot._build_agent = lambda output_type, branch_llm=None: _FakeAgent(output) # type: ignore + return bot + + +def _sample_sources() -> list[DiscoveredSource]: + return [ + DiscoveredSource( + name="YouGov poll (Jun 2026)", + represents="US adults", + url="https://example.com/poll", + implied_forecast="~60% expect yes", + confidence="medium", + note="Topline support translated to probability.", + ) + ] + + +async def test_binary_forecast_converts_and_clamps() -> None: + forecast = BinaryPopulationForecast( + population_summary="The public leans yes.", + sources=_sample_sources(), + aggregate_probability=0.995, + aggregate_rationale="Weighted toward the poll.", + ) + bot = _make_bot_returning(forecast) + question = BinaryQuestion(question_text="Will it happen?") + + prediction = await bot._run_forecast_on_binary(question, "") + + assert prediction.prediction_value == 0.99 + assert "YouGov poll" in prediction.reasoning + assert "Sources sampled" in prediction.reasoning + + +async def test_multiple_choice_maps_and_normalizes_options() -> None: + forecast = MultipleChoicePopulationForecast( + population_summary="Split opinion.", + sources=_sample_sources(), + option_probabilities=[ + OptionProbability(option_name="Option A", probability=0.6), + OptionProbability(option_name="Option B", probability=0.6), + ], + aggregate_rationale="Most lean A or B.", + ) + bot = _make_bot_returning(forecast) + question = MultipleChoiceQuestion( + question_text="Which option?", + options=["Option A", "Option B", "Option C"], + ) + + prediction = await bot._run_forecast_on_multiple_choice(question, "") + + assert isinstance(prediction.prediction_value, PredictedOptionList) + option_names = [ + option.option_name for option in prediction.prediction_value.predicted_options + ] + assert option_names == ["Option A", 
"Option B", "Option C"] + total = sum( + option.probability for option in prediction.prediction_value.predicted_options + ) + assert abs(total - 1.0) < 0.01 + + +async def test_numeric_forecast_builds_distribution() -> None: + forecast = NumericPopulationForecast( + population_summary="Public expects a mid value.", + sources=_sample_sources(), + percentiles=[ + NumericPercentile(percentile=0.1, value=10), + NumericPercentile(percentile=0.2, value=20), + NumericPercentile(percentile=0.4, value=40), + NumericPercentile(percentile=0.6, value=60), + NumericPercentile(percentile=0.8, value=80), + NumericPercentile(percentile=0.9, value=90), + ], + aggregate_rationale="Spread across plausible values.", + ) + bot = _make_bot_returning(forecast) + question = NumericQuestion( + question_text="How many?", + upper_bound=100, + lower_bound=0, + open_upper_bound=True, + open_lower_bound=True, + ) + + prediction = await bot._run_forecast_on_numeric(question, "") + + assert isinstance(prediction.prediction_value, NumericDistribution) + assert len(prediction.prediction_value.declared_percentiles) == 6 + + +async def test_make_prediction_round_robins_branch_models() -> None: + bot = PublicSentimentBaselineBot() + question = BinaryQuestion(question_text="Will it happen?") + notepad = await bot._initialize_notepad(question) + bot._note_pads.append(notepad) + + used_models: list[str] = [] + + async def fake_binary( + question: BinaryQuestion, research: str, branch_llm: object = None + ) -> ReasonedPrediction[float]: + used_models.append(branch_llm.model) # type: ignore[union-attr] + return ReasonedPrediction(prediction_value=0.5, reasoning="x") + + bot._run_forecast_on_binary = fake_binary # type: ignore[assignment] + + for _ in range(len(bot.branch_llms)): + await bot._make_prediction(question, "") + + assert used_models == [branch.model for branch in bot.branch_llms] + + +def test_exa_quote_searcher_formats_summary_and_top_quotes() -> None: + searcher = 
ExaQuoteSearcher(num_quotes_per_source=2) + source = ExaSource( + original_query="poll", + auto_prompt_string=None, + title="Poll shows majority support", + url="https://example.com/poll", + text=None, + author="Jane Doe", + published_date=None, + score=0.9, + highlights=["a low-scoring quote", "a high-scoring quote"], + highlight_scores=[0.1, 0.9], + summary="A representative poll about the topic.", + ) + + formatted = searcher._format_sources("poll", [source]) + + assert "Poll shows majority support" in formatted + assert "https://example.com/poll" in formatted + assert "A representative poll about the topic." in formatted + assert "a high-scoring quote" in formatted + + +def test_comment_includes_population_framing_and_sources() -> None: + from forecasting_tools.data_models.forecast_report import ( + ReasonedPrediction, + ResearchWithPredictions, + ) + + bot = PublicSentimentBaselineBot() + question = BinaryQuestion(question_text="Will it happen?") + collection = ResearchWithPredictions( + research_report="", + summary_report="", + errors=[], + predictions=[ + ReasonedPrediction( + prediction_value=0.6, + reasoning="| 1 | YouGov poll | US adults | 60% | medium | note |", + ) + ], + ) + + comment = bot._create_comment(question, [collection], 0.6, 0.0, 0.0) + + assert "BASELINE FORECAST" in comment + assert "general public" in comment + assert "YouGov poll" in comment diff --git a/forecasting_tools/agents_and_tools/research/exa_quote_searcher.py b/forecasting_tools/agents_and_tools/research/exa_quote_searcher.py new file mode 100644 index 00000000..6002cc70 --- /dev/null +++ b/forecasting_tools/agents_and_tools/research/exa_quote_searcher.py @@ -0,0 +1,103 @@ +"""A lightweight Exa search tool that returns directly quotable evidence. 
+ +Unlike ``SmartSearcher`` (which synthesises an answer from search results), this +tool returns the raw building blocks an agent needs to *quote and attribute* +sources itself: for each result it returns the title, URL, author, publish date, +a short article summary, and the most relevant highlight quotes. This is meant to +be handed to an agent as a single tool so the agent can read what sources say and +record the forecast each source implies, with verbatim quotes. +""" + +import logging + +from forecasting_tools.ai_models.exa_searcher import ExaSearcher, ExaSource, SearchInput + +logger = logging.getLogger(__name__) + + +class ExaQuoteSearcher: + """Searches the web with Exa and returns quotable highlights + summaries.""" + + def __init__( + self, + num_results: int = 6, + num_quotes_per_source: int = 4, + max_summary_chars: int = 600, + ) -> None: + self.num_results = num_results + self.num_quotes_per_source = num_quotes_per_source + self.max_summary_chars = max_summary_chars + self.exa_searcher = ExaSearcher( + include_text=False, + include_highlights=True, + include_summary=True, + num_results=num_results, + ) + + async def search_for_quotes( + self, + query: str, + include_domains: list[str] | None = None, + exclude_domains: list[str] | None = None, + ) -> str: + search_input = SearchInput( + web_search_query=query, + highlight_query=query, + include_domains=include_domains or [], + exclude_domains=exclude_domains or [], + include_text=None, + start_published_date=None, + end_published_date=None, + ) + try: + sources = await self.exa_searcher.invoke(search_input) + except Exception as error: + logger.warning(f"Exa quote search failed for '{query}': {error}") + return f"No Exa search results available for '{query}' (error: {error})." + if not sources: + return f"No Exa search results found for '{query}'." 
+ return self._format_sources(query, sources) + + def _format_sources(self, query: str, sources: list[ExaSource]) -> str: + blocks: list[str] = [f'Exa results for "{query}":'] + for index, source in enumerate(sources, start=1): + blocks.append(self._format_single_source(index, source)) + return "\n\n".join(blocks) + + def _format_single_source(self, index: int, source: ExaSource) -> str: + title = source.title or "(untitled)" + author = f" by {source.author}" if source.author else "" + header = ( + f"[{index}] {title}{author} — {source.readable_publish_date}\n" + f"URL: {source.url or 'unknown'}" + ) + summary = self._summary_text(source) + quotes = self._top_quotes(source) + quote_block = ( + "\n".join(f' - "{quote}"' for quote in quotes) + if quotes + else " - (no highlight quotes returned)" + ) + return f"{header}\nSummary: {summary}\nQuotes:\n{quote_block}" + + def _summary_text(self, source: ExaSource) -> str: + if not source.summary: + return "(no summary returned)" + summary = source.summary.strip() + if len(summary) > self.max_summary_chars: + summary = summary[: self.max_summary_chars].rstrip() + "…" + return summary + + def _top_quotes(self, source: ExaSource) -> list[str]: + scores = source.highlight_scores or [1.0] * len(source.highlights) + scored_quotes = sorted( + zip(source.highlights, scores), + key=lambda pair: pair[1], + reverse=True, + ) + top_quotes = [ + quote.strip() + for quote, _ in scored_quotes[: self.num_quotes_per_source] + if quote.strip() + ] + return top_quotes diff --git a/forecasting_tools/ai_models/exa_searcher.py b/forecasting_tools/ai_models/exa_searcher.py index d696a2d5..0a131b48 100644 --- a/forecasting_tools/ai_models/exa_searcher.py +++ b/forecasting_tools/ai_models/exa_searcher.py @@ -34,6 +34,7 @@ class ExaSource(BaseModel, Jsonable): score: float | None highlights: list[str] highlight_scores: list[float] + summary: str | None = None @property def readable_publish_date(self) -> str: @@ -119,18 +120,21 @@ class 
ExaSearcher(RequestLimitedModel, RetryableModel, TimeLimitedModel, IncursC COST_PER_REQUEST = 0.005 COST_PER_HIGHLIGHT = 0.001 COST_PER_TEXT = 0.001 + COST_PER_SUMMARY = 0.001 def __init__( self, *args, include_text: bool = False, include_highlights: bool = True, + include_summary: bool = False, num_results: int = 5, **kwargs, ) -> None: super().__init__(*args, **kwargs) self.include_text = include_text self.include_highlights = include_highlights + self.include_summary = include_summary self.num_highlights_per_url = 10 self.num_sentences_per_highlight = 4 self.num_results = num_results @@ -217,6 +221,7 @@ def _prepare_request_data(self, search: SearchInput) -> tuple[str, dict, dict]: if self.include_highlights else False ), + "summary": (True if self.include_summary else False), }, } @@ -283,6 +288,7 @@ def _process_response( score=result.get("score"), highlights=result.get("highlights", []), highlight_scores=result.get("highlightScores", []), + summary=result.get("summary"), ) exa_sources.append(exa_source) return exa_sources @@ -298,6 +304,7 @@ def _calculate_cost_for_request(self, results: list[ExaSource]) -> float: cost = self.COST_PER_REQUEST cost += self.COST_PER_TEXT * len(results) if self.include_text else 0 cost += self.COST_PER_HIGHLIGHT * len(results) if self.include_highlights else 0 + cost += self.COST_PER_SUMMARY * len(results) if self.include_summary else 0 return cost async def _track_cost_in_manager_using_model_response( diff --git a/forecasting_tools/forecast_bots/bot_lists.py b/forecasting_tools/forecast_bots/bot_lists.py index a598f2ed..226e8518 100644 --- a/forecasting_tools/forecast_bots/bot_lists.py +++ b/forecasting_tools/forecast_bots/bot_lists.py @@ -41,6 +41,24 @@ from forecasting_tools.forecast_bots.official_bots.uniform_probability_bot import ( UniformProbabilityBot, ) +from forecasting_tools.forecast_bots.public_baselines.center_leaning_bot import ( + CenterLeaningBaselineBot, +) +from 
forecasting_tools.forecast_bots.public_baselines.credible_news_bot import ( + CredibleNewsBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.expert_opinion_bot import ( + ExpertOpinionBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.left_leaning_bot import ( + LeftLeaningBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.public_sentiment_bot import ( + PublicSentimentBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.right_leaning_bot import ( + RightLeaningBaselineBot, +) from forecasting_tools.forecast_bots.template_bot import TemplateBot @@ -64,6 +82,12 @@ def get_all_important_bot_classes() -> list[type[ForecastBot]]: SummerTemplateBot2026, GPT41OptimizedBot, NoResearchOneShotBot, + PublicSentimentBaselineBot, + ExpertOpinionBaselineBot, + CredibleNewsBaselineBot, + LeftLeaningBaselineBot, + CenterLeaningBaselineBot, + RightLeaningBaselineBot, ] diff --git a/forecasting_tools/forecast_bots/public_baselines/__init__.py b/forecasting_tools/forecast_bots/public_baselines/__init__.py new file mode 100644 index 00000000..528fd735 --- /dev/null +++ b/forecasting_tools/forecast_bots/public_baselines/__init__.py @@ -0,0 +1,44 @@ +"""Public-baseline forecasting bots. + +These bots do NOT try to give the most accurate forecast. Instead, each one +estimates the forecast that a particular *group of people* would collectively +give if a randomized, representative sample of that group were asked the +question. They are meant to be cheap, agentic stand-ins for "what the public / +experts / news / left / center / right would predict", so the Metaculus +Community Prediction can be benchmarked against these public baselines. + +All bots are built on PydanticAI agents and wrapped in the ``ForecastBot`` +interface so they run identically to the other open-source bots. 
+""" + +from forecasting_tools.forecast_bots.public_baselines.center_leaning_bot import ( + CenterLeaningBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.credible_news_bot import ( + CredibleNewsBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.expert_opinion_bot import ( + ExpertOpinionBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.left_leaning_bot import ( + LeftLeaningBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + PopulationBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.public_sentiment_bot import ( + PublicSentimentBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.right_leaning_bot import ( + RightLeaningBaselineBot, +) + +__all__ = [ + "PopulationBaselineBot", + "PublicSentimentBaselineBot", + "ExpertOpinionBaselineBot", + "CredibleNewsBaselineBot", + "LeftLeaningBaselineBot", + "CenterLeaningBaselineBot", + "RightLeaningBaselineBot", +] diff --git a/forecasting_tools/forecast_bots/public_baselines/center_leaning_bot.py b/forecasting_tools/forecast_bots/public_baselines/center_leaning_bot.py new file mode 100644 index 00000000..7faaebea --- /dev/null +++ b/forecasting_tools/forecast_bots/public_baselines/center_leaning_bot.py @@ -0,0 +1,39 @@ +"""Baseline bot estimating what the political center would forecast.""" + +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + PopulationBaselineBot, + PopulationSpec, +) + + +class CenterLeaningBaselineBot(PopulationBaselineBot): + """Estimates the forecast of a sample of centrist figures and outlets.""" + + population_spec = PopulationSpec( + name="the political center", + short_name="center", + target_description=( + "centrist / moderate public figures, commentators, and media outlets " + "(e.g. 
Reuters, AP, the news pages of major papers, moderate and " + "independent voices, centrist think tanks), excluding strongly partisan " + "left or right sources" + ), + sampling_method=( + "Imagine sampling a representative set of moderate, non-partisan-leaning " + "voices and asking each what they would predict. Centrists tend to weight " + "mainstream expert consensus, official data, and 'both sides' framings, and " + "to avoid the strong directional priors of either wing. Sample across " + "center-left-of-neutral to center-right-of-neutral moderates." + ), + source_guidance=( + "Prioritise wire services, straight news, centrist columnists, and " + "non-partisan analysts and think tanks. When using a source, note why it " + "qualifies as centrist rather than partisan." + ), + interpretation_guidance=( + "Reflect the centrist tendency to anchor on consensus and official " + "forecasts, to split the difference between partisan narratives, and to be " + "cautious about extreme outcomes. Represent this measured framing rather " + "than either wing's view." + ), + ) diff --git a/forecasting_tools/forecast_bots/public_baselines/credible_news_bot.py b/forecasting_tools/forecast_bots/public_baselines/credible_news_bot.py new file mode 100644 index 00000000..3cf8c225 --- /dev/null +++ b/forecasting_tools/forecast_bots/public_baselines/credible_news_bot.py @@ -0,0 +1,44 @@ +"""Baseline bot estimating the forecast implied by credible news outlets.""" + +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + PopulationBaselineBot, + PopulationSpec, +) + + +class CredibleNewsBaselineBot(PopulationBaselineBot): + """Estimates the forecast implied by a sample of credible news outlets.""" + + population_spec = PopulationSpec( + name="credible news outlets", + short_name="credible news", + target_description=( + "established, fact-checked news organisations with strong reputations for " + "accuracy and editorial standards (e.g. 
Reuters, AP, BBC, The New York " + "Times, The Wall Street Journal, The Economist, Financial Times, Bloomberg, " + "Nature/Science news), sampled across outlets rather than relying on any " + "single one" + ), + sampling_method=( + "Imagine collecting the most recent reporting on this question from a " + "basket of credible, mainstream-to-high-quality outlets and asking what " + "forecast their coverage collectively implies. News outlets rarely state " + "explicit probabilities, so infer the implied forecast from how they frame " + "the situation: which outcome is treated as the default/expected one, the " + "hedging language used ('likely', 'unlikely', 'on track', 'in doubt'), and " + "which scenarios are given the most weight. Prefer reporting and analysis " + "over opinion columns." + ), + source_guidance=( + "Prioritise straight news reporting and data journalism from reputable " + "outlets and wire services. Use the publication dates to weight toward the " + "most current framing. Avoid low-credibility or partisan tabloid sources." + ), + interpretation_guidance=( + "Map editorial framing onto a concrete forecast: e.g. coverage describing " + "an outcome as 'widely expected' implies a high probability, 'facing " + "long odds' implies a low one. Account for newsroom tendencies toward " + "drama, novelty, and balance ('both sides') that can distort the implied " + "forecast." 
+ ), + ) diff --git a/forecasting_tools/forecast_bots/public_baselines/expert_opinion_bot.py b/forecasting_tools/forecast_bots/public_baselines/expert_opinion_bot.py new file mode 100644 index 00000000..4d4f3cc0 --- /dev/null +++ b/forecasting_tools/forecast_bots/public_baselines/expert_opinion_bot.py @@ -0,0 +1,43 @@ +"""Baseline bot estimating what topic experts would forecast.""" + +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + PopulationBaselineBot, + PopulationSpec, +) + + +class ExpertOpinionBaselineBot(PopulationBaselineBot): + """Estimates the forecast of a representative sample of relevant experts.""" + + population_spec = PopulationSpec( + name="subject-matter experts on the question's topic", + short_name="topic experts", + target_description=( + "credentialed specialists whose professional field is directly relevant to " + "the question (e.g. epidemiologists for a disease question, central-bank " + "economists for a rates question, election scientists for an election " + "question), sampled across institutions and viewpoints rather than a single " + "school of thought" + ), + sampling_method=( + "First identify which 1-3 fields of expertise are most relevant to the " + "question. Then imagine polling a representative cross-section of recognised " + "experts in those fields. Experts reason from domain models, base rates, " + "and current data, and tend to be better calibrated than the public, but " + "they also have characteristic blind spots (over-reliance on existing " + "models, herding around consensus, and slowness to update on regime " + "changes). Sample across competing expert camps where the field is divided." + ), + source_guidance=( + "Prioritise peer-reviewed research, expert surveys and elicitations, " + "official forecasts from expert bodies (IMF, IPCC, CBO, central banks, " + "WHO, etc.), analyst consensus, and named expert commentary. Down-weight " + "non-expert punditry." 
+ ), + interpretation_guidance=( + "Translate technical findings, model outputs, and expert statements into a " + "concrete forecast for this question's resolution criteria. Where experts " + "disagree, represent the spread of expert opinion rather than collapsing to " + "a single view, and note the consensus position separately from outliers." + ), + ) diff --git a/forecasting_tools/forecast_bots/public_baselines/left_leaning_bot.py b/forecasting_tools/forecast_bots/public_baselines/left_leaning_bot.py new file mode 100644 index 00000000..d7099d7c --- /dev/null +++ b/forecasting_tools/forecast_bots/public_baselines/left_leaning_bot.py @@ -0,0 +1,40 @@ +"""Baseline bot estimating what the political left would forecast.""" + +from forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import ( + PopulationBaselineBot, + PopulationSpec, +) + + +class LeftLeaningBaselineBot(PopulationBaselineBot): + """Estimates the forecast of a sample of left-leaning figures and outlets.""" + + population_spec = PopulationSpec( + name="the political left", + short_name="left", + target_description=( + "left-leaning / progressive public figures, commentators, and media " + "outlets (e.g. MSNBC, The Guardian, The Nation, Vox, Mother Jones, " + "prominent progressive politicians and writers), sampled across the " + "center-left to the further left" + ), + sampling_method=( + "Imagine sampling a representative set of left-leaning voices and asking " + "each what they would predict. Their forecasts are shaped by progressive " + "priors and the issues their side emphasises. Weight across the spectrum " + "from the establishment center-left to the activist left, and base the " + "leaning on the question's relevant country/context." + ), + source_guidance=( + "Prioritise reporting, op-eds, and commentary from left-leaning outlets and " + "figures, and left-leaning framing of polls and events. Identify the " + "source's lean explicitly when recording it." 
+ ), + interpretation_guidance=( + "Reflect how this side's worldview shapes its predictions: motivated " + "reasoning toward outcomes it favours or fears, distinct trusted sources, " + "and characteristic framings of risk and blame. Faithfully represent the " + "left's expected forecast even where it diverges from neutral analysis, and " + "note internal disagreement between the center-left and the further left." + ), + ) diff --git a/forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py b/forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py new file mode 100644 index 00000000..d437d45d --- /dev/null +++ b/forecasting_tools/forecast_bots/public_baselines/population_baseline_bot.py @@ -0,0 +1,761 @@ +"""Base class for the public-baseline (population) forecasting bots. + +A ``PopulationBaselineBot`` estimates the forecast that a *specific group of +people* would collectively produce if a randomized, representative sample of +that group were polled on the question. It is deliberately NOT trying to be +accurate about the world; it is trying to faithfully reproduce a group's +collective belief (including that group's biases). + +Each concrete bot (public sentiment, expert opinion, credible news outlets, +left, center, right) only needs to define a ``PopulationSpec`` describing who is +being sampled and how to find evidence of their views. All of the agentic +machinery, prediction conversion, and comment formatting lives here. + +The bots are built on PydanticAI agents that are each given a single Exa-based +quote-search tool and asked to return a structured ``*PopulationForecast`` object +containing the individual sources they sampled, each source's implied forecast, +and an aggregate implied forecast. + +Each question is forecast by several independent agent runs ("branches"), one per +underlying model, mirroring the research-only bot's multi-sample aggregation but +with a different model per branch. 
# One row of evidence gathered by a branch agent. Instances are produced by the
# LLM as part of a ``*PopulationForecast`` output object and are rendered, one
# per table row, by ``PopulationBaselineBot._format_reasoning``.
#
# NOTE: the ``Field(description=...)`` strings and the class docstring are part
# of the schema handed to the model, so they are effectively prompt text;
# reword them only deliberately.
class DiscoveredSource(BaseModel):
    """A single source used to approximate one slice of the sampled group."""

    # Human-readable label; used as the link text when ``url`` is set.
    name: str = Field(
        description="Short name of the source, e.g. 'YouGov poll (Jun 2026)', "
        "'Dr. Jane Smith, virologist', or 'The Economist editorial'."
    )
    # Which part of the target population this source is a proxy for.
    represents: str = Field(
        description="Which slice of the sampled group this source stands in for."
    )
    # Optional; when missing the table shows the bare name instead of a link.
    url: str | None = Field(
        default=None, description="A link to the source if one is available."
    )
    # Free text rather than a number, because the same model serves binary,
    # multiple-choice, numeric and date questions.
    implied_forecast: str = Field(
        description="The forecast this source implies for THIS exact question, "
        "stated concretely (a probability, an outcome, or a number/range)."
    )
    # Plain ``str``: the three levels named in the description are requested
    # from the model but are not enforced by validation.
    confidence: str = Field(
        description="How strongly the source implies this forecast: "
        "'low', 'medium', or 'high'."
    )
    # Short justification shown in the last column of the sources table.
    note: str = Field(
        description="One sentence on how this source's view was translated into "
        "the implied forecast."
    )
# One (cumulative probability, value) point of a numeric forecast returned by
# the agent. ``PopulationBaselineBot._build_percentiles`` converts a list of
# these into framework ``Percentile`` objects sorted by ``percentile``.
#
# No class docstring is added on purpose: this model is part of the structured
# output schema given to the LLM, and a docstring would presumably surface as
# the schema description (confirm against pydantic before adding one).
class NumericPercentile(BaseModel):
    # Validated to lie in [0, 1]. Because of ``le=1`` a value such as 10 (meant
    # as "10th percentile") is rejected at validation time, so the
    # ``percentile / 100`` rescue branch in ``_build_percentiles`` cannot be
    # reached for instances created through normal validation.
    percentile: float = Field(
        ge=0,
        le=1,
        description="A cumulative probability between 0 and 1 (e.g. 0.1 for the "
        "10th percentile).",
    )
    # Unbounded float; question bounds are only communicated through the prompt.
    value: float = Field(
        description="The value at this percentile, in the question's units."
    )
# Structured agent output for date questions; used as the ``output_type`` in
# ``PopulationBaselineBot._run_forecast_on_date``.
#
# The ``Field`` descriptions are instructions to the model (schema text), so
# they are behaviour, not documentation. No class docstring is added because it
# would presumably become part of that schema as well (confirm before adding).
class DatePopulationForecast(BaseModel):
    # Defaults to "" so a model that skips the scratchpad still validates; the
    # text is trimmed and shown in the published comment by ``_format_reasoning``.
    scratchpad: str = Field(
        default="",
        description="Your private working notes reasoning toward the group's "
        "implied date distribution BEFORE committing to percentile dates.",
    )
    # Required narrative summary of the group's collective belief.
    population_summary: str
    # Evidence rows; may be empty, in which case a placeholder row is rendered.
    sources: list[DiscoveredSource]
    # "Increasing" is only requested, not validated: ``_build_date_percentiles``
    # parses each ``iso_date`` with pendulum and sorts the points by percentile.
    percentiles: list[DatePercentilePoint] = Field(
        description="An increasing list of percentile/date pairs describing the "
        "group's aggregate distribution. Include at least the 0.1, 0.2, 0.4, 0.6, "
        "0.8 and 0.9 percentiles with wide intervals."
    )
    # Required explanation of how the sources were weighted and combined.
    aggregate_rationale: str
def __init__(
    self,
    *,
    branch_llms: list[GeneralLlm] | None = None,
    research_reports_per_question: int = 1,
    use_research_summary_to_forecast: bool = False,
    publish_reports_to_metaculus: bool = False,
    folder_to_save_reports_to: str | None = None,
    skip_previously_forecasted_questions: bool = False,
    llms: dict[str, str | GeneralLlm | None] | None = None,
    enable_summarize_research: bool = False,
    parameters_to_exclude_from_config_dict: list[str] | None = None,
    extra_metadata_in_explanation: bool = False,
    required_successful_predictions: float = 0.5,
    metaculus_client: MetaculusClient | None = None,
) -> None:
    """Configure the bot with one forecasting branch per model.

    Args:
        branch_llms: Models to run, one independent agent run ("branch") per
            entry. ``None`` or an empty list selects
            ``_default_branch_llms()``.
        Remaining keyword arguments are forwarded unchanged to the parent
        bot's constructor.

    Raises:
        ValueError: If no branch model is available even after applying the
            defaults (e.g. a subclass overrides ``_default_branch_llms`` to
            return an empty list).
    """
    self.branch_llms = branch_llms or self._default_branch_llms()
    # Explicit check instead of ``assert``: asserts are removed under
    # ``python -O``, after which an empty list would configure zero
    # predictions per report and later fail with a ZeroDivisionError when
    # ``_make_prediction`` computes ``branch_index % len(self.branch_llms)``.
    if not self.branch_llms:
        raise ValueError("Need at least one branch model")
    super().__init__(
        research_reports_per_question=research_reports_per_question,
        # One prediction per branch model, so the framework's aggregation
        # step combines exactly one forecast from each model.
        predictions_per_research_report=len(self.branch_llms),
        use_research_summary_to_forecast=use_research_summary_to_forecast,
        publish_reports_to_metaculus=publish_reports_to_metaculus,
        folder_to_save_reports_to=folder_to_save_reports_to,
        skip_previously_forecasted_questions=skip_previously_forecasted_questions,
        llms=llms,
        enable_summarize_research=enable_summarize_research,
        parameters_to_exclude_from_config_dict=parameters_to_exclude_from_config_dict,
        extra_metadata_in_explanation=extra_metadata_in_explanation,
        required_successful_predictions=required_successful_predictions,
        metaculus_client=metaculus_client,
    )
    # Created after the parent initialiser, matching the original ordering.
    self._exa_quote_searcher = ExaQuoteSearcher()
async def _search_for_quotes(
    self,
    query: str,
    include_domains: list[str] | None = None,
    exclude_domains: list[str] | None = None,
) -> str:
    """Return quotable search results for ``query``, preferring Exa.

    Exa is tried first. If it raises, returns nothing, or returns its
    "No Exa search results" marker, AskNews is queried instead. If AskNews
    also raises, a plain "no results" sentence is returned so the agent's
    tool call never fails outright.
    """
    try:
        exa_text = await self._exa_quote_searcher.search_for_quotes(
            query, include_domains, exclude_domains
        )
        exa_came_back_empty = not exa_text or "No Exa search results" in exa_text
        if not exa_came_back_empty:
            return exa_text
        logger.info(f"Exa returned nothing for '{query}'; trying AskNews.")
    except Exception as exa_error:
        # Best-effort by design: any Exa failure just triggers the fallback.
        logger.warning(
            f"Exa quote search failed ({exa_error}); falling back to AskNews."
        )

    try:
        news_text = await AskNewsSearcher().get_formatted_news_async(query)
    except Exception as fallback_error:
        logger.warning(f"AskNews fallback failed: {fallback_error}")
        return f"No search results available for '{query}'."
    return news_text
async def _make_prediction(
    self, question: MetaculusQuestion, research: str
) -> ReasonedPrediction[object]:
    """Pick the next branch model and forecast ``question`` with it.

    A per-question counter stored on the notepad is advanced under the
    notepad lock, so concurrent predictions for the same question rotate
    through ``self.branch_llms`` instead of all using the same model.

    Raises:
        ValueError: If the question is of an unsupported type.
    """
    notepad = await self._get_notepad(question)
    async with self._note_pad_lock:
        claimed_index = notepad.note_entries.get("branch_counter", 0)
        notepad.note_entries["branch_counter"] = claimed_index + 1
        notepad.total_predictions_attempted += 1
    chosen_llm = self.branch_llms[claimed_index % len(self.branch_llms)]

    # Checked in this order; the first matching type wins.
    branch_aware_handlers = (
        (BinaryQuestion, self._run_forecast_on_binary),
        (MultipleChoiceQuestion, self._run_forecast_on_multiple_choice),
        (NumericQuestion, self._run_forecast_on_numeric),
        (DateQuestion, self._run_forecast_on_date),
    )
    for question_type, handler in branch_aware_handlers:
        if isinstance(question, question_type):
            return await handler(question, research, chosen_llm)

    # The conditional handler is called without the branch model.
    if isinstance(question, ConditionalQuestion):
        return await self._run_forecast_on_conditional(question, research)
    raise ValueError(f"Unknown question type: {type(question)}")
def _question_block(self, question: MetaculusQuestion) -> str:
    """Render the question details shared by every forecasting prompt.

    Args:
        question: The question being forecast. Its text, background,
            resolution criteria and fine print are interpolated.

    Returns:
        A dedented multi-line string ending with today's UTC date.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    # These fields are presumably optional on the question model (verify
    # against MetaculusQuestion). Interpolating a missing one directly would
    # put the literal word "None" into the prompt, so substitute neutral text.
    background = question.background_info or "(none provided)"
    resolution_criteria = question.resolution_criteria or "(none provided)"
    fine_print = question.fine_print or ""
    return clean_indents(
        f"""
        Question: {question.question_text}

        Background:
        {background}

        Resolution criteria:
        {resolution_criteria}

        {fine_print}

        Today's date: {today}
        """
    )
@staticmethod
def _build_option_list(
    question: MultipleChoiceQuestion,
    option_probabilities: list[OptionProbability],
) -> PredictedOptionList:
    """Map the agent's option probabilities onto the question's options.

    Names are matched case-insensitively after stripping whitespace. Options
    the agent did not mention get weight 0, and the weights are rescaled to
    sum to 1. If every weight is zero the result is a uniform distribution.
    The returned list follows ``question.options`` order and spelling.
    """
    by_normalized_name = {}
    for entry in option_probabilities:
        # Later duplicates overwrite earlier ones, as in a dict comprehension.
        by_normalized_name[entry.option_name.strip().lower()] = entry.probability

    weights = []
    for option_name in question.options:
        stated = by_normalized_name.get(option_name.strip().lower(), 0.0)
        weights.append(max(0.0, stated))

    if sum(weights) <= 0:
        # Nothing usable from the agent: fall back to equal weights.
        weights = [1.0] * len(question.options)

    denominator = sum(weights)
    predicted = []
    for option_name, weight in zip(question.options, weights):
        predicted.append(
            PredictedOption(option_name=option_name, probability=weight / denominator)
        )
    return PredictedOptionList(predicted_options=predicted)
async def _run_forecast_on_date(
    self,
    question: DateQuestion,
    research: str,
    branch_llm: GeneralLlm | None = None,
) -> ReasonedPrediction[NumericDistribution]:
    """Estimate the group's date distribution for ``question`` with one model.

    Args:
        question: The date question to forecast.
        research: Accepted for interface compatibility; not used here
            because the agent does its own searching.
        branch_llm: Model for this branch; defaults to the first branch model.

    Returns:
        The distribution built from the agent's percentile dates, plus a
        markdown breakdown of the sources it sampled.
    """
    branch_llm = branch_llm or self.branch_llms[0]
    upper_bound_message, lower_bound_message = (
        self._create_upper_and_lower_bound_messages(question)
    )
    agent = self._build_agent(DatePopulationForecast, branch_llm)
    prompt = clean_indents(
        f"""
        {self._question_block(question)}

        {lower_bound_message}
        {upper_bound_message}

        Estimate what {self.population_spec.name} would forecast for this date.
        Provide an increasing list of percentile/date pairs (at least the 0.1,
        0.2, 0.4, 0.6, 0.8 and 0.9 percentiles), dates in YYYY-MM-DD format, with
        wide intervals to reflect the group's uncertainty.
        """
    )
    result = await self._run_agent(agent, prompt)
    forecast: DatePopulationForecast = result.output
    percentiles = self._build_date_percentiles(forecast.percentiles)
    distribution = NumericDistribution.from_question(percentiles, question)
    # Format the percentile with ":g" rather than int(): int() truncates, so
    # binary float artefacts (0.29 * 100 == 28.999...) were shown as "28%" and
    # fractional percentiles such as 0.025 lost their fraction ("2%").
    final_line = "Aggregate implied dates (percentile: date): " + ", ".join(
        f"{percentile.percentile * 100:g}%: "
        f"{datetime.fromtimestamp(percentile.value, tz=timezone.utc).date().isoformat()}"
        for percentile in percentiles
    )
    reasoning = self._format_reasoning(
        branch_llm,
        forecast.scratchpad,
        forecast.population_summary,
        forecast.sources,
        forecast.aggregate_rationale,
        final_line,
    )
    return ReasonedPrediction(prediction_value=distribution, reasoning=reasoning)
def _create_comment(
    self,
    question: MetaculusQuestion,
    research_prediction_collections: list[ResearchWithPredictions],
    aggregated_prediction,
    final_cost: float,
    time_spent_in_minutes: float,
) -> str:
    """Build the Metaculus comment: headline forecast plus per-model breakdowns.

    Args:
        question: The question that was forecast.
        research_prediction_collections: All predictions made; each
            prediction's ``reasoning`` is one model's source breakdown.
        aggregated_prediction: The final aggregated prediction value.
        final_cost: Unused; cost is not tracked for the agent's calls.
        time_spent_in_minutes: Unused, for the same reason.

    Returns:
        The comment text, cut to at most 150,000 characters with a trailing
        truncation notice when it would otherwise be longer.
    """
    report_type = DataOrganizer.get_report_type_for_question_type(type(question))
    readable_prediction = report_type.make_readable_prediction(
        aggregated_prediction
    )
    spec = self.population_spec
    breakdowns = [
        prediction.reasoning
        for collection in research_prediction_collections
        for prediction in collection.predictions
    ]
    combined_breakdowns = "\n\n---\n\n".join(breakdowns)
    branch_models = ", ".join(
        self._branch_label(branch_llm) for branch_llm in self.branch_llms
    )
    comment = clean_indents(
        f"""
        # {spec.name.upper()} BASELINE FORECAST
        *Question*: {question.question_text}
        *Estimated forecast of the {spec.short_name}*: {readable_prediction}
        *What this estimates*: the forecast a randomized, representative sample of
        {spec.target_description} would give if asked this question. This is a
        public-baseline proxy, not a best-guess of the true outcome.
        *Bot Name*: {self.__class__.__name__}
        *Model branches (aggregated)*: {branch_models}

        This forecast is the aggregate of {len(self.branch_llms)} independent agent
        runs, one per model above. Each run searched for evidence of what
        {spec.short_name} believes (via Exa quote search) and recorded the sources
        it sampled and the forecast each implies. The per-model breakdowns below
        show those sources and implied forecasts; the headline figure is the
        aggregate across all branches.

        {combined_breakdowns}

        ---
        *Note*: This is an experimental, low-cost agentic baseline bot built on
        PydanticAI. Cost/time metadata is not tracked for the agent's calls and is
        therefore omitted.
        """
    )
    max_comment_size = 150000
    if len(comment) > max_comment_size:
        truncation_notice = (
            "\n\n---\n\n The comment size exceeded max size and has been truncated"
        )
        # Keep as much as fits under the limit. Previously the comment was
        # cut to its first 2000 characters, which dropped nearly all of the
        # per-model source tables even when barely over the limit.
        comment = (
            comment[: max_comment_size - len(truncation_notice)]
            + truncation_notice
        )
    return comment
the relevant " + "country's adults for a national question, or a global cross-section for a " + "global one)" + ), + sampling_method=( + "Imagine handing this question to a demographically representative panel " + "of everyday people and recording their gut predictions. Most members are " + "not closely following the topic, so their answers are driven by general " + "impressions, recent headlines they happened to see, vibes, hope and fear, " + "and simple heuristics rather than careful base-rate analysis. Weight by " + "how the broad population actually skews, not by how the most engaged or " + "online subgroups skew." + ), + source_guidance=( + "Prioritise opinion polls, surveys, prediction-poll/'wisdom of the crowd' " + "results, Google Trends, and broadly representative public-sentiment data. " + "Treat viral social-media reactions as evidence of mood but down-weight " + "them since they over-represent the highly engaged." + ), + interpretation_guidance=( + "Remember well-documented lay-forecasting tendencies: anchoring on the " + "current vivid narrative, optimism/pessimism and wishful thinking, poor " + "calibration (over-confidence on familiar topics, excess uncertainty on " + "unfamiliar ones), scope insensitivity, and recency bias. Reflect these in " + "the implied forecast rather than correcting them." 
# Concrete baseline bot: all behaviour is inherited from PopulationBaselineBot;
# this class only supplies the PopulationSpec describing who is being sampled.
# Every string below is interpolated into the agent's system prompt and/or the
# published comment, so wording changes alter the bot's behaviour.
class RightLeaningBaselineBot(PopulationBaselineBot):
    """Estimates the forecast of a sample of right-leaning figures and outlets."""

    population_spec = PopulationSpec(
        # Used in prompts ("the forecast that {name} would collectively give")
        # and upper-cased as the comment headline.
        name="the political right",
        # Compact label used in the published comment text.
        short_name="right",
        # Who counts as a member of the sampled group.
        target_description=(
            "right-leaning / conservative public figures, commentators, and media "
            "outlets (e.g. Fox News, The Wall Street Journal opinion page, National "
            "Review, The Telegraph, The Free Press, prominent conservative politicians "
            "and writers), sampled across the center-right to the further right"
        ),
        # How the agent should imagine drawing and weighting the sample.
        sampling_method=(
            "Imagine sampling a representative set of right-leaning voices and asking "
            "each what they would predict. Their forecasts are shaped by conservative "
            "priors and the issues their side emphasises. Weight across the spectrum "
            "from the establishment center-right to the populist/further right, and "
            "base the leaning on the question's relevant country/context."
        ),
        # Which kinds of evidence to search for first.
        source_guidance=(
            "Prioritise reporting, op-eds, and commentary from right-leaning outlets "
            "and figures, and right-leaning framing of polls and events. Identify the "
            "source's lean explicitly when recording it."
        ),
        # How to turn that evidence into an implied forecast: reproduce the
        # group's view, biases included, rather than correcting it.
        interpretation_guidance=(
            "Reflect how this side's worldview shapes its predictions: motivated "
            "reasoning toward outcomes it favours or fears, distinct trusted sources, "
            "and characteristic framings of risk and blame. Faithfully represent the "
            "right's expected forecast even where it diverges from neutral analysis, "
            "and note internal disagreement between the center-right and the further "
            "right."
        ),
    )
+optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "genai_prices-0.0.67-py3-none-any.whl", hash = "sha256:08977f1e83b4132abcfc60dabf21ff13c2d25958afb9199e59c4407bf5c9ed3f"}, + {file = "genai_prices-0.0.67.tar.gz", hash = "sha256:54e07eb6541fda377187a471c5dba21a81b439c57f8dc44d89db3103c29ca343"}, +] + +[package.dependencies] +httpx2 = ">=2.0" +pydantic = ">=2.10" + +[package.extras] +cli = ["pydantic-settings (>=2.11)", "rich (>=14.3.2)", "rich-argparse (>=1.7.2)"] + [[package]] name = "gitdb" version = "4.0.12" @@ -1997,6 +2016,28 @@ http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] trio = ["trio (>=0.22.0,<1.0)"] +[[package]] +name = "httpcore2" +version = "2.3.0" +description = "A minimal low-level HTTP client." +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "httpcore2-2.3.0-py3-none-any.whl", hash = "sha256:477e9e334f74e5240dcac002e890580f36a57d40ff0fb14cc9655731d23b8415"}, + {file = "httpcore2-2.3.0.tar.gz", hash = "sha256:07327e251560960eea8e969d92d4c6a325feb13cca39e25340731336c3baf924"}, +] + +[package.dependencies] +h11 = ">=0.16" +truststore = ">=0.10" + +[package.extras] +asyncio = ["anyio (>=4.5.0,<5.0)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +trio = ["trio (>=0.22.0,<1.0)"] + [[package]] name = "httptools" version = "0.7.1" @@ -2087,6 +2128,31 @@ files = [ {file = "httpx_sse-0.4.3.tar.gz", hash = "sha256:9b1ed0127459a66014aec3c56bebd93da3c1bc8bb6618c8082039a44889a755d"}, ] +[[package]] +name = "httpx2" +version = "2.3.0" +description = "The next generation HTTP client." 
+optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "httpx2-2.3.0-py3-none-any.whl", hash = "sha256:6f393663bdf6dbe7fe90118e3eb5b2bd024a675cae0390ac08cec9198812d8b7"}, + {file = "httpx2-2.3.0.tar.gz", hash = "sha256:227e7c41d95a76d4077a52640564132777215fc3394e07b66a3116c33d668fa9"}, +] + +[package.dependencies] +anyio = "*" +httpcore2 = "2.3.0" +idna = "*" +truststore = ">=0.10" + +[package.extras] +brotli = ["brotli ; platform_python_implementation == \"CPython\"", "brotlicffi ; platform_python_implementation != \"CPython\""] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<15)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +zstd = ["zstandard (>=0.18.0) ; python_version <= \"3.13\""] + [[package]] name = "huggingface-hub" version = "1.15.0" @@ -3091,6 +3157,18 @@ semantic-router = ["aurelio-sdk (==0.0.19) ; python_full_version < \"3.14.0\"", stt-nvidia-riva = ["audioread (>=3.0.1)", "numpy (>=1.26.0)", "nvidia-riva-client (>=2.15.0)", "soundfile (>=0.12.1)"] utils = ["numpydoc (==1.8.0)"] +[[package]] +name = "logfire-api" +version = "4.37.0" +description = "Shim for the Logfire SDK which does nothing unless Logfire is installed" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "logfire_api-4.37.0-py3-none-any.whl", hash = "sha256:1d756f8ba23aa56d438e0ba2c0f529a00fcac975b8785c561b058267f9465088"}, + {file = "logfire_api-4.37.0.tar.gz", hash = "sha256:0f62debd6ed593d51307277bd6d5636b57bda07935b5604b96db10fe64441af4"}, +] + [[package]] name = "lxml" version = "6.1.1" @@ -4071,6 +4149,21 @@ vercel = ["vercel (>=0.5.6,<0.6)"] viz = ["graphviz (>=0.17)"] voice = ["numpy (>=2.2.0,<3) ; python_version >= \"3.10\"", "websockets (>=15.0,<17)"] +[[package]] +name = "opentelemetry-api" +version = "1.43.0" +description = "OpenTelemetry Python API" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "opentelemetry_api-1.43.0-py3-none-any.whl", hash = 
"sha256:20acf45e9b21851926835292e4045d290acade1edd2ff3de86d2f069687ba1fd"}, + {file = "opentelemetry_api-1.43.0.tar.gz", hash = "sha256:107d0d03857ea8fc7c5fcbbbd83f800c281f0d560553d61c1d675fccfd1761c1"}, +] + +[package.dependencies] +typing-extensions = ">=4.5.0" + [[package]] name = "orjson" version = "3.11.9" @@ -5003,6 +5096,59 @@ typing-inspection = ">=0.4.2" email = ["email-validator (>=2.0.0)"] timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows\""] +[[package]] +name = "pydantic-ai-slim" +version = "2.0.0" +description = "Agent Framework / shim to use Pydantic with LLMs, slim package" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "pydantic_ai_slim-2.0.0-py3-none-any.whl", hash = "sha256:39979b459a7bc73ae5294c071a7e474f123858a9f7922e9cbb662018d6431198"}, + {file = "pydantic_ai_slim-2.0.0.tar.gz", hash = "sha256:056ea466d67b47a832736ac0f33172264b09da2110dbcce09d26b82772173218"}, +] + +[package.dependencies] +genai-prices = ">=0.0.62" +griffelib = ">=2.0" +httpx = ">=0.27" +openai = {version = ">=2.29.0", optional = true, markers = "extra == \"openai\""} +opentelemetry-api = ">=1.28.0" +pydantic = ">=2.12" +pydantic-graph = "2.0.0" +tiktoken = {version = ">=0.12.0", optional = true, markers = "extra == \"openai\""} +typing-inspection = ">=0.4.0" + +[package.extras] +ag-ui = ["ag-ui-protocol (>=0.1.10)", "starlette (>=0.46.2)"] +anthropic = ["anthropic (>=0.108.0)"] +bedrock = ["boto3 (>=1.42.63)"] +cli = ["argcomplete (>=3.5.0)", "prompt-toolkit (>=3)", "pyperclip (>=1.9.0)", "pyyaml (>=6.0.2)", "rich (>=13)"] +cohere = ["cohere (>=5.20.6) ; platform_system != \"Emscripten\""] +dbos = ["dbos (>=2.10.0)"] +duckduckgo = ["ddgs (>=9.0.0)"] +evals = ["pydantic-evals (==2.0.0)"] +exa = ["exa-py (>=2.0.0)"] +google = ["google-genai (>=1.70.0)"] +groq = ["groq (>=0.25.0)"] +huggingface = ["hf-xet (<1.5.0) ; platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == 
\"AMD64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\"", "huggingface-hub (>=1.3.4,<2.0.0)"] +logfire = ["logfire[httpx] (>=4.16.0)"] +mcp = ["fastmcp-slim[client] (>=3.3.0)"] +mistral = ["mistralai (>=2.0.0,!=2.4.6)"] +openai = ["openai (>=2.29.0)", "tiktoken (>=0.12.0)"] +openrouter = ["openai (>=2.8.0)"] +prefect = ["prefect (>=3.6.13)"] +retries = ["tenacity (>=8.2.3)"] +sentence-transformers = ["sentence-transformers (>=5.2.0) ; python_version < \"3.14\""] +spec = ["pydantic-handlebars (>=0.1.0)", "pyyaml (>=6.0.2)"] +tavily = ["tavily-python (>=0.5.0)"] +temporal = ["temporalio (>=1.24.0)"] +ui = ["starlette (>=0.46.2)"] +voyageai = ["voyageai (>=0.3.7) ; python_version < \"3.14\""] +web = ["httpx (>=0.27.0)", "starlette (>=0.46.2)", "uvicorn (>=0.38.0)"] +web-fetch = ["markdownify (>=1.2)"] +xai = ["xai-sdk (>=1.14.0)"] + [[package]] name = "pydantic-core" version = "2.41.5" @@ -5272,6 +5418,24 @@ files = [ [package.dependencies] typing-extensions = ">=4.14.1" +[[package]] +name = "pydantic-graph" +version = "2.0.0" +description = "Graph and state machine library" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "pydantic_graph-2.0.0-py3-none-any.whl", hash = "sha256:36d69fa01cd316be8584b90eef58bd21675c11c0a081b500a5c4ebe9b68310a5"}, + {file = "pydantic_graph-2.0.0.tar.gz", hash = "sha256:f0bffe84a46a5118bce0824de63d08f3f32ba4dfc1064674f449b07e15128287"}, +] + +[package.dependencies] +httpx = ">=0.27" +logfire-api = ">=3.14.1" +pydantic = ">=2.12" +typing-inspection = ">=0.4.0" + [[package]] name = "pydantic-settings" version = "2.14.1" @@ -7248,6 +7412,18 @@ files = [ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "mypy (>=1.7.0,<1.19) ; platform_python_implementation == \"PyPy\"", "pre-commit", "pytest (>=7.0,<8.2)", "pytest-mock", "pytest-mypy-testing"] +[[package]] +name = "truststore" +version = "0.10.4" +description = "Verify 
certificates using native system trust stores" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "truststore-0.10.4-py3-none-any.whl", hash = "sha256:adaeaecf1cbb5f4de3b1959b42d41f6fab57b2b1666adb59e89cb0b53361d981"}, + {file = "truststore-0.10.4.tar.gz", hash = "sha256:9d91bd436463ad5e4ee4aba766628dd6cd7010cf3e2461756b3303710eebc301"}, +] + [[package]] name = "typeguard" version = "4.5.2" @@ -7823,4 +7999,4 @@ source-archive = ["boto3", "firecrawl-py", "playwright", "trafilatura"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "2c075213be57a94057cbb6ba934e4b0ea8b0df91d052739d2313f6d893a50c0e" +content-hash = "f05b22045cf5780b8de13e6cca605667e5478dcbc45d8c4083e62af9faff1587" diff --git a/pyproject.toml b/pyproject.toml index d15ad580..4dcfecad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ boto3 = {version = ">=1.34,<2.0.0", optional = true} playwright = {version = ">=1.44,<2.0.0", optional = true} firecrawl-py = {version = ">=4.0,<5.0.0", optional = true} trafilatura = {version = ">=1.9,<3.0.0", optional = true} +pydantic-ai-slim = {extras = ["openai"], version = "^2.0.0"} [tool.poetry.extras] source-archive = ["boto3", "playwright", "firecrawl-py", "trafilatura"] diff --git a/run_bots.py b/run_bots.py index b33f1012..a7ad04ac 100644 --- a/run_bots.py +++ b/run_bots.py @@ -32,6 +32,27 @@ from forecasting_tools.forecast_bots.official_bots.uniform_probability_bot import ( UniformProbabilityBot, ) +from forecasting_tools.forecast_bots.public_baselines.center_leaning_bot import ( + CenterLeaningBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.credible_news_bot import ( + CredibleNewsBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.expert_opinion_bot import ( + ExpertOpinionBaselineBot, +) +from forecasting_tools.forecast_bots.public_baselines.left_leaning_bot import ( + LeftLeaningBaselineBot, +) +from 
forecasting_tools.forecast_bots.public_baselines.population_baseline_bot import (
+    PopulationBaselineBot,
+)
+from forecasting_tools.forecast_bots.public_baselines.public_sentiment_bot import (
+    PublicSentimentBaselineBot,
+)
+from forecasting_tools.forecast_bots.public_baselines.right_leaning_bot import (
+    RightLeaningBaselineBot,
+)
 from forecasting_tools.forecast_bots.template_bot import TemplateBot
 from forecasting_tools.helpers.metaculus_api import ApiFilter
 from forecasting_tools.helpers.metaculus_client import MetaculusClient
@@ -442,6 +463,43 @@ def create_bot(
     return default_bot
 
 
+def create_population_baseline_bot(
+    bot_class: type[PopulationBaselineBot],
+    branch_llms: list[GeneralLlm],
+) -> PopulationBaselineBot:
+    """Instantiate a public-baseline bot with the shared baseline settings.
+
+    Args:
+        bot_class: The PopulationBaselineBot subclass to construct.
+        branch_llms: Models forwarded to the bot as ``branch_llms``; the
+            first entry is also registered as the bot's "default" LLM.
+
+    Returns:
+        An instance of ``bot_class``.
+
+    Raises:
+        ValueError: If ``branch_llms`` is empty.
+    """
+    if not branch_llms:
+        # Fail with a clear message instead of a bare IndexError below.
+        raise ValueError("branch_llms must contain at least one LLM")
+    return bot_class(
+        research_reports_per_question=1,
+        branch_llms=branch_llms,
+        use_research_summary_to_forecast=False,
+        publish_reports_to_metaculus=default_for_publish_to_metaculus,
+        skip_previously_forecasted_questions=default_for_skipping_questions,
+        enable_summarize_research=False,
+        llms={
+            "default": branch_llms[0],
+            "summarizer": None,
+            "researcher": "no_research",
+            "parser": structure_output_model,
+        },
+        extra_metadata_in_explanation=True,
+    )
+
+
 def make_claude_thinking_settings(thinking_tokens: int, max_tokens: int) -> dict:
     return {
         "temperature": 1,
@@ -617,7 +675,72 @@ def get_default_bot_dict() -> dict[str, RunBotConfig]: # NOSONAR
         ),
     }
 
+    population_baseline_agent_timeout = 5 * 60
+    population_baseline_branch_llms = [
+        GeneralLlm(
+            model="openrouter/anthropic/claude-sonnet-4.5",
+            temperature=0.3,
+            timeout=population_baseline_agent_timeout,
+        ),
+        GeneralLlm(
+            model="openrouter/x-ai/grok-4.3",
+            temperature=0.3,
+            timeout=population_baseline_agent_timeout,
+        ),
+        GeneralLlm(
+            model="openrouter/z-ai/glm-5.1",
+            temperature=0.3,
+            timeout=population_baseline_agent_timeout,
+        ),
+    ]
+    roughly_population_baseline_cost = (
+        roughly_sonnet_4_cost + 2 * roughly_deepseek_r1_cost
+    )
+
     mode_base_bot_mapping = {
+
############################ Public-baseline bots (June 2026) ############################ + "METAC_PUBLIC_SENTIMENT_BASELINE": { + "estimated_cost_per_question": roughly_population_baseline_cost, + "bot": create_population_baseline_bot( + PublicSentimentBaselineBot, population_baseline_branch_llms + ), + "tournaments": TournConfig.aib_and_site, + }, + "METAC_EXPERT_OPINION_BASELINE": { + "estimated_cost_per_question": roughly_population_baseline_cost, + "bot": create_population_baseline_bot( + ExpertOpinionBaselineBot, population_baseline_branch_llms + ), + "tournaments": TournConfig.aib_and_site, + }, + "METAC_CREDIBLE_NEWS_BASELINE": { + "estimated_cost_per_question": roughly_population_baseline_cost, + "bot": create_population_baseline_bot( + CredibleNewsBaselineBot, population_baseline_branch_llms + ), + "tournaments": TournConfig.aib_and_site, + }, + "METAC_LEFT_LEANING_BASELINE": { + "estimated_cost_per_question": roughly_population_baseline_cost, + "bot": create_population_baseline_bot( + LeftLeaningBaselineBot, population_baseline_branch_llms + ), + "tournaments": TournConfig.aib_and_site, + }, + "METAC_CENTER_LEANING_BASELINE": { + "estimated_cost_per_question": roughly_population_baseline_cost, + "bot": create_population_baseline_bot( + CenterLeaningBaselineBot, population_baseline_branch_llms + ), + "tournaments": TournConfig.aib_and_site, + }, + "METAC_RIGHT_LEANING_BASELINE": { + "estimated_cost_per_question": roughly_population_baseline_cost, + "bot": create_population_baseline_bot( + RightLeaningBaselineBot, population_baseline_branch_llms + ), + "tournaments": TournConfig.aib_and_site, + }, ############################ No-research one-shot bots ############################ "METAC_GPT_5_5_NO_RESEARCH_ONE_SHOT": { "estimated_cost_per_question": roughly_gpt_5_cost,