tangle-network · drewstone · Jun 7, 2026 · Jun 6, 2026 · Jun 6, 2026 · Jun 7, 2026
diff --git a/bench/src/research-gate.mts b/bench/src/research-gate.mts
@@ -8,11 +8,12 @@
  * (parametric control). Pure router HTTP (bearer `TANGLE_API_KEY`) — never touches the
  * sandbox, so it never contends with sandbox-bound gates.
  *
- * Reuses the kernel primitives (no reinvention): `routerChatWithUsage` (answer +
- * real token usage), `runPool` (bounded concurrency), `appendRunRecord` (the durable
- * corpus), and the bench's own `adapter.judge`. The AGENTIC HARNESS regime
- * (opencode/pi multi-turn in a box) is NOT here — it runs through `runExperiment` /
- * `rsi.ts` with `sandboxAgentRun` (backendType opencode|pi|…); this file is only the
+ * The retrieve→answer body is the shared `runResearchShot` (research-shot.ts) — the SAME
+ * body the kernel-driven variant uses (research-loop.mts), so this flat best-of-k pool and
+ * the real-kernel multi-round loop score identical shots. Reuses `runPool` (bounded
+ * concurrency), `appendRunRecord` (the durable corpus), and the bench's own `adapter.judge`;
+ * nothing is reinvented. The AGENTIC HARNESS regime (opencode/pi multi-turn in a box) runs
+ * through `runExperiment` / `rsi.ts` with `sandboxAgentRun`; this file is the flat,
  * non-agentic search-RAG baseline.
  *
  * Each shot's answer is graded by the bench judge; writes one corpus RunRecord/task
@@ -26,9 +27,8 @@
  */
 
 import { ADAPTERS } from './adapters'
-import type { BenchTask } from './benchmarks/types'
 import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus'
-import { routerChatWithUsage } from './router-client'
+import { runResearchShot, type ShotCfg } from './research-shot'
 import { runPool } from './run-pool'
 
 function must(name: string): string {
@@ -37,121 +37,6 @@ function must(name: string): string {
   return v
 }
 
-interface ShotCfg {
-  model: string
-  /** search provider id: 'default'/'off'/'none' = no search (parametric control); else a router provider. */
-  search: string
-  maxResults: number
-  /** how many top search URLs to web_fetch full page content for (0 = snippets only). */
-  fetchTopK: number
-  temperature: number
-  routerBaseUrl: string
-  routerKey: string
-  timeoutMs: number
-}
-
-interface Shot {
-  task: BenchTask
-  attempt: number
-  answer: string
-  ok: boolean
-  detail?: string
-  wallMs: number
-  /** count of search hits retrieved (0 ⇒ no search happened / it failed). */
-  searches: number
-}
-
-/** Fetch a URL's extracted page text via the router web_fetch MCP tool. Returns '' on any failure. */
-async function fetchPage(url: string, cfg: ShotCfg): Promise<string> {
-  try {
-    const res = await fetch(`${cfg.routerBaseUrl}/search/mcp?provider=${encodeURIComponent(cfg.search)}`, {
-      method: 'POST',
-      headers: { 'content-type': 'application/json', authorization: `Bearer ${cfg.routerKey}` },
-      body: JSON.stringify({ jsonrpc: '2.0', id: 1, method: 'tools/call', params: { name: 'web_fetch', arguments: { url } } }),
-      ...(cfg.timeoutMs ? { signal: AbortSignal.timeout(Math.min(cfg.timeoutMs, 60_000)) } : {}),
-    })
-    if (!res.ok) return ''
-    const body = (await res.json()) as { result?: { content?: Array<{ text?: string }> } }
-    const text = body.result?.content?.[0]?.text ?? ''
-    // The tool returns a JSON string {url,title,content}; pull `content` if parseable, else the raw text.
-    try {
-      const parsed = JSON.parse(text) as { content?: string }
-      return (parsed.content ?? text).slice(0, 2500)
-    } catch {
-      return text.slice(0, 2500)
-    }
-  } catch {
-    return ''
-  }
-}
-
-/**
- * One research rollout, 2-step RAG: (1) provider-pinned web search + web_fetch of the
- * top-K pages, (2) answer with that evidence via `routerChatWithUsage`. No tools on the
- * answer call, so `content` is always present and arms differ ONLY by the provider's
- * evidence — no tool-loop `content:null` tail biasing the search arm. The COMMIT prompt
- * stops the model deferring ("may I search?"), which otherwise scores 0. Fault-isolated.
- */
-async function runResearchShot(task: BenchTask, attempt: number, cfg: ShotCfg): Promise<Shot> {
-  const startedAt = Date.now()
-  const useSearch = cfg.search !== 'default' && cfg.search !== 'off' && cfg.search !== 'none'
-  let searches = 0
-  try {
-    // 1) Provider-pinned web search (proven /v1/search). The control arm skips this.
-    let context = ''
-    if (useSearch) {
-      // Query = the clean question (first non-empty line), NOT the whole prompt: the appended
-      // worker-contract boilerplate pollutes the query and returns 0 hits.
-      const query = (task.prompt.split('\n').find((l) => l.trim().length > 0) ?? task.prompt).slice(0, 300)
-      const sres = await fetch(`${cfg.routerBaseUrl}/search?provider=${encodeURIComponent(cfg.search)}`, {
-        method: 'POST',
-        headers: { 'content-type': 'application/json', authorization: `Bearer ${cfg.routerKey}` },
-        body: JSON.stringify({ query, count: cfg.maxResults }),
-        ...(cfg.timeoutMs ? { signal: AbortSignal.timeout(cfg.timeoutMs) } : {}),
-      })
-      if (!sres.ok) {
-        // Surface, never silently degrade to parametric — a failed search must be visible.
-        console.warn(`  [search FAIL ${task.id}#${attempt}] HTTP ${sres.status}: ${(await sres.text()).slice(0, 140)}`)
-      } else {
-        const sb = (await sres.json()) as { data?: Array<{ title?: string; url?: string; snippet?: string }> }
-        const hits = sb.data ?? []
-        searches = hits.length
-        // Fetch the full page text of the top-K results (snippets rarely carry exact figures).
-        const fetched = await Promise.all(hits.slice(0, cfg.fetchTopK).map((h) => (h.url ? fetchPage(h.url, cfg) : Promise.resolve(''))))
-        context = hits
-          .map((h, i) => `[${i + 1}] ${h.title ?? ''}\n${h.snippet ?? ''}\n${h.url ?? ''}${fetched[i] ? `\nPAGE CONTENT:\n${fetched[i]}` : ''}`)
-          .join('\n\n')
-      }
-    }
-
-    // 2) Answer — no tools (content always present), COMMIT (no deferral), via the shared
-    //    router primitive (real usage + transient-retry handling, not a hand-rolled fetch).
-    const commit =
-      'You have no further tools and cannot ask questions or request more research. ' +
-      'Output a SINGLE, FINAL answer to the question, leading with the value in the exact units and precision requested ' +
-      '(e.g. "Answer: -47.9 billion USD"). ' +
-      (useSearch
-        ? 'Use the WEB SEARCH RESULTS below (snippets + fetched page content) as your primary evidence; cite the source. '
-        : 'Answer from your own knowledge. ') +
-      'If you are not fully certain, still COMMIT to your single best estimate — never refuse, defer, or reply with a question.'
-    const userContent =
-      useSearch && context ? `${task.prompt}\n\n=== WEB SEARCH RESULTS (provider: ${cfg.search}) ===\n${context}` : task.prompt
-    const { content } = await routerChatWithUsage(
-      { routerBaseUrl: cfg.routerBaseUrl, routerKey: cfg.routerKey, model: cfg.model },
-      [
-        { role: 'system', content: commit },
-        { role: 'user', content: userContent },
-      ],
-      { temperature: cfg.temperature, ...(cfg.timeoutMs ? { signal: AbortSignal.timeout(cfg.timeoutMs) } : {}) },
-    )
-    const answer = content.trim()
-    const ok = answer.length > 0
-    return { task, attempt, answer, ok, searches, wallMs: Date.now() - startedAt, ...(ok ? {} : { detail: `empty answer (searches=${searches})` }) }
-  } catch (err) {
-    return { task, attempt, answer: '', ok: false, searches, wallMs: Date.now() - startedAt, detail: `rollout error: ${(err instanceof Error ? err.message : String(err)).slice(0, 200)}` }
-  }
-}
-
 async function main(): Promise<void> {
   const benchName = process.env.BENCH ?? 'finsearchcomp'
   const makeAdapter = ADAPTERS[benchName]
@@ -186,11 +71,11 @@ async function main(): Promise<void> {
   const units = tasks.flatMap((task) => Array.from({ length: k }, (_, attempt) => ({ task, attempt })))
   console.log(`\n▶ phase 1: ${units.length} rollouts (conc=${concurrency}) · search=${search}`)
   const shots = await runPool(units, concurrency, async (u) => {
-    const s = await runResearchShot(u.task, u.attempt, cfg)
+    const s = await runResearchShot(u.task.prompt, u.task.id, u.attempt, cfg)
     console.log(`  rollout ${u.task.id}#${u.attempt}: ${s.ok ? `answer ${s.answer.length}B · ${s.searches} search(es)` : `NO ANSWER (${s.detail})`} (${(s.wallMs / 1000) | 0}s)`)
     return s
   })
-  const shotOf = (id: string, i: number) => shots.find((o) => o.value?.task.id === id && o.value?.attempt === i)?.value
+  const shotOf = (id: string, i: number) => shots.find((o) => o.value?.taskId === id && o.value?.attempt === i)?.value
 
   // Phase 2 — judge via the bench's OWN judge; write one RunRecord/task (the shared corpus).
   console.log(`\n▶ phase 2: judging via ${adapter.name} judge → ${corpusPath}`)
@@ -202,7 +87,7 @@ async function main(): Promise<void> {
       let sc: { score: number; resolved: boolean } | undefined
       if (s?.ok) {
         try {
-          const v = await adapter.judge(s.task, s.answer)
+          const v = await adapter.judge(task, s.answer)
           sc = { score: v.score, resolved: v.resolved }
           console.log(`  judge ${task.id}#${i}: score=${(v.score * 100).toFixed(1)}% resolved=${v.resolved}`)
         } catch (err) {

diff --git a/bench/src/research-loop.mts b/bench/src/research-loop.mts
@@ -0,0 +1,100 @@
+/**
+ * Stateful research leaderboard — the research benches run through the REAL kernel
+ * (`runExperiment` → `runLoop` + `createDynamicDriver`), NOT the flat one-shot RAG pool.
+ * Same retrieve→answer body as `research-gate.mts` (shared `runResearchShot`), but driven
+ * over `ROUNDS` with analyst steering: each round the arm reshapes the prompt from the
+ * prior round's trace, so this is the multi-round, resumable-by-steer DEPTH regime — the
+ * thing the one-shot leaderboard is not. The executor is router-backed (off-sandbox), so
+ * search works and the kernel never touches a box (see router-executor.ts).
+ *
+ *   dotenvx run -f ~/company/devops/secrets/.env.keys -f ~/company/devops/secrets/agent-state.env -- \
+ *     env BENCH=finsearchcomp MODEL=gpt-4o-mini SEARCH=you N=10 ROUNDS=3 CONCURRENCY=3 \
+ *     JUDGE_MODEL=gpt-4o-mini CORPUS=/tmp/research-loop-you.jsonl tsx src/research-loop.mts
+ *   tsx src/corpus-report.mts <armA.jsonl> <armB.jsonl>   # paired-bootstrap across arms
+ */
+import { ADAPTERS } from './adapters'
+import { type Arm, analystArm, answerOutput, arm, llmAnalyst, randomArm, runExperiment, sandboxAgentRun } from './experiment'
+import type { ShotCfg } from './research-shot'
+import { routerSandboxClient } from './router-executor'
+
+function must(name: string): string {
+  const value = process.env[name] // required setting, looked up by its env var name
+  if (value) return value // an empty string counts as missing, same as unset
+  throw new Error(`env ${name} is required`)
+}
+
+/** Entry point: env config → `runExperiment` over the steer-policy arms → per-arm summary; throws on invalid config. */
+async function main(): Promise<void> {
+  const benchName = process.env.BENCH ?? 'finsearchcomp'
+  const makeAdapter = ADAPTERS[benchName]
+  if (!makeAdapter) throw new Error(`unknown BENCH=${benchName} (have: ${Object.keys(ADAPTERS).join(', ')})`)
+
+  const model = process.env.MODEL ?? process.env.WORKER_MODEL ?? 'gpt-4o-mini'
+  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
+  const routerKey = must('TANGLE_API_KEY')
+  const search = process.env.SEARCH ?? 'you'
+  const rounds = Number(process.env.ROUNDS ?? 3)
+  const n = Number(process.env.N ?? 10)
+  const concurrency = Number(process.env.CONCURRENCY ?? 3) // validated below together with ROUNDS and N
+  for (const [name, value] of [['ROUNDS', rounds], ['N', n], ['CONCURRENCY', concurrency]] as const)
+    if (!Number.isInteger(value) || value < 1) throw new Error(`${name} must be a positive integer, got ${process.env[name]}`)
+
+  const cfg: ShotCfg = {
+    model,
+    search,
+    maxResults: Number(process.env.SEARCH_MAX_RESULTS ?? 5),
+    fetchTopK: Number(process.env.FETCH_TOP_K ?? 3),
+    temperature: Number(process.env.TEMPERATURE ?? 0.7),
+    routerBaseUrl,
+    routerKey,
+    timeoutMs: process.env.SHOT_TIMEOUT_MS ? Number(process.env.SHOT_TIMEOUT_MS) : 600_000,
+  }
+
+  // The steer policies under test — arm[0] is the compute control (independent retries).
+  const policies: [Arm, ...Arm[]] = [
+    randomArm('blind'), // compute control: ROUNDS independent shots, no steer
+    analystArm('critical-audit', llmAnalyst({ routerBaseUrl, routerKey, model })), // audit the prior answer, steer the next search+answer
+    arm('aggressive-push', (root, _h, r) =>
+      r === 0
+        ? root
+        : `${root}\n\nYour prior answer was incomplete or imprecise. Search again with a more specific query, then COMMIT a single more precise final value now.`),
+  ]
+
+  const adapter = makeAdapter()
+  console.log(
+    `=== research LOOP (router executor · real kernel) · bench=${benchName} · model=${model} · search=${search} · N=${n} ROUNDS=${rounds} conc=${concurrency} ===`,
+  )
+  await adapter.preflight()
+
+  const corpus = process.env.CORPUS ?? `${process.cwd()}/corpus/research-loop-${adapter.name}-${search}.jsonl`
+  const r = await runExperiment({
+    adapter,
+    sandboxClient: routerSandboxClient(cfg),
+    agentRun: sandboxAgentRun({ model, routerBaseUrl, routerKey }),
+    arms: policies,
+    model,
+    rounds,
+    n,
+    ids: process.env.IDS ? process.env.IDS.split(',') : undefined,
+    concurrency,
+    output: answerOutput,
+    corpusPath: corpus,
+  })
+
+  const pct = (x: number) => (r.n > 0 ? `${((x / r.n) * 100).toFixed(1)}%` : 'n/a') // x as a percentage of r.n; 'n/a' when r.n is 0
+  console.log(`\n=== ${adapter.name}: ${r.arms.length} policies × rounds=${rounds} (clean n=${r.n}, excluded ${r.errored}) ===`)
+  console.log(`  blind (1 round): ${pct(r.blind)}`)
+  for (const a of r.arms) {
+    const tag =
+      a.label === r.arms[0]?.label
+        ? '  <- compute control'
+        : `  delta vs control ${((a.deltaVsControl / Math.max(r.n, 1)) * 100).toFixed(1)}pp`
+    console.log(`  ${a.label}@${rounds}: ${pct(a.resolved)}${tag}`)
+  }
+  console.log(`corpus: ${corpus}  ->  paired CI + BH via: tsx src/corpus-report.mts ${corpus}`)
+}
+
+main().catch((failure: unknown) => {
+  console.error(failure instanceof Error ? (failure.stack ?? failure.message) : String(failure)) // stack first, then message, else stringified value
+  process.exit(1) // non-zero exit code for any rejection from main()
+})