diff --git a/bench/src/research-gate.mts b/bench/src/research-gate.mts index 343cadc..0b87c8e 100644 --- a/bench/src/research-gate.mts +++ b/bench/src/research-gate.mts @@ -8,11 +8,12 @@ * (parametric control). Pure router HTTP (bearer `TANGLE_API_KEY`) — never touches the * sandbox, so it never contends with sandbox-bound gates. * - * Reuses the kernel primitives (no reinvention): `routerChatWithUsage` (answer + - * real token usage), `runPool` (bounded concurrency), `appendRunRecord` (the durable - * corpus), and the bench's own `adapter.judge`. The AGENTIC HARNESS regime - * (opencode/pi multi-turn in a box) is NOT here — it runs through `runExperiment` / - * `rsi.ts` with `sandboxAgentRun` (backendType opencode|pi|…); this file is only the + * The retrieve→answer body is the shared `runResearchShot` (research-shot.ts) — the SAME + * body the kernel-driven variant uses (research-loop.mts), so this flat best-of-k pool and + * the real-kernel multi-round loop score identical shots. Reuses `runPool` (bounded + * concurrency), `appendRunRecord` (the durable corpus), and the bench's own `adapter.judge`; + * nothing is reinvented. The AGENTIC HARNESS regime (opencode/pi multi-turn in a box) runs + * through `runExperiment` / `rsi.ts` with `sandboxAgentRun`; this file is the flat, * non-agentic search-RAG baseline. 
* * Each shot's answer is graded by the bench judge; writes one corpus RunRecord/task @@ -26,9 +27,8 @@ */ import { ADAPTERS } from './adapters' -import type { BenchTask } from './benchmarks/types' import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus' -import { routerChatWithUsage } from './router-client' +import { runResearchShot, type ShotCfg } from './research-shot' import { runPool } from './run-pool' function must(name: string): string { @@ -37,121 +37,6 @@ function must(name: string): string { return v } -interface ShotCfg { - model: string - /** search provider id: 'default'/'off'/'none' = no search (parametric control); else a router provider. */ - search: string - maxResults: number - /** how many top search URLs to web_fetch full page content for (0 = snippets only). */ - fetchTopK: number - temperature: number - routerBaseUrl: string - routerKey: string - timeoutMs: number -} - -interface Shot { - task: BenchTask - attempt: number - answer: string - ok: boolean - detail?: string - wallMs: number - /** count of search hits retrieved (0 ⇒ no search happened / it failed). */ - searches: number -} - -/** Fetch a URL's extracted page text via the router web_fetch MCP tool. Returns '' on any failure. */ -async function fetchPage(url: string, cfg: ShotCfg): Promise { - try { - const res = await fetch(`${cfg.routerBaseUrl}/search/mcp?provider=${encodeURIComponent(cfg.search)}`, { - method: 'POST', - headers: { 'content-type': 'application/json', authorization: `Bearer ${cfg.routerKey}` }, - body: JSON.stringify({ jsonrpc: '2.0', id: 1, method: 'tools/call', params: { name: 'web_fetch', arguments: { url } } }), - ...(cfg.timeoutMs ? { signal: AbortSignal.timeout(Math.min(cfg.timeoutMs, 60_000)) } : {}), - }) - if (!res.ok) return '' - const body = (await res.json()) as { result?: { content?: Array<{ text?: string }> } } - const text = body.result?.content?.[0]?.text ?? 
'' - // The tool returns a JSON string {url,title,content}; pull `content` if parseable, else the raw text. - try { - const parsed = JSON.parse(text) as { content?: string } - return (parsed.content ?? text).slice(0, 2500) - } catch { - return text.slice(0, 2500) - } - } catch { - return '' - } -} - -/** - * One research rollout, 2-step RAG: (1) provider-pinned web search + web_fetch of the - * top-K pages, (2) answer with that evidence via `routerChatWithUsage`. No tools on the - * answer call, so `content` is always present and arms differ ONLY by the provider's - * evidence — no tool-loop `content:null` tail biasing the search arm. The COMMIT prompt - * stops the model deferring ("may I search?"), which otherwise scores 0. Fault-isolated. - */ -async function runResearchShot(task: BenchTask, attempt: number, cfg: ShotCfg): Promise { - const startedAt = Date.now() - const useSearch = cfg.search !== 'default' && cfg.search !== 'off' && cfg.search !== 'none' - let searches = 0 - try { - // 1) Provider-pinned web search (proven /v1/search). The control arm skips this. - let context = '' - if (useSearch) { - // Query = the clean question (first non-empty line), NOT the whole prompt: the appended - // worker-contract boilerplate pollutes the query and returns 0 hits. - const query = (task.prompt.split('\n').find((l) => l.trim().length > 0) ?? task.prompt).slice(0, 300) - const sres = await fetch(`${cfg.routerBaseUrl}/search?provider=${encodeURIComponent(cfg.search)}`, { - method: 'POST', - headers: { 'content-type': 'application/json', authorization: `Bearer ${cfg.routerKey}` }, - body: JSON.stringify({ query, count: cfg.maxResults }), - ...(cfg.timeoutMs ? { signal: AbortSignal.timeout(cfg.timeoutMs) } : {}), - }) - if (!sres.ok) { - // Surface, never silently degrade to parametric — a failed search must be visible. 
- console.warn(` [search FAIL ${task.id}#${attempt}] HTTP ${sres.status}: ${(await sres.text()).slice(0, 140)}`) - } else { - const sb = (await sres.json()) as { data?: Array<{ title?: string; url?: string; snippet?: string }> } - const hits = sb.data ?? [] - searches = hits.length - // Fetch the full page text of the top-K results (snippets rarely carry exact figures). - const fetched = await Promise.all(hits.slice(0, cfg.fetchTopK).map((h) => (h.url ? fetchPage(h.url, cfg) : Promise.resolve('')))) - context = hits - .map((h, i) => `[${i + 1}] ${h.title ?? ''}\n${h.snippet ?? ''}\n${h.url ?? ''}${fetched[i] ? `\nPAGE CONTENT:\n${fetched[i]}` : ''}`) - .join('\n\n') - } - } - - // 2) Answer — no tools (content always present), COMMIT (no deferral), via the shared - // router primitive (real usage + transient-retry handling, not a hand-rolled fetch). - const commit = - 'You have no further tools and cannot ask questions or request more research. ' + - 'Output a SINGLE, FINAL answer to the question, leading with the value in the exact units and precision requested ' + - '(e.g. "Answer: -47.9 billion USD"). ' + - (useSearch - ? 'Use the WEB SEARCH RESULTS below (snippets + fetched page content) as your primary evidence; cite the source. ' - : 'Answer from your own knowledge. ') + - 'If you are not fully certain, still COMMIT to your single best estimate — never refuse, defer, or reply with a question.' - const userContent = - useSearch && context ? `${task.prompt}\n\n=== WEB SEARCH RESULTS (provider: ${cfg.search}) ===\n${context}` : task.prompt - const { content } = await routerChatWithUsage( - { routerBaseUrl: cfg.routerBaseUrl, routerKey: cfg.routerKey, model: cfg.model }, - [ - { role: 'system', content: commit }, - { role: 'user', content: userContent }, - ], - { temperature: cfg.temperature, ...(cfg.timeoutMs ? 
{ signal: AbortSignal.timeout(cfg.timeoutMs) } : {}) }, - ) - const answer = content.trim() - const ok = answer.length > 0 - return { task, attempt, answer, ok, searches, wallMs: Date.now() - startedAt, ...(ok ? {} : { detail: `empty answer (searches=${searches})` }) } - } catch (err) { - return { task, attempt, answer: '', ok: false, searches, wallMs: Date.now() - startedAt, detail: `rollout error: ${(err instanceof Error ? err.message : String(err)).slice(0, 200)}` } - } -} - async function main(): Promise<void> { const benchName = process.env.BENCH ?? 'finsearchcomp' const makeAdapter = ADAPTERS[benchName] @@ -186,11 +71,11 @@ async function main(): Promise<void> { const units = tasks.flatMap((task) => Array.from({ length: k }, (_, attempt) => ({ task, attempt }))) console.log(`\n▶ phase 1: ${units.length} rollouts (conc=${concurrency}) · search=${search}`) const shots = await runPool(units, concurrency, async (u) => { - const s = await runResearchShot(u.task, u.attempt, cfg) + const s = await runResearchShot(u.task.prompt, u.task.id, u.attempt, cfg) console.log(` rollout ${u.task.id}#${u.attempt}: ${s.ok ? `answer ${s.answer.length}B · ${s.searches} search(es)` : `NO ANSWER (${s.detail})`} (${(s.wallMs / 1000) | 0}s)`) return s }) - const shotOf = (id: string, i: number) => shots.find((o) => o.value?.task.id === id && o.value?.attempt === i)?.value + const shotOf = (id: string, i: number) => shots.find((o) => o.value?.taskId === id && o.value?.attempt === i)?.value // Phase 2 — judge via the bench's OWN judge; write one RunRecord/task (the shared corpus). 
console.log(`\n▶ phase 2: judging via ${adapter.name} judge → ${corpusPath}`) @@ -202,7 +87,7 @@ async function main(): Promise { let sc: { score: number; resolved: boolean } | undefined if (s?.ok) { try { - const v = await adapter.judge(s.task, s.answer) + const v = await adapter.judge(task, s.answer) sc = { score: v.score, resolved: v.resolved } console.log(` judge ${task.id}#${i}: score=${(v.score * 100).toFixed(1)}% resolved=${v.resolved}`) } catch (err) { diff --git a/bench/src/research-loop.mts b/bench/src/research-loop.mts new file mode 100644 index 0000000..0b7dafb --- /dev/null +++ b/bench/src/research-loop.mts @@ -0,0 +1,100 @@ +/** + * Stateful research leaderboard — the research benches run through the REAL kernel + * (`runExperiment` → `runLoop` + `createDynamicDriver`), NOT the flat one-shot RAG pool. + * Same retrieve→answer body as `research-gate.mts` (shared `runResearchShot`), but driven + * over `ROUNDS` with analyst steering: each round the arm reshapes the prompt from the + * prior round's trace, so this is the multi-round, resumable-by-steer DEPTH regime — the + * thing the one-shot leaderboard is not. The executor is router-backed (off-sandbox), so + * search works and the kernel never touches a box (see router-executor.ts). 
+ * + * dotenvx run -f ~/company/devops/secrets/.env.keys -f ~/company/devops/secrets/agent-state.env -- \ + * env BENCH=finsearchcomp MODEL=gpt-4o-mini SEARCH=you N=10 ROUNDS=3 CONCURRENCY=3 \ + * JUDGE_MODEL=gpt-4o-mini CORPUS=/tmp/research-loop-you.jsonl tsx src/research-loop.mts + * tsx src/corpus-report.mts # paired-bootstrap across arms + */ +import { ADAPTERS } from './adapters' +import { type Arm, analystArm, answerOutput, arm, llmAnalyst, randomArm, runExperiment, sandboxAgentRun } from './experiment' +import type { ShotCfg } from './research-shot' +import { routerSandboxClient } from './router-executor' + +function must(name: string): string { + const v = process.env[name] + if (!v) throw new Error(`env ${name} is required`) + return v +} + +async function main(): Promise<void> { + const benchName = process.env.BENCH ?? 'finsearchcomp' + const makeAdapter = ADAPTERS[benchName] + if (!makeAdapter) throw new Error(`unknown BENCH=${benchName} (have: ${Object.keys(ADAPTERS).join(', ')})`) + + const model = process.env.MODEL ?? process.env.WORKER_MODEL ?? 'gpt-4o-mini' + const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1' + const routerKey = must('TANGLE_API_KEY') + const search = process.env.SEARCH ?? 'you' + const rounds = Number(process.env.ROUNDS ?? 3) + const n = Number(process.env.N ?? 10) + const concurrency = Number(process.env.CONCURRENCY ?? 3) + if (!Number.isInteger(rounds) || rounds < 1) throw new Error(`ROUNDS must be a positive integer, got ${process.env.ROUNDS}`) + if (!Number.isInteger(n) || n < 1) throw new Error(`N must be a positive integer, got ${process.env.N}`) + + const cfg: ShotCfg = { + model, + search, + maxResults: Number(process.env.SEARCH_MAX_RESULTS ?? 5), + fetchTopK: Number(process.env.FETCH_TOP_K ?? 3), + temperature: Number(process.env.TEMPERATURE ?? 0.7), + routerBaseUrl, + routerKey, + timeoutMs: process.env.SHOT_TIMEOUT_MS ? 
Number(process.env.SHOT_TIMEOUT_MS) : 600_000, + } + const router = { routerBaseUrl, routerKey, model } + + // The steer policies under test — arm[0] is the compute control (independent retries). + const policies: [Arm, ...Arm[]] = [ + randomArm('blind'), // compute control: ROUNDS independent shots, no steer + analystArm('critical-audit', llmAnalyst(router)), // audit the prior answer, steer the next search+answer + arm('aggressive-push', (root, _h, r) => + r === 0 + ? root + : `${root}\n\nYour prior answer was incomplete or imprecise. Search again with a more specific query, then COMMIT a single more precise final value now.`), + ] + + const adapter = makeAdapter() + console.log( + `=== research LOOP (router executor · real kernel) · bench=${benchName} · model=${model} · search=${search} · N=${n} ROUNDS=${rounds} conc=${concurrency} ===`, + ) + await adapter.preflight() + + const corpus = process.env.CORPUS ?? `${process.cwd()}/corpus/research-loop-${adapter.name}-${search}.jsonl` + const r = await runExperiment({ + adapter, + sandboxClient: routerSandboxClient(cfg), + agentRun: sandboxAgentRun({ model, routerBaseUrl, routerKey }), + arms: policies, + model, + rounds, + n, + ids: process.env.IDS ? process.env.IDS.split(',') : undefined, + concurrency, + output: answerOutput, + corpusPath: corpus, + }) + + const pct = (x: number) => (r.n > 0 ? `${((x / r.n) * 100).toFixed(1)}%` : 'n/a') + console.log(`\n=== ${adapter.name}: ${r.arms.length} policies × rounds=${rounds} (clean n=${r.n}, excluded ${r.errored}) ===`) + console.log(` blind (1 round): ${pct(r.blind)}`) + for (const a of r.arms) { + const tag = + a.label === r.arms[0]?.label + ? 
' <- compute control' + : ` delta vs control ${((a.deltaVsControl / Math.max(r.n, 1)) * 100).toFixed(1)}pp` + console.log(` ${a.label}@${rounds}: ${pct(a.resolved)}${tag}`) + } + console.log(`corpus: ${corpus} -> paired CI + BH via: tsx src/corpus-report.mts ${corpus}`) +} + +main().catch((err) => { + console.error(err instanceof Error ? (err.stack ?? err.message) : String(err)) + process.exit(1) +}) diff --git a/bench/src/research-shot.ts b/bench/src/research-shot.ts new file mode 100644 index 0000000..55ec113 --- /dev/null +++ b/bench/src/research-shot.ts @@ -0,0 +1,134 @@ +/** + * One research rollout as a reusable primitive: 2-step RAG — (1) provider-pinned web + * search via the router's proven `/v1/search?provider=` + `web_fetch` of the top-K + * result pages, (2) answer with that evidence via `routerChatWithUsage` (no tools on the + * answer call → `content` always present, so a search arm differs from the parametric + * control ONLY by the evidence). Pure router HTTP (bearer `TANGLE_API_KEY`). + * + * Shared by the off-sandbox RAG leaderboard (`research-gate.mts`) and the router-backed + * loop executor (`router-executor.ts`), so both score the identical retrieve→answer body + * — the only difference is who drives the rounds (a flat best-of-k pool vs the real + * `runLoop` kernel with analyst steering). + */ +import { routerChatWithUsage } from './router-client' + +export interface ShotCfg { + model: string + /** search provider id: 'default'/'off'/'none' = no search (parametric control); else a router provider. */ + search: string + maxResults: number + /** how many top search URLs to web_fetch full page content for (0 = snippets only). */ + fetchTopK: number + temperature: number + routerBaseUrl: string + routerKey: string + timeoutMs: number +} + +export interface Shot { + taskId: string + attempt: number + answer: string + ok: boolean + detail?: string + wallMs: number + /** count of search hits retrieved (0 ⇒ no search happened / it failed). 
*/ + searches: number +} + +/** Fetch a URL's extracted page text via the router web_fetch MCP tool. Returns '' on any failure. */ +export async function fetchPage(url: string, cfg: ShotCfg): Promise<string> { + try { + const res = await fetch(`${cfg.routerBaseUrl}/search/mcp?provider=${encodeURIComponent(cfg.search)}`, { + method: 'POST', + headers: { 'content-type': 'application/json', authorization: `Bearer ${cfg.routerKey}` }, + body: JSON.stringify({ jsonrpc: '2.0', id: 1, method: 'tools/call', params: { name: 'web_fetch', arguments: { url } } }), + ...(cfg.timeoutMs ? { signal: AbortSignal.timeout(Math.min(cfg.timeoutMs, 60_000)) } : {}), + }) + if (!res.ok) return '' + const body = (await res.json()) as { result?: { content?: Array<{ text?: string }> } } + const text = body.result?.content?.[0]?.text ?? '' + // The tool returns a JSON string {url,title,content}; pull `content` if parseable, else the raw text. + try { + const parsed = JSON.parse(text) as { content?: string } + return (parsed.content ?? text).slice(0, 2500) + } catch { + return text.slice(0, 2500) + } + } catch { + return '' + } +} + +/** + * One research rollout, 2-step RAG against `prompt` (the task question, possibly with a + * steer appended): (1) provider-pinned web search + web_fetch of the top-K pages, (2) + * answer with that evidence. The search query is the clean question (first non-empty + * line) — appended worker-contract / steer boilerplate pollutes the query and returns 0 + * hits — while the ANSWER sees the full `prompt` so a steer round can act on it. The + * COMMIT prompt stops the model deferring ("may I search?"), which otherwise scores 0. + * Fault-isolated: a flaky call → a NO-ANSWER `Shot`, never a throw. 
+ */ +export async function runResearchShot(prompt: string, taskId: string, attempt: number, cfg: ShotCfg): Promise<Shot> { + const startedAt = Date.now() + const useSearch = cfg.search !== 'default' && cfg.search !== 'off' && cfg.search !== 'none' + let searches = 0 + try { + // 1) Provider-pinned web search (proven /v1/search). The control arm skips this. + let context = '' + if (useSearch) { + const query = (prompt.split('\n').find((l) => l.trim().length > 0) ?? prompt).slice(0, 300) + const sres = await fetch(`${cfg.routerBaseUrl}/search?provider=${encodeURIComponent(cfg.search)}`, { + method: 'POST', + headers: { 'content-type': 'application/json', authorization: `Bearer ${cfg.routerKey}` }, + body: JSON.stringify({ query, count: cfg.maxResults }), + ...(cfg.timeoutMs ? { signal: AbortSignal.timeout(cfg.timeoutMs) } : {}), + }) + if (!sres.ok) { + // Surface, never silently degrade to parametric — a failed search must be visible. + console.warn(` [search FAIL ${taskId}#${attempt}] HTTP ${sres.status}: ${(await sres.text()).slice(0, 140)}`) + } else { + const sb = (await sres.json()) as { data?: Array<{ title?: string; url?: string; snippet?: string }> } + const hits = sb.data ?? [] + searches = hits.length + // Fetch the full page text of the top-K results (snippets rarely carry exact figures). + const fetched = await Promise.all(hits.slice(0, cfg.fetchTopK).map((h) => (h.url ? fetchPage(h.url, cfg) : Promise.resolve('')))) + context = hits + .map((h, i) => `[${i + 1}] ${h.title ?? ''}\n${h.snippet ?? ''}\n${h.url ?? ''}${fetched[i] ? `\nPAGE CONTENT:\n${fetched[i]}` : ''}`) + .join('\n\n') + } + } + + // 2) Answer — no tools (content always present), COMMIT (no deferral), via the shared router primitive. + const commit = + 'You have no further tools and cannot ask questions or request more research. ' + + 'Output a SINGLE, FINAL answer to the question, leading with the value in the exact units and precision requested ' + + '(e.g. "Answer: -47.9 billion USD"). 
' + + (useSearch + ? 'Use the WEB SEARCH RESULTS below (snippets + fetched page content) as your primary evidence; cite the source. ' + : 'Answer from your own knowledge. ') + + 'If you are not fully certain, still COMMIT to your single best estimate — never refuse, defer, or reply with a question.' + const userContent = useSearch && context ? `${prompt}\n\n=== WEB SEARCH RESULTS (provider: ${cfg.search}) ===\n${context}` : prompt + const { content } = await routerChatWithUsage( + { routerBaseUrl: cfg.routerBaseUrl, routerKey: cfg.routerKey, model: cfg.model }, + [ + { role: 'system', content: commit }, + { role: 'user', content: userContent }, + ], + { temperature: cfg.temperature, ...(cfg.timeoutMs ? { signal: AbortSignal.timeout(cfg.timeoutMs) } : {}) }, + ) + const answer = content.trim() + const ok = answer.length > 0 + return { taskId, attempt, answer, ok, searches, wallMs: Date.now() - startedAt, ...(ok ? {} : { detail: `empty answer (searches=${searches})` }) } + } catch (err) { + return { + taskId, + attempt, + answer: '', + ok: false, + searches, + wallMs: Date.now() - startedAt, + detail: `rollout error: ${(err instanceof Error ? err.message : String(err)).slice(0, 200)}`, + } + } +} diff --git a/bench/src/router-executor.ts b/bench/src/router-executor.ts new file mode 100644 index 0000000..f92ecda --- /dev/null +++ b/bench/src/router-executor.ts @@ -0,0 +1,41 @@ +/** + * Router-backed `LoopSandboxClient` — the "router" cost-dial backend the one-flow header + * names (experiment.ts: "backend = the injected LoopSandboxClient (router / local-bridge / + * sandbox)"). Each box the kernel provisions runs ONE research shot per `streamPrompt` + * (router web-search + answer, off-sandbox) and emits the terminal `{ finalText }` event + * `answerOutput` already parses. 
+ * + * Why off-sandbox: research is retrieval, not in-box code execution — it never needed a + * box, and a real sandbox box reaches only the router (egress allowlist, ops-board #976), + * so it cannot web-search natively anyway. Routing the executor through the router instead + * of a box lets the REAL `runLoop` kernel drive research with full `rounds` + analyst + * steering (multi-round, resumable-by-steer — the depth regime), search working, with no + * sandbox dependency. + * + * No `criuStatus`/`describePlacement` ⇒ the kernel uses a fresh executor per iteration + * (no fork, no live box across rounds); statefulness comes from the loop's across-round + * steer (the arm reshapes round N's prompt from round N-1's trace), not a live session. + */ +import type { LoopSandboxClient } from '@tangle-network/agent-runtime/loops' +import type { CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox' +import { runResearchShot, type ShotCfg } from './research-shot' + +export function routerSandboxClient(cfg: ShotCfg): LoopSandboxClient { + let seq = 0 + return { + async create(_options?: CreateSandboxOptions): Promise<SandboxInstance> { + const id = `router-research-${seq++}` + return { + id, + async *streamPrompt(message: string): AsyncGenerator<SandboxEvent> { + const shot = await runResearchShot(message, id, 0, cfg) + yield { + type: 'result', + data: { finalText: shot.answer, success: shot.ok, searches: shot.searches }, + } satisfies SandboxEvent + }, + async delete(): Promise<void> {}, + } as unknown as SandboxInstance + }, + } +}