From 9a960effb69a5dac895d9a8ecc4d9197e9cdd5a2 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 7 Jun 2026 10:02:31 -0600 Subject: [PATCH] feat(campaign): preflightModels/assertModelsServed + building-doctrine doc --- CLAUDE.md | 2 + docs/building-doctrine.md | 42 ++++++ src/index.ts | 11 ++ src/integrity/preflight.test.ts | 254 ++++++++++++++++++++++++++++++++ src/integrity/preflight.ts | 192 ++++++++++++++++++++++++ 5 files changed, 501 insertions(+) create mode 100644 docs/building-doctrine.md create mode 100644 src/integrity/preflight.test.ts create mode 100644 src/integrity/preflight.ts diff --git a/CLAUDE.md b/CLAUDE.md index e131603b..f2067e02 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -7,6 +7,8 @@ Two docs, two audiences: Wire-protocol consumers (any language other than TypeScript) → [`docs/wire-protocol.md`](./docs/wire-protocol.md) and [`clients/python/README.md`](./clients/python/README.md). +How fleet agents that consume this substrate are built (reachable defaults, platform-first debugging, experiment integrity) → [`docs/building-doctrine.md`](./docs/building-doctrine.md). + Update the doc closest to the change. Don't duplicate content across docs; cross-link. ## Tech stack (unchanging) diff --git a/docs/building-doctrine.md b/docs/building-doctrine.md new file mode 100644 index 00000000..105e2f32 --- /dev/null +++ b/docs/building-doctrine.md @@ -0,0 +1,42 @@ +# Building doctrine + +How every fleet agent that consumes `agent-eval` is built. Each rule is mechanical: a primitive or test makes the rule enforceable rather than aspirational. For the mental model and primitives this references, see [`concepts.md`](./concepts.md) and the [`/contract`](../src/contract/index.ts) surface. + +## 1. Defaults must be provably reachable + +Every hard-coded model id or endpoint default is verifiable against the live router. Membership in `{baseUrl}/models` is the free check; an optional 1-token probe per model confirms the router will actually serve it. 
A default the router cannot serve is a config bug caught before the run, not a runtime surprise that silently degrades into a stub. Backend ids are namespaced by binding: cli-bridge ids (`claude-code/*`, `kimi-code/*`, `opencode/*`) never appear as defaults in code reachable from production — bridge use is an explicit env opt-in, never an implicit fallback. + +Enforced by: `preflightModels` (membership + optional probe) and `assertModelsServed` (gate that names every unreachable id with status + detail). + +## 2. Probe the platform before peeling client layers + +When a request fails, one direct call against the live endpoint bisects platform-versus-client before any code-level debugging begins. A 401 from the router on a `model_not_found` is the platform telling you the default is dead; a connection refused is the platform being unreachable. Establish which side is at fault with a probe first, then debug only the side that is actually broken. + +Enforced by: `preflightModels({ probe: true })` — the probe is the platform-side bisection, carrying the router's own `error.message` back to the caller. + +## 3. Agent-produced findings are hypotheses + +Enumeration of candidate problems may fan out to agents, but agent output is not evidence. Truth comes from probes against ground truth, not from an agent's assertion. Every classification carries quoted evidence, and nothing unverified is merged or reported as fact. A confident-sounding agent claim with no probe behind it is a hypothesis awaiting falsification. + +Enforced by: `assertRealBackend` over the resulting `RunRecord[]` — an agent that claims success while the backend was never called reads as a stub, not a pass. + +## 4. Experiment integrity checklist + +Any lift or benchmark claim satisfies all of the following before it is reported: + +- A frozen, disjoint held-out set, spent exactly once, after candidate selection. +- The propose and selection steps never see held-out data. 
+- The paired bootstrap confidence interval excludes zero for a "ship" or "match" verdict. +- The same scorer and the same items on both sides of any comparison. +- A leakage check from builder inputs into the evaluation set. +- Cross-family judge panels, with inter-rater reliability reported and gated. +- Missing evidence is never scored as zero — fail loud over fabricate. +- No optional stopping: the stopping rule is fixed before the run. + +Enforced by: `pairedBootstrap` (CI), `assertCrossFamily` (panel diversity), `interRaterReliability` (agreement), and `assertRealBackend` (no stub run masquerading as a result). + +## 5. Fix the class, not the instance + +A drifted default is the symptom of a missing convention. The fix ships the convention and its guard alongside the one-line correction, so the same drift cannot recur silently. Patching the single dead id without adding the preflight gate leaves the class open; the next default rots the same way. + +Enforced by: `assertModelsServed` wired into the campaign preflight — the guard that turns "this one model was dead" into "no campaign spends tokens against an unreachable default." diff --git a/src/index.ts b/src/index.ts index b08db128..15e0a1bb 100644 --- a/src/index.ts +++ b/src/index.ts @@ -226,6 +226,17 @@ export { BackendIntegrityError, summarizeBackendIntegrity, } from './integrity/backend-integrity' +// Pre-hoc complement to assertRealBackend: verify the campaign's models are +// served by the router BEFORE spending tokens, so a dead default surfaces as a +// config error instead of a stub run. 
+export { + assertModelsServed, + type ModelPreflight, + ModelsUnreachableError, + type PreflightModelsOptions, + type PreflightOutcome, + preflightModels, +} from './integrity/preflight' export { type AssertSingleBackendOptions, assertSingleBackend, diff --git a/src/integrity/preflight.test.ts b/src/integrity/preflight.test.ts new file mode 100644 index 00000000..a99e943b --- /dev/null +++ b/src/integrity/preflight.test.ts @@ -0,0 +1,254 @@ +import { describe, expect, it } from 'vitest' +import { assertModelsServed, ModelsUnreachableError, preflightModels } from './preflight' + +const BASE = 'https://router.tangle.tools/v1' +const KEY = 'test-key' + +function listResponse(ids: string[]): Response { + return new Response(JSON.stringify({ data: ids.map((id) => ({ id })) }), { + status: 200, + headers: { 'content-type': 'application/json' }, + }) +} + +/** Build a fetch fake whose chat-completions responses are keyed by model id. */ +function makeFetch( + listedIds: string[], + probeByModel: Record = {}, +): typeof fetch { + return (async (input: RequestInfo | URL, init?: RequestInit) => { + const url = String(input) + if (url.endsWith('/models')) return listResponse(listedIds) + if (url.endsWith('/chat/completions')) { + const model = JSON.parse(String(init?.body)).model as string + const spec = probeByModel[model] ?? { status: 200 } + return new Response(spec.body === undefined ? 
'{}' : JSON.stringify(spec.body), { + status: spec.status, + headers: { 'content-type': 'application/json' }, + }) + } + throw new Error(`unexpected url ${url}`) + }) as typeof fetch +} + +describe('preflightModels — membership only', () => { + it('marks listed vs unlisted models, served null when not probed', async () => { + const out = await preflightModels({ + baseUrl: BASE, + apiKey: KEY, + models: ['claude-sonnet-4-6', 'opencode/zai-coding-plan/glm-5.1'], + fetchImpl: makeFetch(['claude-sonnet-4-6', 'deepseek-v4-pro']), + }) + expect(out.succeeded).toBe(true) + expect(out.error).toBeNull() + expect(out.value).toEqual([ + { model: 'claude-sonnet-4-6', listed: true, served: null, status: null, detail: null }, + { + model: 'opencode/zai-coding-plan/glm-5.1', + listed: false, + served: null, + status: null, + detail: null, + }, + ]) + }) + + it('tolerates a trailing slash on baseUrl', async () => { + const out = await preflightModels({ + baseUrl: `${BASE}/`, + apiKey: KEY, + models: ['claude-haiku-4-5'], + fetchImpl: makeFetch(['claude-haiku-4-5']), + }) + expect(out.value?.[0]?.listed).toBe(true) + }) +}) + +describe('preflightModels — probe', () => { + it('served true on 200', async () => { + const out = await preflightModels({ + baseUrl: BASE, + apiKey: KEY, + models: ['claude-sonnet-4-6'], + probe: true, + fetchImpl: makeFetch(['claude-sonnet-4-6'], { 'claude-sonnet-4-6': { status: 200 } }), + }) + expect(out.value).toEqual([ + { model: 'claude-sonnet-4-6', listed: true, served: true, status: 200, detail: null }, + ]) + }) + + it('served false on 401 and captures the body error.message as detail', async () => { + const out = await preflightModels({ + baseUrl: BASE, + apiKey: KEY, + models: ['opencode/zai-coding-plan/glm-5.1'], + probe: true, + fetchImpl: makeFetch([], { + 'opencode/zai-coding-plan/glm-5.1': { + status: 401, + body: { + message: 'No API key configured for model opencode/zai-coding-plan/glm-5.1', + code: 'model_not_found', + }, + }, + }), + }) + 
expect(out.value).toEqual([ + { + model: 'opencode/zai-coding-plan/glm-5.1', + listed: false, + served: false, + status: 401, + detail: 'No API key configured for model opencode/zai-coding-plan/glm-5.1', + }, + ]) + }) + + it('served false on 503 with no usable body message', async () => { + const out = await preflightModels({ + baseUrl: BASE, + apiKey: KEY, + models: ['deepseek-v4-pro'], + probe: true, + fetchImpl: makeFetch(['deepseek-v4-pro'], { 'deepseek-v4-pro': { status: 503, body: {} } }), + }) + expect(out.value).toEqual([ + { model: 'deepseek-v4-pro', listed: true, served: false, status: 503, detail: null }, + ]) + }) + + it('reads error.message nested under error', async () => { + const out = await preflightModels({ + baseUrl: BASE, + apiKey: KEY, + models: ['gpt-4.1-mini'], + probe: true, + fetchImpl: makeFetch(['gpt-4.1-mini'], { + 'gpt-4.1-mini': { status: 429, body: { error: { message: 'rate limited' } } }, + }), + }) + expect(out.value?.[0]).toMatchObject({ served: false, status: 429, detail: 'rate limited' }) + }) +}) + +describe('preflightModels — network failure', () => { + it('GET failure returns a typed outcome, never throws', async () => { + const fetchImpl = (async () => { + throw new Error('ECONNREFUSED') + }) as typeof fetch + const out = await preflightModels({ + baseUrl: BASE, + apiKey: KEY, + models: ['claude-sonnet-4-6'], + fetchImpl, + }) + expect(out.succeeded).toBe(false) + expect(out.value).toBeNull() + expect(out.error).toContain('ECONNREFUSED') + }) + + it('non-2xx /models returns a typed outcome with the status', async () => { + const fetchImpl = (async () => new Response('forbidden', { status: 403 })) as typeof fetch + const out = await preflightModels({ + baseUrl: BASE, + apiKey: KEY, + models: ['claude-sonnet-4-6'], + fetchImpl, + }) + expect(out.succeeded).toBe(false) + expect(out.error).toContain('403') + }) + + it('probe POST failure returns a typed outcome', async () => { + const fetchImpl = (async (input: RequestInfo | 
URL) => { + const url = String(input) + if (url.endsWith('/models')) return listResponse(['claude-sonnet-4-6']) + throw new Error('socket hang up') + }) as typeof fetch + const out = await preflightModels({ + baseUrl: BASE, + apiKey: KEY, + models: ['claude-sonnet-4-6'], + probe: true, + fetchImpl, + }) + expect(out.succeeded).toBe(false) + expect(out.error).toContain('socket hang up') + }) +}) + +describe('assertModelsServed', () => { + it('passes silently when every model is served', async () => { + const models = ['claude-sonnet-4-6', 'deepseek-v4-pro', 'gpt-4.1-mini'] + await expect( + assertModelsServed({ baseUrl: BASE, apiKey: KEY, models, fetchImpl: makeFetch(models) }), + ).resolves.toHaveLength(3) + }) + + it('throws naming EVERY dead model — unlisted and probe-failed alike', async () => { + const models = [ + 'claude-sonnet-4-6', + 'opencode/dead-a', + 'kimi-code/dead-b', + 'claude-code/dead-c', + ] + let thrown: unknown + try { + await assertModelsServed({ + baseUrl: BASE, + apiKey: KEY, + models, + probe: true, + fetchImpl: makeFetch(['claude-sonnet-4-6', 'claude-code/dead-c'], { + 'claude-sonnet-4-6': { status: 200 }, + 'opencode/dead-a': { + status: 401, + body: { message: 'No API key configured for model opencode/dead-a' }, + }, + 'kimi-code/dead-b': { + status: 401, + body: { message: 'No API key configured for model kimi-code/dead-b' }, + }, + // listed but unconfigured: caught only by the probe + 'claude-code/dead-c': { + status: 401, + body: { message: 'No API key configured for model claude-code/dead-c' }, + }, + }), + }) + } catch (err) { + thrown = err + } + expect(thrown).toBeInstanceOf(ModelsUnreachableError) + const msg = (thrown as Error).message + expect(msg).toContain('opencode/dead-a') + expect(msg).toContain('kimi-code/dead-b') + expect(msg).toContain('claude-code/dead-c') + expect(msg).toContain('3/4') + // the served model is never named + expect(msg).not.toContain('claude-sonnet-4-6') + expect((thrown as 
ModelsUnreachableError).results).toHaveLength(4) + }) + + it('a listed-but-probe-failed model is dead (no partial silent pass)', async () => { + await expect( + assertModelsServed({ + baseUrl: BASE, + apiKey: KEY, + models: ['deepseek-v4-pro'], + probe: true, + fetchImpl: makeFetch(['deepseek-v4-pro'], { 'deepseek-v4-pro': { status: 503, body: {} } }), + }), + ).rejects.toThrow(ModelsUnreachableError) + }) + + it('rethrows a network failure rather than reporting a partial pass', async () => { + const fetchImpl = (async () => { + throw new Error('ECONNREFUSED') + }) as typeof fetch + await expect( + assertModelsServed({ baseUrl: BASE, apiKey: KEY, models: ['claude-sonnet-4-6'], fetchImpl }), + ).rejects.toThrow(/ECONNREFUSED/) + }) +}) diff --git a/src/integrity/preflight.ts b/src/integrity/preflight.ts new file mode 100644 index 00000000..c2833c57 --- /dev/null +++ b/src/integrity/preflight.ts @@ -0,0 +1,192 @@ +/** + * Backend preflight: verify the models a campaign is about to spend tokens + * against are actually served by the router BEFORE the run starts. The PRE-hoc + * complement to `assertRealBackend` (which inspects RunRecords AFTER the run to + * catch a stub/unconfigured backend). + * + * Two checks, increasing in cost: + * - membership (free): GET `{baseUrl}/models` once; a model is `listed` when + * its id is in the served set. + * - probe (spends a tiny number of tokens): POST `{baseUrl}/chat/completions` + * per model with a 1-message, 5-token request; `served` is whether the + * router returns 2xx, with the HTTP `status` and the body's `error.message` + * captured in `detail`. + * + * A default model the router cannot serve is a config bug. Gate a campaign on + * `assertModelsServed` and it surfaces every dead id with its status + detail + * instead of silently producing a stub run. + */ + +import { AgentEvalError, ConfigError } from '../errors' + +export interface ModelPreflight { + /** The model id as supplied by the caller. 
*/ + model: string + /** Membership in the `{baseUrl}/models` served set. */ + listed: boolean + /** 2xx on a 1-token chat probe. `null` when `probe` was not requested. */ + served: boolean | null + /** HTTP status of the probe. `null` when not probed. */ + status: number | null + /** Probe body's `error.message` when present, else `null`. */ + detail: string | null +} + +export interface PreflightModelsOptions { + /** Router base URL, e.g. `https://router.tangle.tools/v1`. Trailing slash tolerated. */ + baseUrl: string + /** Bearer token sent as `Authorization: Bearer `. */ + apiKey: string + /** Model ids to check. */ + models: string[] + /** When true, additionally spend a 1-token chat probe per model. Default false. */ + probe?: boolean + /** Injectable fetch for tests; defaults to the global. */ + fetchImpl?: typeof fetch +} + +export interface PreflightOutcome { + succeeded: boolean + value: ModelPreflight[] | null + error: string | null +} + +interface ModelsListBody { + data?: ReadonlyArray<{ id?: unknown }> +} + +interface ChatErrorBody { + error?: { message?: unknown } + message?: unknown +} + +function stripSlash(url: string): string { + return url.replace(/\/+$/, '') +} + +/** Extract `error.message` (then top-level `message`) from a chat-completions error body. */ +function errorMessage(body: unknown): string | null { + if (body == null || typeof body !== 'object') return null + const b = body as ChatErrorBody + if (b.error && typeof b.error.message === 'string') return b.error.message + if (typeof b.message === 'string') return b.message + return null +} + +/** + * Check that `models` are reachable on the router. Returns a typed outcome — + * a network failure yields `{ succeeded: false, error }`, never a throw and + * never a partial result silently reported as success. No retries, no + * fallbacks. + * + * The membership check (one GET) always runs. 
When `probe` is true, each model
 * additionally gets a minimal chat probe (one `ping` message, `max_tokens: 5`)
 * so a model that is listed but unconfigured (a 401 `model_not_found` from the
 * router) is caught.
 *
 * Probes run sequentially in the order of `opts.models`; the first probe whose
 * request itself rejects aborts the whole preflight with a failed outcome.
 *
 * @param opts - Router base URL, bearer key, model ids, and the optional
 *   `probe` / `fetchImpl` switches.
 * @returns `{ succeeded: true, value }` with one entry per requested model (in
 *   input order), or `{ succeeded: false, error }` when the `/models` request
 *   or any probe request could not be completed.
 */
export async function preflightModels(opts: PreflightModelsOptions): Promise<PreflightOutcome> {
  const fetchImpl = opts.fetchImpl ?? fetch
  const baseUrl = stripSlash(opts.baseUrl)
  const authHeaders = { authorization: `Bearer ${opts.apiKey}` }

  // Membership: one GET, reduced to the set of string ids the router lists.
  let served: Set<string>
  try {
    const res = await fetchImpl(`${baseUrl}/models`, { method: 'GET', headers: authHeaders })
    if (!res.ok) {
      // Body text is best-effort context only; cap it so the error stays readable.
      const text = await res.text().catch(() => '')
      return {
        succeeded: false,
        value: null,
        error: `preflightModels: GET ${baseUrl}/models → ${res.status} ${text.slice(0, 400)}`,
      }
    }
    const body = (await res.json()) as ModelsListBody
    const ids = Array.isArray(body.data) ? body.data : []
    served = new Set(ids.map((m) => m.id).filter((id): id is string => typeof id === 'string'))
  } catch (err) {
    // Covers both a rejected fetch and an unparseable / malformed list body.
    return {
      succeeded: false,
      value: null,
      error: `preflightModels: GET ${baseUrl}/models failed — ${err instanceof Error ? err.message : String(err)}`,
    }
  }

  const results: ModelPreflight[] = []
  for (const model of opts.models) {
    const listed = served.has(model)
    if (!opts.probe) {
      results.push({ model, listed, served: null, status: null, detail: null })
      continue
    }
    try {
      const res = await fetchImpl(`${baseUrl}/chat/completions`, {
        method: 'POST',
        headers: { ...authHeaders, 'content-type': 'application/json' },
        body: JSON.stringify({
          model,
          messages: [{ role: 'user', content: 'ping' }],
          max_tokens: 5,
        }),
      })
      let detail: string | null = null
      if (res.ok) {
        // The completion text is irrelevant; release the unread body instead of
        // leaving it attached to the connection.
        await discardBody(res)
      } else {
        // A non-JSON error body simply yields no detail.
        const body: unknown = await res.json().catch(() => null)
        detail = errorMessage(body)
      }
      results.push({ model, listed, served: res.ok, status: res.status, detail })
    } catch (err) {
      return {
        succeeded: false,
        value: null,
        error: `preflightModels: probe POST ${baseUrl}/chat/completions (model ${model}) failed — ${err instanceof Error ? err.message : String(err)}`,
      }
    }
  }

  return { succeeded: true, value: results, error: null }
}

/**
 * Best-effort release of a response body that will not be read. Never throws:
 * a missing body, or a `fetchImpl` whose body object has no `cancel()`, is
 * ignored so that cleanup can never change a probe's result.
 */
async function discardBody(res: Response): Promise<void> {
  try {
    await res.body?.cancel()
  } catch {
    // Cleanup only — the probe outcome is already decided.
  }
}

/**
 * Thrown by `assertModelsServed` when at least one requested model is
 * unreachable. `results` holds the preflight entry for EVERY requested model
 * (reachable ones included), so callers can report or log the full picture.
 */
export class ModelsUnreachableError extends AgentEvalError {
  constructor(
    message: string,
    public readonly results: ReadonlyArray<ModelPreflight>,
  ) {
    super('config', message)
    this.name = 'ModelsUnreachableError'
  }
}

/** Render one dead model as `<id>: <reason>` for the aggregated error message. */
function describeFailure(r: ModelPreflight): string {
  if (!r.listed) {
    // Probe details are appended only when a probe actually ran and failed.
    const probeNote =
      r.served === false ? ` (probe ${r.status}${r.detail ? `: ${r.detail}` : ''})` : ''
    return `${r.model}: not in /models${probeNote}`
  }
  // listed but failed its probe
  return `${r.model}: listed but probe ${r.status}${r.detail ? ` — ${r.detail}` : ''}`
}

/**
 * Throw `ModelsUnreachableError` naming EVERY model that is unlisted or (when
 * probed) failed its probe — with status + detail per model. A model is dead
 * if it is unlisted, or if `served === false`. Callers gate a campaign on this
 * before spending tokens.
 *
 * @param opts - Same options as `preflightModels`.
 * @returns The per-model preflight results when every model is reachable.
 * @throws ConfigError when the preflight itself could not complete (for
 *   example a network failure); the message is the outcome's `error` string.
 *   There is no partial silent pass.
 * @throws ModelsUnreachableError when one or more models are dead.
 */
export async function assertModelsServed(opts: PreflightModelsOptions): Promise<ModelPreflight[]> {
  const outcome = await preflightModels(opts)
  if (!outcome.succeeded || outcome.value === null) {
    throw new ConfigError(
      outcome.error ?? 'assertModelsServed: preflight failed without an error message',
    )
  }
  const dead = outcome.value.filter((r) => !r.listed || r.served === false)
  if (dead.length > 0) {
    throw new ModelsUnreachableError(
      `assertModelsServed: ${dead.length}/${outcome.value.length} model(s) unreachable on the router — ${dead
        .map(describeFailure)
        .join('; ')}`,
      outcome.value,
    )
  }
  return outcome.value
}