From d3e72e96cddff30e4d637bb76644ee6d6540b721 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 6 Jun 2026 11:36:56 -0600
Subject: [PATCH 1/3] feat(bench): CL-bench (Context Learning)
 deployable-selector gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Integrate Tencent/Fudan CL-bench (arXiv:2602.03587) as a router-only selector
gate. CL-bench grades a model's answer to an in-context-knowledge task against
expert rubrics; the official metric is binary (pass ALL rubrics, avg ~63/task),
but the per-rubric pass-count yields a CONTINUOUS score (fraction satisfied) —
the within-task graded variance a verifier-grounded selector needs and that the
pass/fail-deterministic benches (aec) lacked.

The gate (modeled on humaneval-gate) runs two paired arms over the same tasks —
random@K identical completions vs diverse@K strategy-lensed completions — grades
each with the benchmark's own rubric judge (an LLM, run by us = deployable but
noisy, so we rank by the variance-reduced fraction not the binary, judge model +
temp pinned), verifier-selects by fraction, and reports paired-bootstrap lifts on
BOTH the continuous fraction and the official binary. Writes a corpus RunRecord/
task that `corpus-replay --selector=verifier` + `corpus-report` consume unchanged.

Router-only (no sandbox); fetches the public HF jsonl via curl|head so a smoke
pulls only the first N records. Fail loud on a malformed/empty task set or a
failed judge parse (a real zero, never masked).
---
 bench/src/clbench-context-gate.mts | 356 +++++++++++++++++++++++++++++
 1 file changed, 356 insertions(+)
 create mode 100644 bench/src/clbench-context-gate.mts

diff --git a/bench/src/clbench-context-gate.mts b/bench/src/clbench-context-gate.mts
new file mode 100644
index 0000000..a3ad787
--- /dev/null
+++ b/bench/src/clbench-context-gate.mts
@@ -0,0 +1,356 @@
+/**
+ * CL-bench (Context Learning) deployable-selector gate — Tencent/Fudan's CL-bench
+ * (arXiv:2602.03587) repurposed for our verifier-grounded selector question.
+ *
+ * CL-bench asks whether a model can learn NEW knowledge from in-context material
+ * (a rule book, a framework doc, a conversation) at inference time: each record is
+ * a `messages` conversation (system + context-laden turns, the final turn the task)
+ * graded against expert `rubrics`. The official metric is BINARY — a task is solved
+ * only if the response passes EVERY rubric (avg 63 rubrics/task). We keep that binary
+ * as `resolved`, but the per-rubric pass-count gives a CONTINUOUS score (fraction of
+ * rubrics satisfied) — the within-task graded variance a selector needs, which the
+ * deterministic-but-pass/fail benches (aec) lacked.
+ *
+ * The CHECKER is the benchmark's own rubric judge (an LLM, per CL-bench's eval.py),
+ * run by us at inference time — deployable, but NOT deterministic, so treat the judge
+ * as a noisy verifier: we rank by the rubric FRACTION (variance-reduced over many
+ * rubrics), not the binary, and the judge model + temperature are pinned for
+ * test-retest stability. This is the LLM-judge analogue of the HumanEval Docker gate;
+ * read a positive result as "a deployable rubric-fraction verifier captures selection
+ * value on a hard context-learning domain", scoped by judge noise.
+ *
+ * Router-only (no sandbox): worker + judge are both router chat calls. Two paired arms
+ * over the same tasks, each "shot" = one stateless completion of the final turn:
+ *   random@K  — K completions over the unmodified conversation
+ *   diverse@K — K completions, the i-th with a strategy lens prepended to the system turn
+ * verifierGroundedSelect ranks the K shots by rubric fraction. Metrics are reported on
+ * BOTH the continuous fraction (the gate-relevant signal) and the official binary.
+ * Writes a corpus RunRecord/task (condition random@k) so `corpus-replay --selector` and
+ * `corpus-report` consume it unchanged. Fail loud.
+ *
+ *   dotenvx run -f … -- env N=20 K=4 WORKER_MODEL=deepseek-chat JUDGE_MODEL=deepseek-chat \
+ *     CORPUS=/tmp/clbench-ctx.jsonl tsx src/clbench-context-gate.mts
+ */
+
+import { execFileSync } from 'node:child_process'
+import { existsSync, readFileSync } from 'node:fs'
+import { composeStrategies } from './directives'
+import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus'
+import { type RouterConfig, routerChatWithUsage } from './router-client'
+import { selfConsistencySelect, verifierGroundedSelect } from './selector'
+
+const datasetUrl = 'https://huggingface.co/datasets/tencent/CL-bench/resolve/main/CL-bench.jsonl'
+
+function must(name: string): string { // Return environment variable `name`; throws when it is unset or empty.
+  const v = process.env[name]
+  if (!v) throw new Error(`env ${name} is required`) // `!v` rejects the empty string as well as undefined
+  return v
+}
+
+interface ChatMessage { // One conversation turn; arrays of these are passed to routerChatWithUsage.
+  role: string // this file only ever compares it against 'system'
+  content: string
+}
+
+interface CtxTask { // One CL-bench record as normalised by loadCtxTasks.
+  id: string // the record's metadata.task_id
+  messages: ChatMessage[] // the record's conversation (loadCtxTasks rejects an empty one)
+  rubrics: string[] // rubric texts; loadCtxTasks drops empty entries and rejects an empty list
+  category: string // metadata.context_category, or 'Unknown' when the record has none
+}
+
+/** Read the first `offset + limit` lines of the (large, ~300MB) CL-bench JSONL via a piped
+ *  `curl | head` so a smoke pulls only a few records, then slice [offset, offset+limit].
+ *  A local cached file (CLBENCH_CTX_FILE) short-circuits the fetch for powered runs.
+ *  Throws on a bad limit/offset, a malformed record, a record without rubrics, or 0 tasks. */
+function loadCtxTasks(limit: number, offset: number): CtxTask[] {
+  // Reject a negative/fractional window up front: `slice` would otherwise silently return the wrong tasks.
+  if (!Number.isInteger(limit) || limit < 1) throw new Error(`limit must be a positive integer, got ${limit}`)
+  if (!Number.isInteger(offset) || offset < 0) throw new Error(`OFFSET must be a non-negative integer, got ${offset}`)
+  const need = offset + limit
+  let raw: string
+  const cached = process.env.CLBENCH_CTX_FILE
+  if (cached) {
+    if (!existsSync(cached)) throw new Error(`CLBENCH_CTX_FILE not found: ${cached}`)
+    // No shell here: the path travels as an argv entry, so `$`, backticks or spaces in it are inert.
+    raw = execFileSync('head', ['-n', String(need), '--', cached], { maxBuffer: 1 << 30 }).toString('utf8')
+  } else {
+    // -fsSL: fail on HTTP error, follow redirects (HF resolve 302s to the CDN). Once `head`
+    // has its `need` lines it closes the pipe and curl aborts with a benign write error
+    // (exit 23); curl's stderr is dropped and, without pipefail, the pipeline's status is
+    // head's — so a real fetch failure shows up as 0 parsed tasks below, not as a throw here.
+    raw = execFileSync('bash', ['-c', `curl -fsSL ${JSON.stringify(datasetUrl)} 2>/dev/null | head -n ${need}`], {
+      maxBuffer: 1 << 30,
+    }).toString('utf8')
+  }
+  const tasks: CtxTask[] = []
+  for (const line of raw.split('\n')) {
+    if (line.trim() === '') continue
+    const d = JSON.parse(line) as { messages?: ChatMessage[]; rubrics?: unknown[]; metadata?: { task_id?: string; context_category?: string } }
+    const messages = d.messages
+    const taskId = d.metadata?.task_id
+    if (!Array.isArray(messages) || messages.length === 0 || !taskId) {
+      throw new Error(`malformed CL-bench record: ${line.slice(0, 120)}`)
+    }
+    // Rubrics are strings or {rubric_criteria} objects (mirrors eval.py's build_rubrics_text); both are trimmed.
+    const rubrics = (d.rubrics ?? []).map((r) =>
+      (typeof r === 'string' ? r : String((r as { rubric_criteria?: string }).rubric_criteria ?? '')).trim(),
+    ).filter((r) => r.length > 0)
+    if (rubrics.length === 0) throw new Error(`CL-bench record ${taskId} has no rubrics`)
+    tasks.push({ id: taskId, messages, rubrics, category: d.metadata?.context_category ?? 'Unknown' })
+  }
+  if (tasks.length === 0) throw new Error('CL-bench parsed to 0 tasks')
+  if (offset >= tasks.length) throw new Error(`OFFSET ${offset} >= fetched size ${tasks.length}`)
+  return tasks.slice(offset, offset + limit)
+}
+
+/** Apply a diversity strategy lens to a conversation: `lensSystem` replaces the content of a
+ *  leading system turn, or is inserted as a new system turn when there is none. The remaining
+ *  turns — which carry the in-context knowledge the task is about — are reused unmodified. */
+function diversifyMessages(messages: ChatMessage[], lensSystem: string, baseSystem: string): ChatMessage[] {
+  // Both cases use THIS shot's lens; the no-system case used to call composeStrategies(baseSystem, 1)[0], ignoring `lensSystem` (so, presumably, every shot got the same lens).
+  void baseSystem // no longer read; the parameter stays so existing call sites keep compiling
+  const rest = messages[0]?.role === 'system' ? messages.slice(1) : messages
+  return [{ role: 'system', content: lensSystem }, ...rest]
+}
+
+const judgePrompt = (rubricsText: string, modelOutput: string): string => // Build the judge prompt text: instructions, the rubrics, the response, then the required JSON reply shape.
+  'You are a rigorous, strict grading teacher. Grade the student response against the 【Rubrics】, ' +
+  'checking EACH requirement independently.\n\n' +
+  `【Rubrics】:\n${rubricsText}\n` +
+  `【Student Response】:\n${modelOutput}\n\n` +
+  'Output ONLY this JSON (no other text):\n' +
+  '{\n  "status": ["yes" or "no", ... one per rubric, in order],\n  "all_pass": 0 or 1\n}\n'
+
+interface RubricVerdict {
+  /** "yes" count over max(status entries returned, real rubric count), 0..1 — the continuous within-task signal. */
+  fraction: number
+  /** official binary: every rubric satisfied. */
+  allPass: boolean
+  /** rubric count actually graded (for diagnostics). */
+  graded: number
+}
+
+function parseJudge(reply: string, rubricCount: number): RubricVerdict { // Parse the judge's JSON reply into a verdict; throws when the reply holds no parseable JSON.
+  const open = reply.indexOf('{')
+  const close = reply.lastIndexOf('}')
+  const text = open >= 0 && close > open ? reply.slice(open, close + 1) : reply.trim() // keep only the outermost {...}: fences or stray prose must not abort a finished run
+  const obj = JSON.parse(text) as { status?: unknown[]; all_pass?: unknown }
+  const status = Array.isArray(obj.status) ? obj.status : []
+  const yes = status.filter((s) => String(s).trim().toLowerCase() === 'yes').length
+  const graded = status.length > 0 ? status.length : rubricCount
+  // Rubrics the judge skipped count as unmet: a short status list can no longer inflate the fraction.
+  const fraction = Math.max(graded, rubricCount) > 0 ? yes / Math.max(graded, rubricCount) : 0
+  // allPass is true on an explicit all_pass of 1/'1', or when the list carries exactly one "yes" per rubric.
+  const allPass = obj.all_pass === 1 || obj.all_pass === '1' || (status.length === rubricCount && yes === rubricCount && rubricCount > 0)
+  return { fraction, allPass, graded }
+}
+
+/** Grade one completion with the rubric judge (temperature 0). A blank completion scores zero without
+ *  calling the judge; nothing is caught here, so an error from the router call or from parseJudge propagates. */
+async function judgeRubrics(cfg: RouterConfig, task: CtxTask, output: string): Promise<RubricVerdict> {
+  if (!output.trim()) return { fraction: 0, allPass: false, graded: 0 }
+  const rubricsText = task.rubrics.map((r, i) => `${i + 1}. ${r}`).join('\n') // numbered list, 1-based
+  const res = await routerChatWithUsage(cfg, [{ role: 'user', content: judgePrompt(rubricsText, output) }], { temperature: 0 })
+  return parseJudge(typeof res.content === 'string' ? res.content : '', task.rubrics.length)
+}
+
+async function pool<T, R>(items: T[], limit: number, fn: (item: T, idx: number) => Promise<R>): Promise<R[]> { // Map `fn` over `items` with a bounded number of calls in flight; results are stored at the input index.
+  const results: R[] = new Array(items.length)
+  let next = 0 // next unclaimed index, shared by every worker
+  async function worker(): Promise<void> {
+    for (;;) {
+      const idx = next
+      next += 1
+      if (idx >= items.length) return
+      results[idx] = await fn(items[idx] as T, idx)
+    }
+  }
+  await Promise.all(Array.from({ length: Math.max(1, Math.min(limit, items.length)) }, () => worker())) // worker count = max(1, min(limit, items.length)); the first rejection rejects the pool
+  return results
+}
+
+function makeRng(seed: number): () => number { // Seeded deterministic PRNG (the mulberry32 mixing steps); the same seed yields the same sequence.
+  let s = seed | 0 // coerce the seed to a 32-bit integer state
+  return () => {
+    s = (s + 0x6d2b79f5) | 0
+    let t = Math.imul(s ^ (s >>> 15), 1 | s)
+    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t
+    return ((t ^ (t >>> 14)) >>> 0) / 4294967296 // unsigned 32-bit value divided by 2^32 → float in [0, 1)
+  }
+}
+
+interface PairedLift { // Result of pairedLift.
+  point: number // mean of the per-pair (treatment − baseline) deltas
+  low: number // bootstrap mean at sorted index floor(0.025 × B); NaN when that index is absent
+  high: number // bootstrap mean at sorted index floor(0.975 × B); NaN when that index is absent
+  pairs: number // number of pairs compared
+  discordant: number // pairs whose |delta| exceeds 1e-9
+}
+
+/** Paired lift = mean over tasks of (treatment − baseline) with a 95% percentile-bootstrap CI (fixed RNG seed,
+ *  so repeatable). Works on continuous per-task values as well as {0,1}; throws on misaligned or empty arms. */
+function pairedLift(baseline: number[], treatment: number[], bootstrapN = 10000): PairedLift {
+  if (baseline.length !== treatment.length) throw new Error('pairedLift: misaligned arms')
+  const n = baseline.length
+  if (n === 0) throw new Error('pairedLift: no pairs')
+  const deltas = baseline.map((b, i) => (treatment[i] as number) - b)
+  const mean = (a: number[]) => a.reduce((s, x) => s + x, 0) / a.length
+  const point = mean(deltas)
+  const discordant = deltas.filter((d) => Math.abs(d) > 1e-9).length
+  const rng = makeRng(0x9e3779b9) // constant seed: the resamples, and so the CI, are identical across runs
+  const rint = (m: number) => Math.floor(rng() * m)
+  const boots: number[] = []
+  for (let b = 0; b < bootstrapN; b += 1) {
+    let acc = 0
+    for (let j = 0; j < n; j += 1) acc += deltas[rint(n)] as number // resample n deltas with replacement
+    boots.push(acc / n)
+  }
+  boots.sort((x, y) => x - y)
+  return {
+    point,
+    low: boots[Math.floor(0.025 * bootstrapN)] ?? Number.NaN,
+    high: boots[Math.floor(0.975 * bootstrapN)] ?? Number.NaN,
+    pairs: n,
+    discordant,
+  }
+}
+
+const pct = (x: number) => `${(x * 100).toFixed(1)}%` // ratio → percent with one decimal, e.g. 0.5 → "50.0%"
+const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp` // signed percentage points, e.g. 0.05 → "+5.0pp"
+
+interface Shot { // One worker completion together with the judge's verdict on it.
+  output: string
+  verdict: RubricVerdict
+}
+
+async function main(): Promise<void> { // Run the gate end to end: load tasks, solve both arms, judge every completion, print metrics, append corpus records.
+  const n = Number(process.env.N ?? 20)
+  const k = Number(process.env.K ?? 4)
+  const offset = Number(process.env.OFFSET ?? 0)
+  const model = process.env.WORKER_MODEL ?? 'deepseek-chat'
+  const judgeModel = process.env.JUDGE_MODEL ?? model // judge defaults to the worker model
+  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
+  const routerKey = must('TANGLE_API_KEY')
+  const solveConcurrency = Number(process.env.CONCURRENCY ?? 6) // used for both the solve and the grading pool
+  const corpusPath = process.env.CORPUS ?? '/tmp/clbench-ctx.jsonl'
+  if (!Number.isInteger(n) || n < 1) throw new Error(`N must be a positive integer, got ${process.env.N}`)
+  if (!Number.isInteger(k) || k < 1) throw new Error(`K must be a positive integer, got ${process.env.K}`)
+
+  const workerCfg: RouterConfig = { routerBaseUrl, routerKey, model }
+  const judgeCfg: RouterConfig = { routerBaseUrl, routerKey, model: judgeModel }
+
+  console.log(`=== CL-bench (Context Learning) selector gate · N=${n} K=${k} offset=${offset} ===`)
+  console.log(`  worker=${model}  judge=${judgeModel} (rubric-fraction verifier)  router=${routerBaseUrl}`)
+  console.log('  regime: STATELESS single completions — selector no-self-correction LOWER BOUND, judge is an LLM (noisy verifier)')
+
+  const tasks = loadCtxTasks(n, offset)
+  console.log(`loaded ${tasks.length} task(s); rubrics/task: ${tasks.map((t) => t.rubrics.length).join(',')}`)
+
+  type Unit = { taskIdx: number; arm: 'random' | 'diverse'; shot: number; messages: ChatMessage[] }
+  const units: Unit[] = []
+  for (let ti = 0; ti < tasks.length; ti += 1) {
+    const task = tasks[ti] as CtxTask
+    const baseSystem = task.messages[0]?.role === 'system' ? (task.messages[0] as ChatMessage).content : 'You are a helpful assistant.'
+    const lenses = composeStrategies(baseSystem, k)
+    for (let s = 0; s < k; s += 1) {
+      units.push({ taskIdx: ti, arm: 'random', shot: s, messages: task.messages })
+      units.push({ taskIdx: ti, arm: 'diverse', shot: s, messages: diversifyMessages(task.messages, lenses[s] as string, baseSystem) })
+    }
+  }
+  console.log(`\n▶ solving ${units.length} attempts (${tasks.length} tasks × ${k} shots × 2 arms) via router, conc=${solveConcurrency}`)
+  const outputs = await pool(units, solveConcurrency, async (u) => {
+    const res = await routerChatWithUsage(workerCfg, u.messages, { temperature: Number(process.env.TEMPERATURE ?? '0.8') })
+    return typeof res.content === 'string' ? res.content : '' // a non-string reply becomes an empty completion
+  })
+
+  console.log(`▶ grading ${outputs.length} completions with the rubric judge (${judgeModel}), conc=${solveConcurrency}`)
+  const verdicts = await pool(units, solveConcurrency, (u, i) => judgeRubrics(judgeCfg, tasks[u.taskIdx] as CtxTask, outputs[i] as string))
+
+  // Regroup into per-task arms, shot order preserved.
+  const byTask = tasks.map(() => ({ random: [] as Shot[], diverse: [] as Shot[] }))
+  units.forEach((u, i) => {
+    const shot: Shot = { output: outputs[i] as string, verdict: verdicts[i] as RubricVerdict }
+    const grp = byTask[u.taskIdx] as { random: Shot[]; diverse: Shot[] }
+    if (u.arm === 'random') grp.random[u.shot] = shot
+    else grp.diverse[u.shot] = shot
+  })
+
+  // Per-task aligned metrics, on BOTH the continuous fraction and the official binary.
+  const fr = { blind: [] as number[], random: [] as number[], diverse: [] as number[], oracle: [] as number[], sc: [] as number[] }
+  const bin = { blind: [] as number[], random: [] as number[], diverse: [] as number[], oracle: [] as number[] }
+  for (const grp of byTask) {
+    const rFr = grp.random.map((s) => s.verdict.fraction)
+    const dFr = grp.diverse.map((s) => s.verdict.fraction)
+    const rIdx = verifierGroundedSelect(rFr) // rank random shots by rubric fraction
+    const dIdx = verifierGroundedSelect(dFr)
+    const scIdx = selfConsistencySelect(grp.diverse.map((s) => s.output))
+    fr.blind.push((grp.random[0] as Shot).verdict.fraction) // "blind" = shot 0 of the random arm
+    fr.random.push((grp.random[rIdx] as Shot).verdict.fraction)
+    fr.diverse.push((grp.diverse[dIdx] as Shot).verdict.fraction)
+    fr.oracle.push(Math.max(...dFr)) // oracle is taken over the diverse arm only
+    fr.sc.push((grp.diverse[scIdx] as Shot).verdict.fraction)
+    bin.blind.push((grp.random[0] as Shot).verdict.allPass ? 1 : 0)
+    bin.random.push((grp.random[rIdx] as Shot).verdict.allPass ? 1 : 0)
+    bin.diverse.push((grp.diverse[dIdx] as Shot).verdict.allPass ? 1 : 0)
+    bin.oracle.push(dFr.some((_, j) => (grp.diverse[j] as Shot).verdict.allPass) ? 1 : 0)
+  }
+  const rate = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / xs.length
+
+  console.log(`\n${'='.repeat(78)}`)
+  console.log(`RESULTS · CL-bench Context Learning · n=${tasks.length} · k=${k} · worker=${model} · judge=${judgeModel}`)
+  console.log('='.repeat(78))
+  console.log('  — rubric FRACTION (continuous within-task signal, the gate-relevant metric) —')
+  console.log(`  blind (shot 0)             ${pct(rate(fr.blind))}`)
+  console.log(`  random@k (verifier-pick)   ${pct(rate(fr.random))}`)
+  console.log(`  diverse@k (verifier-pick)  ${pct(rate(fr.diverse))}`)
+  console.log(`  oracle@k (max fraction)    ${pct(rate(fr.oracle))}`)
+  console.log(`  self-consistency@k         ${pct(rate(fr.sc))}`)
+  console.log('  — official BINARY all-rubrics-pass (solving rate) —')
+  console.log(`  blind ${pct(rate(bin.blind))}  random@k ${pct(rate(bin.random))}  diverse@k ${pct(rate(bin.diverse))}  oracle@k ${pct(rate(bin.oracle))}`)
+
+  const row = (label: string, l: PairedLift) =>
+    console.log(`  ${label.padEnd(36)} ${pp(l.point).padStart(7)}   CI [${pp(l.low)}, ${pp(l.high)}]   (paired ${l.pairs}, discordant ${l.discordant})`)
+  console.log(`\n  PAIRED LIFTS on rubric fraction (95% bootstrap CI, B=10000):`)
+  row('random@k − blind (compute)', pairedLift(fr.blind, fr.random))
+  row('diverse@k − random@k (verifier)', pairedLift(fr.random, fr.diverse))
+  row('diverse@k − blind (total)', pairedLift(fr.blind, fr.diverse))
+  row('verifier-pick − sc-pick (diverse)', pairedLift(fr.sc, fr.diverse))
+  const ceiling = pairedLift(fr.random, fr.oracle)
+  row('oracle@k − random@k (ceiling)', ceiling)
+
+  // Corpus: one RunRecord/task for the random@k arm, ranked by the rubric-fraction
+  // verifier — `corpus-replay --selector=verifier` + `corpus-report` consume it.
+  for (let ti = 0; ti < tasks.length; ti += 1) {
+    const task = tasks[ti] as CtxTask
+    const grp = byTask[ti] as { random: Shot[]; diverse: Shot[] }
+    const attempts: AttemptRecord[] = grp.random.map((s, round) => ({
+      round,
+      prompt: 'clbench-context',
+      output: s.output.slice(0, 4000), // only the first 4000 characters are stored
+      valid: s.verdict.allPass,
+      score: s.verdict.fraction,
+      eventCount: 1,
+      eventTypes: { 'router.chat': 1 },
+      traceTail: s.output.slice(-600),
+    }))
+    const record: RunRecord = {
+      ts: new Date().toISOString(),
+      benchmark: 'clbench-context',
+      instanceId: task.id,
+      condition: `random@${k}`,
+      model,
+      blindResolved: (grp.random[0] as Shot).verdict.allPass,
+      resolved: grp.random.some((s) => s.verdict.allPass),
+      attempts,
+      infraError: false,
+    }
+    await appendRunRecord(corpusPath, record)
+  }
+  console.log(`\n=== wrote ${tasks.length} task(s) → ${corpusPath} · gate: tsx src/corpus-replay.mts ${corpusPath} --selector=verifier ===`)
+}
+
+main().catch((err) => { // Entry point: any rejection from main() is reported on stderr and ends the process.
+  console.error(`clbench-context-gate: ${err instanceof Error ? err.message : String(err)}`)
+  process.exit(1) // non-zero status so callers see the failure
+})

From ddfd30470ad4b2748ae452f92ba90e7ddcbf66a1 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 6 Jun 2026 11:50:26 -0600
Subject: [PATCH 2/3] feat(bench): CL-Bench (Continual) Codebase Adaptation
 verifier-grounded selector gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Integrate pgasawa CL-Bench (arXiv:2606.05661) codebase_adaptation — the ONE of its
six domains with a DEPLOYABLE checker (the rest grade against realized outcomes the
agent never has = oracles). Its scorer applies the instance's provided test_patch and
runs pytest in the instance's Docker image, keying off the exit code: an INDEPENDENT
deployable check (tests ≠ answer), the clean analogue of the HumanEval gate and unlike
the CL-bench Context gate where the rubric judge IS the metric.

Two pieces:
- scripts/clbench_codebase_judge.py — a thin bridge exposing CL-Bench's own
  `evaluate_submission(patch, instance)` as a standalone (instance_id, patch) -> verdict
  call (run in CL-Bench's venv). Verified to self-check: gold patch passes, empty fails.
- src/clbench-codebase-gate.mts — the gate. Each instance is SWE-bench format; the worker
  is a fault-isolated sandbox rollout (opencode clones repo@base_commit, fixes source,
  writes a diff read off the box FS). Two paired arms (random@K identical vs diverse@K
  strategy-lensed), verifierGroundedSelect by pytest-pass, paired-bootstrap lifts on
  blind/random@k/diverse@k/oracle@k, and a corpus RunRecord/task that corpus-replay
  --selector=verifier consumes. Infra-errored rollouts/judges are excluded, never scored 0.

Needs Docker + the CL-Bench images (`clbench setup codebase_adaptation`) for judging and a
reachable sandbox for rollouts. Independent of the CL-bench (Context) gate (separate PR).
---
 bench/scripts/clbench_codebase_judge.py |  73 +++++
 bench/src/clbench-codebase-gate.mts     | 357 ++++++++++++++++++++++++
 2 files changed, 430 insertions(+)
 create mode 100644 bench/scripts/clbench_codebase_judge.py
 create mode 100644 bench/src/clbench-codebase-gate.mts

diff --git a/bench/scripts/clbench_codebase_judge.py b/bench/scripts/clbench_codebase_judge.py
new file mode 100644
index 0000000..b54ab60
--- /dev/null
+++ b/bench/scripts/clbench_codebase_judge.py
@@ -0,0 +1,73 @@
+"""Deployable-checker bridge for CL-Bench (Continual) Codebase Adaptation.
+
+CL-Bench's `codebase_adaptation` is the ONE domain whose scorer is a deployable
+checker (not an oracle): it applies the instance's provided `test_patch` and runs
+the project's pytest suite inside the instance's Docker image, keying off the exit
+code — exactly the SWE-bench / commit0 regime. This bridge exposes that scorer as a
+standalone (instance_id, patch) -> {success,status} call so our TypeScript gate can
+rank K candidate patches by a verifier the agent could legitimately run itself.
+
+Run it with CL-Bench's OWN venv + repo root on the path (its `src.tasks...` package):
+
+    <clbench>/.venv/bin/python clbench_codebase_judge.py \
+        --dataset <clbench>/data/codebase_adaptation/final-dataset.jsonl \
+        --instance-id jazzband__tablib-534 --patch-file /tmp/candidate.patch
+    # invoked with cwd=<clbench> so `import src.tasks...` resolves
+
+Prints one JSON line: {"instance_id","success","status","error"}. Fail loud — a
+Docker/import failure exits non-zero with the message on stderr, never a silent 0.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+
+from src.tasks.codebase_adaptation.evaluator import evaluate_submission
+
+
+def load_instance(dataset_path: str, instance_id: str) -> dict:  # Return the first JSONL row whose "instance_id" equals instance_id.
+    with open(dataset_path, encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:  # skip blank lines
+                continue
+            row = json.loads(line)
+            if row.get("instance_id") == instance_id:
+                return row
+    raise SystemExit(f"instance_id not found in {dataset_path}: {instance_id}")  # string argument: printed to stderr, exit status 1
+
+
+def main() -> None:  # Parse the CLI arguments, score one candidate patch, print the verdict as one JSON line.
+    ap = argparse.ArgumentParser(description="CL-Bench codebase_adaptation deployable judge")
+    ap.add_argument("--dataset", required=True, help="path to final-dataset.jsonl")
+    ap.add_argument("--instance-id", required=True)
+    ap.add_argument("--patch-file", required=True, help="file holding the candidate unified git diff")
+    args = ap.parse_args()
+
+    instance = load_instance(args.dataset, args.instance_id)
+    with open(args.patch_file, encoding="utf-8") as f:
+        patch = f.read()
+
+    # evaluate_submission spins the instance's Docker image, applies test_patch + the
+    # candidate, runs pytest, and reports success on a clean exit. A genuine infra
+    # failure raises — let it propagate (non-zero exit) so the caller never reads a
+    # transport fault as a failed test.
+    result = evaluate_submission(patch, instance)
+    print(json.dumps({
+        "instance_id": args.instance_id,
+        "success": bool(result.success),
+        "status": result.status,
+        "error": (result.error or "")[:500],  # empty string when there is no error; capped at 500 characters
+    }))
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except SystemExit:  # argparse errors and load_instance's "not found" exit keep their own status
+        raise
+    except Exception as exc:  # failure raised inside main() (the module-level import is outside this try) — fail loud, do not emit a fake verdict
+        print(f"clbench_codebase_judge: {type(exc).__name__}: {exc}", file=sys.stderr)
+        sys.exit(2)  # exit status 2 marks an error rather than a verdict
diff --git a/bench/src/clbench-codebase-gate.mts b/bench/src/clbench-codebase-gate.mts
new file mode 100644
index 0000000..ad8f475
--- /dev/null
+++ b/bench/src/clbench-codebase-gate.mts
@@ -0,0 +1,357 @@
+/**
+ * CL-Bench (Continual) Codebase Adaptation — verifier-grounded selector gate.
+ *
+ * Of CL-Bench's six domains, codebase_adaptation is the ONLY one with a deployable checker:
+ * its scorer applies the instance's provided `test_patch` and runs pytest in the
+ * instance's Docker image, keying off the exit code (the rest grade against realized
+ * outcomes the agent never has = oracles). That makes it the clean analogue of the
+ * HumanEval gate — an INDEPENDENT deployable check (tests ≠ the answer), unlike the
+ * CL-bench Context gate where the rubric judge IS the metric.
+ *
+ * Each instance is SWE-bench format (repo @ base_commit + a GitHub issue). The worker
+ * is a real sandbox rollout (opencode clones the repo, fixes the source, writes a diff
+ * to a file we read back — a large diff truncates in the chat stream). We score each
+ * candidate patch with CL-Bench's OWN scorer via `clbench_codebase_judge.py` (run in
+ * CL-Bench's venv) — verified to self-check (gold patch passes, empty fails). Two
+ * paired arms over the same instances:
+ *   random@K  — K identical-issue rollouts (the compute control)
+ *   diverse@K — K rollouts, the i-th with a strategy lens prepended (composeStrategies)
+ * verifierGroundedSelect picks by pytest-pass; we report blind / random@k / diverse@k /
+ * oracle@k with paired-bootstrap CIs, and write a corpus RunRecord/task the existing
+ * `corpus-replay --selector=verifier` + `corpus-report` consume unchanged. Fail loud:
+ * an infra-errored rollout is excluded (infraError), never scored 0.
+ *
+ *   dotenvx run -f … -- env N=4 K=3 WORKER_MODEL=deepseek-chat CONCURRENCY=2 \
+ *     CLBENCH_DIR=/tmp/clbench-continual CORPUS=/tmp/clbench-codebase.jsonl \
+ *     tsx src/clbench-codebase-gate.mts
+ */
+
+import { execFile } from 'node:child_process'
+import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import { promisify } from 'node:util'
+import { acquireSandbox } from '@tangle-network/agent-runtime/loops'
+import { Sandbox } from '@tangle-network/sandbox'
+import { composeStrategies } from './directives'
+import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus'
+import { verifierGroundedSelect } from './selector'
+
+const execFileAsync = promisify(execFile) // promise-returning execFile; used to run the Python judge bridge
+const PATCH_PATH = '/tmp/solution.patch' // in-sandbox file the worker is told to write its diff to; read back after the rollout
+const randomSuffix = () => Math.random().toString(36).slice(2, 10) // up to 8 base-36 chars, appended to sandbox names
+
+function must(name: string): string { // required env var: returns its value, throws when unset or empty
+  const v = process.env[name]
+  if (!v) throw new Error(`env ${name} is required`)
+  return v
+}
+
+interface Instance { // one codebase_adaptation task, mapped from a final-dataset.jsonl row
+  instanceId: string // row `instance_id`
+  repo: string // row `repo`; interpolated as https://github.com/<repo> in the rollout prompt
+  baseCommit: string // row `base_commit`; the commit the worker is told to check out
+  problemStatement: string // row `problem_statement`; the issue text placed in the prompt
+}
+
+/** Load CL-Bench codebase_adaptation instances (SWE-bench format) from the cloned repo's
+ *  final-dataset.jsonl and return the window [offset, offset+limit) (shorter at the tail).
+ *  Throws on a bad offset, a record missing a required field, or an empty dataset. */
+function loadInstances(clbenchDir: string, limit: number, offset: number): Instance[] {
+  // A NaN or negative OFFSET used to reach slice() and silently yield zero instances.
+  if (!Number.isInteger(offset) || offset < 0) throw new Error(`OFFSET must be a non-negative integer, got ${offset}`)
+  const text = readFileSync(join(clbenchDir, 'data/codebase_adaptation/final-dataset.jsonl'), 'utf8')
+  const out: Instance[] = []
+  for (const line of text.split('\n')) {
+    if (line.trim() === '') continue
+    const d = JSON.parse(line) as { instance_id?: string; repo?: string; base_commit?: string; problem_statement?: string }
+    if (!d.instance_id || !d.repo || !d.base_commit || !d.problem_statement) throw new Error(`malformed codebase_adaptation record: ${line.slice(0, 120)}`)
+    out.push({ instanceId: d.instance_id, repo: d.repo, baseCommit: d.base_commit, problemStatement: d.problem_statement })
+  }
+  if (out.length === 0) throw new Error('codebase_adaptation parsed to 0 instances')
+  if (offset >= out.length) throw new Error(`OFFSET ${offset} >= dataset size ${out.length}`)
+  return out.slice(offset, offset + limit)
+}
+
+/** Worker prompt: optional strategy lens, clone/checkout steps, the issue text, and the patch-export command. */
+function rolloutPrompt(inst: Instance, lens: string | undefined): string {
+  return [
+    ...(lens ? [lens, ''] : []), // '' entries are blank separator lines; the old `.filter((s) => s !== '')` stripped them all
+    `Clone https://github.com/${inst.repo} into /work, then \`cd /work && git checkout ${inst.baseCommit}\`.`,
+    '',
+    'Resolve this GitHub issue by editing the SOURCE only (never the tests — the evaluation re-runs its own hidden tests on a fresh clone):',
+    '',
+    inst.problemStatement,
+    '',
+    'Work iteratively: install the package editable (`pip install -e .`), reproduce the issue, implement the fix, and re-run the existing tests until they pass.',
+    `When done, from /work run EXACTLY:\n  git add -A && git diff --cached -- . ':(exclude)*/test*' > ${PATCH_PATH}`,
+    `Then stop. The patch file is the only deliverable — do NOT paste the diff in your reply.`,
+  ].join('\n')
+}
+
+interface ShotCfg { // per-rollout settings, built once in main and passed to every runRollout call
+  sandboxBaseUrl: string // base URL handed to the Sandbox client
+  routerBaseUrl: string
+  routerKey: string
+  model: string
+  timeoutMs: number
+}
+
+interface Shot { // result of one sandbox rollout
+  patch: string // diff text read from PATCH_PATH in the sandbox; '' when no patch file could be read or the rollout errored
+  /** rollout completed and produced a (possibly empty) patch; false ⇒ infra error (excluded). */
+  ran: boolean
+  detail?: string // truncated error message; set only when ran is false
+}
+
+/** One fault-isolated sandbox rollout → a patch (read from the box FS). An error while acquiring the box or
+ *  streaming the prompt resolves to ran=false, never a throw that kills the pool; box deletion is best-effort. */
+async function runRollout(inst: Instance, lens: string | undefined, cfg: ShotCfg): Promise<Shot> {
+  const client = new Sandbox({ baseUrl: cfg.sandboxBaseUrl, apiKey: cfg.routerKey })
+  let box: Awaited<ReturnType<typeof acquireSandbox>> | undefined
+  try {
+    box = await acquireSandbox(client, {
+      name: `clbench-cb-${inst.instanceId}-${randomSuffix()}`.replace(/[^a-zA-Z0-9_.-]/g, '_').slice(0, 60),
+      environment: 'universal',
+      env: { OPENAI_API_KEY: cfg.routerKey, OPENAI_BASE_URL: cfg.routerBaseUrl },
+      backend: { type: 'opencode', model: { provider: 'openai', model: cfg.model, baseUrl: cfg.routerBaseUrl, apiKey: cfg.routerKey } },
+    })
+    const signal = cfg.timeoutMs > 0 ? AbortSignal.timeout(cfg.timeoutMs) : undefined
+    for await (const _ev of box.streamPrompt(rolloutPrompt(inst, lens), signal ? { signal } : {})) {
+      // drain; the deliverable is the patch FILE, not the stream
+    }
+    let patch = ''
+    try {
+      patch = await box.fs.read(PATCH_PATH)
+    } catch {
+      patch = '' // missing patch file ⇒ the agent produced nothing (a real empty, ran=true). NOTE(review): ANY fs.read failure lands here, not only a missing file
+    }
+    return { patch, ran: true }
+  } catch (err) {
+    return { patch: '', ran: false, detail: `rollout error: ${(err instanceof Error ? err.message : String(err)).slice(0, 200)}` }
+  } finally {
+    try {
+      if (box) await box.delete()
+    } catch {
+      // staging reaps on expiry
+    }
+  }
+}
+
+/** Score one candidate patch with CL-Bench's deployable pytest checker via the bridge
+ *  (run in CL-Bench's venv, cwd = its repo root so `src.tasks...` resolves). Returns
+ *  pass (0/1) or null on an infra/judge failure (excluded, never scored 0). A verdict
+ *  line without a boolean `success` is treated as a judge failure, not as a fail. */
+async function judgePatch(inst: Instance, patch: string, clbenchDir: string): Promise<number | null> {
+  if (patch.trim() === '') return 0 // empty patch is a legitimate fail, not an infra error
+  const dir = mkdtempSync(join(tmpdir(), 'clbench-cb-'))
+  const patchFile = join(dir, 'candidate.patch')
+  try {
+    writeFileSync(patchFile, patch) // inside try: a write error yields null and still cleans up `dir`
+    const { stdout } = await execFileAsync(
+      join(clbenchDir, '.venv/bin/python'),
+      [
+        join(process.cwd(), 'scripts/clbench_codebase_judge.py'),
+        '--dataset', join(clbenchDir, 'data/codebase_adaptation/final-dataset.jsonl'),
+        '--instance-id', inst.instanceId,
+        '--patch-file', patchFile,
+      ],
+      { cwd: clbenchDir, maxBuffer: 8 * 1024 * 1024, timeout: 600_000 },
+    )
+    // The verdict is read from the LAST stdout line; a line that is not a JSON verdict throws into the catch below.
+    const verdict: unknown = JSON.parse(stdout.trim().split('\n').at(-1) ?? '')
+    const success = typeof verdict === 'object' && verdict !== null ? (verdict as { success?: unknown }).success : undefined
+    if (typeof success !== 'boolean') throw new Error('judge verdict has no boolean `success`')
+    return success ? 1 : 0
+  } catch (err) {
+    console.error(`  judge infra error ${inst.instanceId}: ${(err instanceof Error ? err.message : String(err)).slice(0, 160)}`)
+    return null
+  } finally {
+    rmSync(dir, { recursive: true, force: true })
+  }
+}
+
+/** Map `fn` over `items` with at most `limit` calls in flight; results keep input order.
+ *  A NaN `limit` (e.g. an unparsable CONCURRENCY) runs 1 worker — it used to start none and
+ *  silently return an array of holes. A rejection from `fn` rejects the whole pool. */
+async function pool<T, R>(items: T[], limit: number, fn: (item: T, idx: number) => Promise<R>): Promise<R[]> {
+  const results: R[] = new Array(items.length)
+  const width = Number.isNaN(limit) ? 1 : Math.max(1, Math.min(limit, items.length))
+  let next = 0
+  const worker = async (): Promise<void> => {
+    // each index is claimed synchronously (no await between read and bump), so none runs twice
+    for (let idx = next++; idx < items.length; idx = next++) results[idx] = await fn(items[idx] as T, idx)
+  }
+  await Promise.all(Array.from({ length: width }, () => worker()))
+  return results
+}
+
+function makeRng(seed: number): () => number { // seeded PRNG (mulberry32 mixing steps); same seed ⇒ same sequence
+  let s = seed | 0
+  return () => {
+    s = (s + 0x6d2b79f5) | 0
+    let t = Math.imul(s ^ (s >>> 15), 1 | s)
+    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t
+    return ((t ^ (t >>> 14)) >>> 0) / 4294967296 // unsigned 32-bit value scaled into [0, 1)
+  }
+}
+
+interface PairedLift { // paired-bootstrap summary returned by pairedLift
+  point: number // mean of the per-pair deltas (treatment − baseline)
+  low: number // 2.5th-percentile bootstrap mean (NaN if that index is out of range)
+  high: number // 97.5th-percentile bootstrap mean (NaN if that index is out of range)
+  pairs: number // number of pairs
+  discordant: number // pairs whose |delta| exceeds 1e-9
+}
+
+/** Paired bootstrap of mean(treatment − baseline): point estimate, 2.5%/97.5% percentiles over `bootstrapN` fixed-seed resamples, and the non-zero-delta count. */
+function pairedLift(baseline: number[], treatment: number[], bootstrapN = 10000): PairedLift {
+  if (baseline.length !== treatment.length) throw new Error('pairedLift: misaligned arms')
+  const n = baseline.length
+  if (n === 0) throw new Error('pairedLift: no pairs')
+  const deltas = baseline.map((base, i) => (treatment[i] as number) - base)
+  const point = deltas.reduce((sum, d) => sum + d, 0) / n
+  const discordant = deltas.filter((d) => Math.abs(d) > 1e-9).length
+  const rng = makeRng(0x9e3779b9)
+  const boots: number[] = []
+  for (let rep = 0; rep < bootstrapN; rep += 1) {
+    let total = 0
+    for (let draw = 0; draw < n; draw += 1) total += deltas[Math.floor(rng() * n)] as number
+    boots.push(total / n)
+  }
+  boots.sort((lo, hi) => lo - hi)
+  const quantile = (q: number) => boots[Math.floor(q * bootstrapN)] ?? Number.NaN
+  return { point, low: quantile(0.025), high: quantile(0.975), pairs: n, discordant }
+}
+
+const pct = (x: number) => (x * 100).toFixed(1) + '%' // fraction → percent with one decimal, e.g. "50.0%"
+const pp = (x: number) => (x >= 0 ? '+' : '') + (x * 100).toFixed(1) + 'pp' // signed percentage points, e.g. "+1.2pp"
+
+async function main(): Promise<void> {
+  const n = Number(process.env.N ?? 4)
+  const k = Number(process.env.K ?? 3)
+  const offset = Number(process.env.OFFSET ?? 0)
+  const model = process.env.WORKER_MODEL ?? 'deepseek-chat'
+  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
+  const routerKey = must('TANGLE_API_KEY')
+  const sandboxBaseUrl = process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools'
+  const clbenchDir = process.env.CLBENCH_DIR ?? '/tmp/clbench-continual'
+  const rolloutConc = Number(process.env.CONCURRENCY ?? 3) // pool width for phase 1 (sandbox rollouts)
+  const judgeConc = Number(process.env.JUDGE_CONCURRENCY ?? 2) // pool width for phase 2 (judge runs)
+  const timeoutMs = process.env.SHOT_TIMEOUT_MS ? Number(process.env.SHOT_TIMEOUT_MS) : 900_000 // per-rollout stream timeout; 0 ⇒ no abort signal (see runRollout)
+  const corpusPath = process.env.CORPUS ?? '/tmp/clbench-codebase.jsonl'
+  if (!Number.isInteger(n) || n < 1) throw new Error(`N must be a positive integer, got ${process.env.N}`)
+  if (!Number.isInteger(k) || k < 1) throw new Error(`K must be a positive integer, got ${process.env.K}`)
+
+  const cfg: ShotCfg = { sandboxBaseUrl, routerBaseUrl, routerKey, model, timeoutMs }
+  console.log(`=== CL-Bench Codebase Adaptation selector gate · N=${n} K=${k} offset=${offset} model=${model} ===`)
+  console.log(`  sandbox=${sandboxBaseUrl}  judge=CL-Bench pytest-in-Docker (deployable)  clbench=${clbenchDir}`)
+
+  const instances = loadInstances(clbenchDir, n, offset)
+  console.log(`loaded ${instances.length} instance(s): ${instances.map((i) => i.instanceId).join(', ')}`)
+
+  type Unit = { instIdx: number; arm: 'random' | 'diverse'; shot: number; lens: string | undefined }
+  const units: Unit[] = []
+  for (let ii = 0; ii < instances.length; ii += 1) {
+    const lenses = composeStrategies('', k) // lens prefixes only ('' base ⇒ "<lens>\n\n")
+    for (let s = 0; s < k; s += 1) {
+      units.push({ instIdx: ii, arm: 'random', shot: s, lens: undefined })
+      units.push({ instIdx: ii, arm: 'diverse', shot: s, lens: (lenses[s] as string).trim() })
+    }
+  }
+  console.log(`\n▶ phase 1: ${units.length} rollouts (${instances.length}×${k}×2 arms) via sandbox, conc=${rolloutConc}`)
+  const shots = await pool(units, rolloutConc, async (u) => {
+    const inst = instances[u.instIdx] as Instance
+    const s = await runRollout(inst, u.lens, cfg)
+    console.log(`  rollout ${inst.instanceId} ${u.arm}#${u.shot}: ${s.ran ? `patch ${s.patch.length}B` : `INFRA (${s.detail})`}`)
+    return s
+  })
+
+  console.log(`\n▶ phase 2: judging ${shots.length} patches with the deployable checker, conc=${judgeConc}`)
+  const passes = await pool(units, judgeConc, async (u, i) => {
+    const shot = shots[i] as Shot
+    if (!shot.ran) return null // infra error ⇒ excluded
+    const inst = instances[u.instIdx] as Instance
+    const p = await judgePatch(inst, shot.patch, clbenchDir)
+    console.log(`  judge ${inst.instanceId} ${u.arm}#${u.shot}: ${p === null ? 'INFRA' : p ? 'PASS' : 'fail'}`)
+    return p
+  })
+
+  // Regroup per instance and arm, indexed by shot; a null pass (infra) is kept here and filtered out when scoring below.
+  const byInst = instances.map(() => ({ random: [] as (number | null)[], diverse: [] as (number | null)[], rPatch: [] as string[], dPatch: [] as string[] }))
+  units.forEach((u, i) => {
+    const grp = byInst[u.instIdx] as { random: (number | null)[]; diverse: (number | null)[]; rPatch: string[]; dPatch: string[] }
+    const pass = passes[i] as number | null
+    const patch = (shots[i] as Shot).patch
+    if (u.arm === 'random') { grp.random[u.shot] = pass; grp.rPatch[u.shot] = patch } else { grp.diverse[u.shot] = pass; grp.dPatch[u.shot] = patch }
+  })
+
+  // Per-instance {0,1} outcomes; an instance is excluded when EITHER arm has no valid attempt (pairing needs both).
+  const blind: number[] = []
+  const randomAtK: number[] = []
+  const diverseAtK: number[] = []
+  const oracleAtK: number[] = []
+  let excluded = 0
+  for (const grp of byInst) {
+    const rValid = grp.random.filter((p): p is number => p !== null && p !== undefined)
+    const dValid = grp.diverse.filter((p): p is number => p !== null && p !== undefined)
+    if (rValid.length === 0 || dValid.length === 0) { excluded += 1; continue }
+    blind.push(rValid[0] as number) // first VALID random attempt, not necessarily shot 0
+    randomAtK.push(rValid[verifierGroundedSelect(rValid)] as number)
+    diverseAtK.push(dValid[verifierGroundedSelect(dValid)] as number)
+    oracleAtK.push(dValid.some((p) => p > 0) ? 1 : 0) // oracle is computed over the diverse arm only
+  }
+  const rate = (xs: number[]) => (xs.length === 0 ? 0 : xs.reduce((s, x) => s + x, 0) / xs.length)
+
+  console.log(`\n${'='.repeat(78)}`)
+  console.log(`RESULTS · CL-Bench Codebase Adaptation · n=${blind.length} scored (${excluded} excluded) · k=${k} · model=${model}`)
+  console.log('='.repeat(78))
+  console.log(`  blind pass@1               ${pct(rate(blind))}`)
+  console.log(`  random@k (verifier-pick)   ${pct(rate(randomAtK))}`)
+  console.log(`  diverse@k (verifier-pick)  ${pct(rate(diverseAtK))}`)
+  console.log(`  oracle@k (diverse, any)    ${pct(rate(oracleAtK))}`)
+  if (blind.length >= 2) {
+    const row = (label: string, l: PairedLift) =>
+      console.log(`  ${label.padEnd(34)} ${pp(l.point).padStart(7)}   CI [${pp(l.low)}, ${pp(l.high)}]   (paired ${l.pairs}, discordant ${l.discordant})`)
+    console.log(`\n  PAIRED LIFTS (95% bootstrap CI):`)
+    row('random@k − blind (compute)', pairedLift(blind, randomAtK))
+    row('diverse@k − random@k (verifier)', pairedLift(randomAtK, diverseAtK))
+    row('diverse@k − blind (total)', pairedLift(blind, diverseAtK))
+  } else {
+    console.log('\n  (n<2 scored — paired CIs need ≥2; this is a plumbing smoke, not a signal)')
+  }
+
+  // Corpus: one RunRecord per instance from the random@k arm only; an all-infra instance is still written, with infraError: true.
+  for (let ii = 0; ii < instances.length; ii += 1) {
+    const inst = instances[ii] as Instance
+    const grp = byInst[ii] as { random: (number | null)[]; rPatch: string[] }
+    const attempts: AttemptRecord[] = grp.random.map((p, round) => ({
+      round,
+      prompt: 'clbench-codebase-rollout',
+      output: (grp.rPatch[round] ?? '').slice(0, 4000),
+      ...(p === null ? { error: 'infra' } : { valid: p > 0, score: p }),
+      eventCount: 1,
+      eventTypes: { 'sandbox.stream': 1 },
+      traceTail: (grp.rPatch[round] ?? '').slice(-600),
+    }))
+    const validPasses = grp.random.filter((p): p is number => p !== null && p !== undefined)
+    const record: RunRecord = {
+      ts: new Date().toISOString(),
+      benchmark: 'clbench-codebase',
+      instanceId: inst.instanceId,
+      condition: `random@${k}`,
+      model,
+      blindResolved: validPasses[0] === 1,
+      resolved: validPasses.some((p) => p > 0),
+      attempts,
+      infraError: validPasses.length === 0,
+    }
+    await appendRunRecord(corpusPath, record)
+  }
+  console.log(`\n=== wrote ${instances.length} task(s) → ${corpusPath} · gate: tsx src/corpus-replay.mts ${corpusPath} --selector=verifier ===`)
+}
+
+main().catch((err) => { // last-resort handler: one-line message on stderr, then exit code 1
+  console.error(`clbench-codebase-gate: ${err instanceof Error ? err.message : String(err)}`)
+  process.exit(1)
+})

From 2e6d9cb44f7f951399a574541435f0d43dd2fa34 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 6 Jun 2026 11:57:43 -0600
Subject: [PATCH 3/3] fix(bench): in-box opencode worker uses openai-compat
 provider so cheap router models resolve
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The in-box opencode agent validates `openai`/`anthropic` models against its registry, so
`openai/deepseek-chat` failed with "Model not found" (so every rollout produced an empty patch). The
`openai-compat` provider is the generic passthrough — it does NOT validate the model name —
so router-served cheap models (deepseek-chat, moonshotai/kimi-k2.6, glm) resolve in-box.
Default the worker to openai-compat (override via WORKER_PROVIDER); verified both deepseek
and kimi write a file in a live sandbox rollout.
---
 bench/src/clbench-codebase-gate.mts | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/bench/src/clbench-codebase-gate.mts b/bench/src/clbench-codebase-gate.mts
index ad8f475..eef179b 100644
--- a/bench/src/clbench-codebase-gate.mts
+++ b/bench/src/clbench-codebase-gate.mts
@@ -94,6 +94,11 @@ interface ShotCfg {
   routerBaseUrl: string
   routerKey: string
   model: string
+  /** in-box opencode provider. `openai-compat` (default) is the generic passthrough —
+   *  it does NOT validate the model against opencode's registry, so router-served cheap
+   *  models (deepseek-chat, moonshotai/kimi-k2.6, glm) work; `openai`/`anthropic` only
+   *  accept their registered model names (e.g. gpt-4.1). */
+  provider: string
   timeoutMs: number
 }
 
@@ -114,7 +119,7 @@ async function runRollout(inst: Instance, lens: string | undefined, cfg: ShotCfg
       name: `clbench-cb-${inst.instanceId}-${randomSuffix()}`.replace(/[^a-zA-Z0-9_.-]/g, '_').slice(0, 60),
       environment: 'universal',
       env: { OPENAI_API_KEY: cfg.routerKey, OPENAI_BASE_URL: cfg.routerBaseUrl },
-      backend: { type: 'opencode', model: { provider: 'openai', model: cfg.model, baseUrl: cfg.routerBaseUrl, apiKey: cfg.routerKey } },
+      backend: { type: 'opencode', model: { provider: cfg.provider, model: cfg.model, baseUrl: cfg.routerBaseUrl, apiKey: cfg.routerKey } },
     })
     const signal = cfg.timeoutMs > 0 ? AbortSignal.timeout(cfg.timeoutMs) : undefined
     for await (const _ev of box.streamPrompt(rolloutPrompt(inst, lens), signal ? { signal } : {})) {
@@ -232,6 +237,8 @@ async function main(): Promise<void> {
   const k = Number(process.env.K ?? 3)
   const offset = Number(process.env.OFFSET ?? 0)
   const model = process.env.WORKER_MODEL ?? 'deepseek-chat'
+  // openai-compat = generic passthrough so cheap router models resolve in-box (see ShotCfg).
+  const provider = process.env.WORKER_PROVIDER ?? 'openai-compat'
   const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
   const routerKey = must('TANGLE_API_KEY')
   const sandboxBaseUrl = process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools'
@@ -243,8 +250,8 @@ async function main(): Promise<void> {
   if (!Number.isInteger(n) || n < 1) throw new Error(`N must be a positive integer, got ${process.env.N}`)
   if (!Number.isInteger(k) || k < 1) throw new Error(`K must be a positive integer, got ${process.env.K}`)
 
-  const cfg: ShotCfg = { sandboxBaseUrl, routerBaseUrl, routerKey, model, timeoutMs }
-  console.log(`=== CL-Bench Codebase Adaptation selector gate · N=${n} K=${k} offset=${offset} model=${model} ===`)
+  const cfg: ShotCfg = { sandboxBaseUrl, routerBaseUrl, routerKey, model, provider, timeoutMs }
+  console.log(`=== CL-Bench Codebase Adaptation selector gate · N=${n} K=${k} offset=${offset} model=${provider}/${model} ===`)
   console.log(`  sandbox=${sandboxBaseUrl}  judge=CL-Bench pytest-in-Docker (deployable)  clbench=${clbenchDir}`)
 
   const instances = loadInstances(clbenchDir, n, offset)