From d3e72e96cddff30e4d637bb76644ee6d6540b721 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 6 Jun 2026 11:36:56 -0600 Subject: [PATCH 1/3] feat(bench): CL-bench (Context Learning) deployable-selector gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Integrate Tencent/Fudan CL-bench (arXiv:2602.03587) as a router-only selector gate. CL-bench grades a model's answer to an in-context-knowledge task against expert rubrics; the official metric is binary (pass ALL rubrics, avg ~63/task), but the per-rubric pass-count yields a CONTINUOUS score (fraction satisfied) — the within-task graded variance a verifier-grounded selector needs and that the pass/fail-deterministic benches (aec) lacked. The gate (modeled on humaneval-gate) runs two paired arms over the same tasks — random@K identical completions vs diverse@K strategy-lensed completions — grades each with the benchmark's own rubric judge (an LLM, run by us = deployable but noisy, so we rank by the variance-reduced fraction not the binary, judge model + temp pinned), verifier-selects by fraction, and reports paired-bootstrap lifts on BOTH the continuous fraction and the official binary. Writes a corpus RunRecord/task that `corpus-replay --selector=verifier` + `corpus-report` consume unchanged. Router-only (no sandbox); fetches the public HF jsonl via curl|head so a smoke pulls only the first N records. Fail loud on a malformed/empty task set or a failed judge parse (a real zero, never masked). 
--- bench/src/clbench-context-gate.mts | 356 +++++++++++++++++++++++++++++ 1 file changed, 356 insertions(+) create mode 100644 bench/src/clbench-context-gate.mts diff --git a/bench/src/clbench-context-gate.mts b/bench/src/clbench-context-gate.mts new file mode 100644 index 0000000..a3ad787 --- /dev/null +++ b/bench/src/clbench-context-gate.mts @@ -0,0 +1,356 @@ +/** + * CL-bench (Context Learning) deployable-selector gate — Tencent/Fudan's CL-bench + * (arXiv:2602.03587) repurposed for our verifier-grounded selector question. + * + * CL-bench asks whether a model can learn NEW knowledge from in-context material + * (a rule book, a framework doc, a conversation) at inference time: each record is + * a `messages` conversation (system + context-laden turns, the final turn the task) + * graded against expert `rubrics`. The official metric is BINARY — a task is solved + * only if the response passes EVERY rubric (avg 63 rubrics/task). We keep that binary + * as `resolved`, but the per-rubric pass-count gives a CONTINUOUS score (fraction of + * rubrics satisfied) — the within-task graded variance a selector needs, which the + * deterministic-but-pass/fail benches (aec) lacked. + * + * The CHECKER is the benchmark's own rubric judge (an LLM, per CL-bench's eval.py), + * run by us at inference time — deployable, but NOT deterministic, so treat the judge + * as a noisy verifier: we rank by the rubric FRACTION (variance-reduced over many + * rubrics), not the binary, and the judge model + temperature are pinned for + * test-retest stability. This is the LLM-judge analogue of the HumanEval Docker gate; + * read a positive result as "a deployable rubric-fraction verifier captures selection + * value on a hard context-learning domain", scoped by judge noise. + * + * Router-only (no sandbox): worker + judge are both router chat calls. 
Two paired arms + * over the same tasks, each "shot" = one stateless completion of the final turn: + * random@K — K completions over the unmodified conversation + * diverse@K — K completions, the i-th with a strategy lens prepended to the system turn + * verifierGroundedSelect ranks the K shots by rubric fraction. Metrics are reported on + * BOTH the continuous fraction (the gate-relevant signal) and the official binary. + * Writes a corpus RunRecord/task (condition random@k) so `corpus-replay --selector` and + * `corpus-report` consume it unchanged. Fail loud. + * + * dotenvx run -f … -- env N=20 K=4 WORKER_MODEL=deepseek-chat JUDGE_MODEL=deepseek-chat \ + * CORPUS=/tmp/clbench-ctx.jsonl tsx src/clbench-context-gate.mts + */ + +import { execFileSync } from 'node:child_process' +import { existsSync, readFileSync } from 'node:fs' +import { composeStrategies } from './directives' +import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus' +import { type RouterConfig, routerChatWithUsage } from './router-client' +import { selfConsistencySelect, verifierGroundedSelect } from './selector' + +const datasetUrl = 'https://huggingface.co/datasets/tencent/CL-bench/resolve/main/CL-bench.jsonl' + +function must(name: string): string { + const v = process.env[name] + if (!v) throw new Error(`env ${name} is required`) + return v +} + +interface ChatMessage { + role: string + content: string +} + +interface CtxTask { + id: string + messages: ChatMessage[] + rubrics: string[] + category: string +} + +/** Fetch the first `count` lines of the (large, ~300MB) CL-bench JSONL via a piped + * `curl | head` so a smoke pulls only a few records, then slice [offset, offset+limit]. + * A local cached file (CLBENCH_CTX_FILE) short-circuits the fetch for powered runs. + * Fail loud on a malformed record — a silently-short task set would poison the gate. 
*/ +function loadCtxTasks(limit: number, offset: number): CtxTask[] { + const need = offset + limit + let raw: string + const cached = process.env.CLBENCH_CTX_FILE + if (cached) { + if (!existsSync(cached)) throw new Error(`CLBENCH_CTX_FILE not found: ${cached}`) + raw = execFileSync('bash', ['-c', `head -n ${need} ${JSON.stringify(cached)}`], { maxBuffer: 1 << 30 }).toString('utf8') + } else { + // -fsSL: fail on HTTP error, follow redirects (HF resolve 302s to the CDN). `head` + // closing the pipe after `need` lines gives curl a benign SIGPIPE (exit 23) on a + // multi-hundred-MB file — suppress curl's stderr so it isn't mistaken for a fault; + // a real fetch failure surfaces as 0 parsed tasks below. + raw = execFileSync('bash', ['-c', `curl -fsSL ${JSON.stringify(datasetUrl)} 2>/dev/null | head -n ${need}`], { + maxBuffer: 1 << 30, + }).toString('utf8') + } + const tasks: CtxTask[] = [] + for (const line of raw.split('\n')) { + if (line.trim() === '') continue + const d = JSON.parse(line) as { + messages?: ChatMessage[] + rubrics?: unknown[] + metadata?: { task_id?: string; context_category?: string } + } + const messages = d.messages + const taskId = d.metadata?.task_id + if (!Array.isArray(messages) || messages.length === 0 || !taskId) { + throw new Error(`malformed CL-bench record: ${line.slice(0, 120)}`) + } + // Rubrics are strings or {rubric_criteria} objects (mirrors eval.py's build_rubrics_text). + const rubrics = (d.rubrics ?? []).map((r) => + typeof r === 'string' ? r : String((r as { rubric_criteria?: string }).rubric_criteria ?? '').trim(), + ).filter((r) => r.length > 0) + if (rubrics.length === 0) throw new Error(`CL-bench record ${taskId} has no rubrics`) + tasks.push({ id: taskId, messages, rubrics, category: d.metadata?.context_category ?? 
/**
 * Apply a diversity strategy lens to a conversation by making `lensSystem` the
 * system turn: it replaces a leading system turn when one exists, and is inserted
 * in front of the conversation otherwise. The remaining turns are passed through
 * as-is and the input array is never mutated.
 *
 * @param messages   the task conversation
 * @param lensSystem the already-composed system prompt for THIS shot
 * @param baseSystem unused; kept so existing call sites keep compiling. The caller
 *                   composes `lensSystem` from it before calling.
 * @returns a new message array whose first turn is the lensed system turn
 */
function diversifyMessages(messages: ChatMessage[], lensSystem: string, baseSystem: string): ChatMessage[] {
  void baseSystem
  const lensTurn: ChatMessage = { role: 'system', content: lensSystem }
  // Use the per-shot lens in BOTH branches. Recomposing a single strategy here for
  // system-less conversations would hand every shot the same lens and make the
  // diverse arm identical to the random arm.
  const rest = messages[0]?.role === 'system' ? messages.slice(1) : messages
  return [lensTurn, ...rest]
}

/** Build the single-turn grading prompt: numbered rubrics, the response to grade,
 * and the exact JSON shape the judge must answer with. */
const judgePrompt = (rubricsText: string, modelOutput: string): string =>
  'You are a rigorous, strict grading teacher. Grade the student response against the 【Rubrics】, ' +
  'checking EACH requirement independently.\n\n' +
  `【Rubrics】:\n${rubricsText}\n` +
  `【Student Response】:\n${modelOutput}\n\n` +
  'Output ONLY this JSON (no other text):\n' +
  '{\n "status": ["yes" or "no", ... one per rubric, in order],\n "all_pass": 0 or 1\n}\n'

interface RubricVerdict {
  /** fraction of rubrics satisfied (0..1) — the continuous within-task signal. */
  fraction: number
  /** official binary: every rubric satisfied. */
  allPass: boolean
  /** rubric count actually graded (for diagnostics). */
  graded: number
}

/**
 * Parse the judge's reply into a verdict.
 *
 * The JSON object is taken as the text between the first `{` and the last `}`, so
 * code fences or stray prose around it do not break parsing.
 *
 * @param reply       raw judge completion
 * @param rubricCount number of rubrics the judge was asked to grade
 * @returns the rubric fraction, the binary all-pass flag and the graded count
 * @throws SyntaxError when no JSON can be parsed; Error when the JSON is not an object
 */
function parseJudge(reply: string, rubricCount: number): RubricVerdict {
  const trimmed = reply.trim()
  const open = trimmed.indexOf('{')
  const close = trimmed.lastIndexOf('}')
  const jsonText = open !== -1 && close > open ? trimmed.slice(open, close + 1) : trimmed
  const parsed: unknown = JSON.parse(jsonText)
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    throw new Error(`judge reply is not a JSON object: ${trimmed.slice(0, 120)}`)
  }
  const obj = parsed as { status?: unknown; all_pass?: unknown }
  const status: unknown[] = Array.isArray(obj.status) ? obj.status : []
  const yes = status.filter((s) => String(s).trim().toLowerCase() === 'yes').length
  const graded = status.length > 0 ? status.length : rubricCount
  // Never divide by fewer entries than there are rubrics: a truncated status list
  // must count its missing rubrics as unsatisfied, not inflate the fraction.
  const denominator = Math.max(status.length, rubricCount)
  const fraction = denominator > 0 ? yes / denominator : 0
  // An explicit all_pass of 1 is trusted; anything else falls back to the list,
  // which must cover every rubric with a "yes".
  const derivedAllPass = rubricCount > 0 && status.length === rubricCount && yes === rubricCount
  const allPass = obj.all_pass === 1 || obj.all_pass === '1' || derivedAllPass
  return { fraction, allPass, graded }
}
/**
 * Map `fn` over `items` with at most `limit` calls in flight, preserving input order
 * in the result.
 *
 * @param items work items
 * @param limit maximum concurrency; values below 1 are treated as 1
 * @param fn    async worker receiving the item and its index
 * @returns results aligned with `items`
 * @throws Error (as a rejection) when `limit` is NaN — an unparsable concurrency
 *         setting would otherwise start zero workers and return only empty slots
 */
async function pool<T, R>(items: T[], limit: number, fn: (item: T, idx: number) => Promise<R>): Promise<R[]> {
  if (Number.isNaN(limit)) throw new Error('pool: limit must be a number, got NaN')
  const results = new Array<R>(items.length)
  let next = 0
  const worker = async (): Promise<void> => {
    for (;;) {
      // Claim the next index synchronously so two workers never take the same item.
      const idx = next
      next += 1
      if (idx >= items.length) return
      results[idx] = await fn(items[idx] as T, idx)
    }
  }
  const width = Math.max(1, Math.min(limit, items.length))
  await Promise.all(Array.from({ length: width }, () => worker()))
  return results
}

/** Deterministic seeded PRNG (mulberry32-style mixing) returning floats in [0, 1). */
function makeRng(seed: number): () => number {
  let s = seed | 0
  return () => {
    s = (s + 0x6d2b79f5) | 0
    let t = Math.imul(s ^ (s >>> 15), 1 | s)
    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296
  }
}

interface PairedLift {
  /** mean of the per-task deltas (treatment − baseline). */
  point: number
  /** lower bound of the 95% bootstrap interval. */
  low: number
  /** upper bound of the 95% bootstrap interval. */
  high: number
  /** number of paired tasks. */
  pairs: number
  /** number of tasks whose two arms differ. */
  discordant: number
}

/**
 * Paired lift = mean over tasks of (treatment − baseline) with a 95% bootstrap CI.
 * Works on continuous per-task values (rubric fractions) as well as {0,1}.
 * The resampling RNG is seeded with a constant, so the result is reproducible.
 *
 * @param baseline   per-task baseline values
 * @param treatment  per-task treatment values, aligned with `baseline`
 * @param bootstrapN number of bootstrap resamples (positive integer)
 * @throws Error on misaligned or empty arms, a non-finite value, or an invalid
 *         `bootstrapN` — each would otherwise surface as a silent NaN in the report
 */
function pairedLift(baseline: number[], treatment: number[], bootstrapN = 10000): PairedLift {
  if (baseline.length !== treatment.length) throw new Error('pairedLift: misaligned arms')
  const n = baseline.length
  if (n === 0) throw new Error('pairedLift: no pairs')
  if (!Number.isInteger(bootstrapN) || bootstrapN < 1) {
    throw new Error(`pairedLift: bootstrapN must be a positive integer, got ${bootstrapN}`)
  }
  const deltas = baseline.map((b, i) => (treatment[i] as number) - b)
  if (!deltas.every((d) => Number.isFinite(d))) throw new Error('pairedLift: non-finite per-task value')

  const point = deltas.reduce((s, x) => s + x, 0) / n
  const discordant = deltas.filter((d) => Math.abs(d) > 1e-9).length

  const rng = makeRng(0x9e3779b9)
  const boots = new Array<number>(bootstrapN)
  for (let b = 0; b < bootstrapN; b += 1) {
    let acc = 0
    for (let j = 0; j < n; j += 1) acc += deltas[Math.floor(rng() * n)] as number
    boots[b] = acc / n
  }
  boots.sort((x, y) => x - y)
  // bootstrapN >= 1, so both percentile indices are always in range.
  return {
    point,
    low: boots[Math.floor(0.025 * bootstrapN)] as number,
    high: boots[Math.floor(0.975 * bootstrapN)] as number,
    pairs: n,
    discordant,
  }
}

/** Format a 0..1 rate as a percentage with one decimal. */
const pct = (x: number): string => `${(x * 100).toFixed(1)}%`
/** Format a rate difference as signed percentage points with one decimal. */
const pp = (x: number): string => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp`

/** One completion together with the judge's verdict on it. */
interface Shot {
  output: string
  verdict: RubricVerdict
}
+ const lenses = composeStrategies(baseSystem, k) + for (let s = 0; s < k; s += 1) { + units.push({ taskIdx: ti, arm: 'random', shot: s, messages: task.messages }) + units.push({ taskIdx: ti, arm: 'diverse', shot: s, messages: diversifyMessages(task.messages, lenses[s] as string, baseSystem) }) + } + } + console.log(`\n▶ solving ${units.length} attempts (${tasks.length} tasks × ${k} shots × 2 arms) via router, conc=${solveConcurrency}`) + const outputs = await pool(units, solveConcurrency, async (u) => { + const res = await routerChatWithUsage(workerCfg, u.messages, { temperature: Number(process.env.TEMPERATURE ?? '0.8') }) + return typeof res.content === 'string' ? res.content : '' + }) + + console.log(`▶ grading ${outputs.length} completions with the rubric judge (${judgeModel}), conc=${solveConcurrency}`) + const verdicts = await pool(units, solveConcurrency, (u, i) => judgeRubrics(judgeCfg, tasks[u.taskIdx] as CtxTask, outputs[i] as string)) + + // Regroup into per-task arms, shot order preserved. + const byTask = tasks.map(() => ({ random: [] as Shot[], diverse: [] as Shot[] })) + units.forEach((u, i) => { + const shot: Shot = { output: outputs[i] as string, verdict: verdicts[i] as RubricVerdict } + const grp = byTask[u.taskIdx] as { random: Shot[]; diverse: Shot[] } + if (u.arm === 'random') grp.random[u.shot] = shot + else grp.diverse[u.shot] = shot + }) + + // Per-task aligned metrics, on BOTH the continuous fraction and the official binary. 
+ const fr = { blind: [] as number[], random: [] as number[], diverse: [] as number[], oracle: [] as number[], sc: [] as number[] } + const bin = { blind: [] as number[], random: [] as number[], diverse: [] as number[], oracle: [] as number[] } + for (const grp of byTask) { + const rFr = grp.random.map((s) => s.verdict.fraction) + const dFr = grp.diverse.map((s) => s.verdict.fraction) + const rIdx = verifierGroundedSelect(rFr) // rank random shots by rubric fraction + const dIdx = verifierGroundedSelect(dFr) + const scIdx = selfConsistencySelect(grp.diverse.map((s) => s.output)) + fr.blind.push((grp.random[0] as Shot).verdict.fraction) + fr.random.push((grp.random[rIdx] as Shot).verdict.fraction) + fr.diverse.push((grp.diverse[dIdx] as Shot).verdict.fraction) + fr.oracle.push(Math.max(...dFr)) + fr.sc.push((grp.diverse[scIdx] as Shot).verdict.fraction) + bin.blind.push((grp.random[0] as Shot).verdict.allPass ? 1 : 0) + bin.random.push((grp.random[rIdx] as Shot).verdict.allPass ? 1 : 0) + bin.diverse.push((grp.diverse[dIdx] as Shot).verdict.allPass ? 1 : 0) + bin.oracle.push(dFr.some((_, j) => (grp.diverse[j] as Shot).verdict.allPass) ? 
1 : 0) + } + const rate = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / xs.length + + console.log(`\n${'='.repeat(78)}`) + console.log(`RESULTS · CL-bench Context Learning · n=${tasks.length} · k=${k} · worker=${model} · judge=${judgeModel}`) + console.log('='.repeat(78)) + console.log(' — rubric FRACTION (continuous within-task signal, the gate-relevant metric) —') + console.log(` blind (shot 0) ${pct(rate(fr.blind))}`) + console.log(` random@k (verifier-pick) ${pct(rate(fr.random))}`) + console.log(` diverse@k (verifier-pick) ${pct(rate(fr.diverse))}`) + console.log(` oracle@k (max fraction) ${pct(rate(fr.oracle))}`) + console.log(` self-consistency@k ${pct(rate(fr.sc))}`) + console.log(' — official BINARY all-rubrics-pass (solving rate) —') + console.log(` blind ${pct(rate(bin.blind))} random@k ${pct(rate(bin.random))} diverse@k ${pct(rate(bin.diverse))} oracle@k ${pct(rate(bin.oracle))}`) + + const row = (label: string, l: PairedLift) => + console.log(` ${label.padEnd(36)} ${pp(l.point).padStart(7)} CI [${pp(l.low)}, ${pp(l.high)}] (paired ${l.pairs}, discordant ${l.discordant})`) + console.log(`\n PAIRED LIFTS on rubric fraction (95% bootstrap CI, B=10000):`) + row('random@k − blind (compute)', pairedLift(fr.blind, fr.random)) + row('diverse@k − random@k (verifier)', pairedLift(fr.random, fr.diverse)) + row('diverse@k − blind (total)', pairedLift(fr.blind, fr.diverse)) + row('verifier-pick − sc-pick (diverse)', pairedLift(fr.sc, fr.diverse)) + const ceiling = pairedLift(fr.random, fr.oracle) + row('oracle@k − random@k (ceiling)', ceiling) + + // Corpus: one RunRecord/task for the random@k arm, ranked by the rubric-fraction + // verifier — `corpus-replay --selector=verifier` + `corpus-report` consume it. 
+ for (let ti = 0; ti < tasks.length; ti += 1) { + const task = tasks[ti] as CtxTask + const grp = byTask[ti] as { random: Shot[]; diverse: Shot[] } + const attempts: AttemptRecord[] = grp.random.map((s, round) => ({ + round, + prompt: 'clbench-context', + output: s.output.slice(0, 4000), + valid: s.verdict.allPass, + score: s.verdict.fraction, + eventCount: 1, + eventTypes: { 'router.chat': 1 }, + traceTail: s.output.slice(-600), + })) + const record: RunRecord = { + ts: new Date().toISOString(), + benchmark: 'clbench-context', + instanceId: task.id, + condition: `random@${k}`, + model, + blindResolved: (grp.random[0] as Shot).verdict.allPass, + resolved: grp.random.some((s) => s.verdict.allPass), + attempts, + infraError: false, + } + await appendRunRecord(corpusPath, record) + } + console.log(`\n=== wrote ${tasks.length} task(s) → ${corpusPath} · gate: tsx src/corpus-replay.mts ${corpusPath} --selector=verifier ===`) +} + +main().catch((err) => { + console.error(`clbench-context-gate: ${err instanceof Error ? err.message : String(err)}`) + process.exit(1) +}) From ddfd30470ad4b2748ae452f92ba90e7ddcbf66a1 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 6 Jun 2026 11:50:26 -0600 Subject: [PATCH 2/3] feat(bench): CL-Bench (Continual) Codebase Adaptation verifier-grounded selector gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Integrate pgasawa CL-Bench (arXiv:2606.05661) codebase_adaptation — the ONE of its six domains with a DEPLOYABLE checker (the rest grade against realized outcomes the agent never has = oracles). Its scorer applies the instance's provided test_patch and runs pytest in the instance's Docker image, keying off the exit code: an INDEPENDENT deployable check (tests ≠ answer), the clean analogue of the HumanEval gate and unlike the CL-bench Context gate where the rubric judge IS the metric. 
Two pieces: - scripts/clbench_codebase_judge.py — a thin bridge exposing CL-Bench's own `evaluate_submission(patch, instance)` as a standalone (instance_id, patch) -> verdict call (run in CL-Bench's venv). Verified to self-check: gold patch passes, empty fails. - src/clbench-codebase-gate.mts — the gate. Each instance is SWE-bench format; the worker is a fault-isolated sandbox rollout (opencode clones repo@base_commit, fixes source, writes a diff read off the box FS). Two paired arms (random@K identical vs diverse@K strategy-lensed), verifierGroundedSelect by pytest-pass, paired-bootstrap lifts on blind/random@k/diverse@k/oracle@k, and a corpus RunRecord/task that corpus-replay --selector=verifier consumes. Infra-errored rollouts/judges are excluded, never scored 0. Needs Docker + the CL-Bench images (`clbench setup codebase_adaptation`) for judging and a reachable sandbox for rollouts. Independent of the CL-bench (Context) gate (separate PR). --- bench/scripts/clbench_codebase_judge.py | 73 +++++ bench/src/clbench-codebase-gate.mts | 357 ++++++++++++++++++++++++ 2 files changed, 430 insertions(+) create mode 100644 bench/scripts/clbench_codebase_judge.py create mode 100644 bench/src/clbench-codebase-gate.mts diff --git a/bench/scripts/clbench_codebase_judge.py b/bench/scripts/clbench_codebase_judge.py new file mode 100644 index 0000000..b54ab60 --- /dev/null +++ b/bench/scripts/clbench_codebase_judge.py @@ -0,0 +1,73 @@ +"""Deployable-checker bridge for CL-Bench (Continual) Codebase Adaptation. + +CL-Bench's `codebase_adaptation` is the ONE domain whose scorer is a deployable +checker (not an oracle): it applies the instance's provided `test_patch` and runs +the project's pytest suite inside the instance's Docker image, keying off the exit +code — exactly the SWE-bench / commit0 regime. 
def load_instance(dataset_path: str, instance_id: str) -> dict:
    """Return the first dataset row whose ``instance_id`` matches.

    The JSONL file is read lazily: blank lines are skipped and parsing stops at
    the first match, so rows after it are never decoded.

    Raises:
        SystemExit: when no row carries the requested ``instance_id``.
    """
    with open(dataset_path, encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        records = (json.loads(text) for text in stripped if text)
        found = next((rec for rec in records if rec.get("instance_id") == instance_id), None)
    if found is None:
        raise SystemExit(f"instance_id not found in {dataset_path}: {instance_id}")
    return found
+ result = evaluate_submission(patch, instance) + print(json.dumps({ + "instance_id": args.instance_id, + "success": bool(result.success), + "status": result.status, + "error": (result.error or "")[:500], + })) + + +if __name__ == "__main__": + try: + main() + except SystemExit: + raise + except Exception as exc: # infra/import failure — fail loud, do not emit a fake verdict + print(f"clbench_codebase_judge: {type(exc).__name__}: {exc}", file=sys.stderr) + sys.exit(2) diff --git a/bench/src/clbench-codebase-gate.mts b/bench/src/clbench-codebase-gate.mts new file mode 100644 index 0000000..ad8f475 --- /dev/null +++ b/bench/src/clbench-codebase-gate.mts @@ -0,0 +1,357 @@ +/** + * CL-Bench (Continual) Codebase Adaptation — verifier-grounded selector gate. + * + * Of CL-Bench's six domains, codebase_adaptation is the ONLY deployable checker: + * its scorer applies the instance's provided `test_patch` and runs pytest in the + * instance's Docker image, keying off the exit code (the rest grade against realized + * outcomes the agent never has = oracles). That makes it the clean analogue of the + * HumanEval gate — an INDEPENDENT deployable check (tests ≠ the answer), unlike the + * CL-bench Context gate where the rubric judge IS the metric. + * + * Each instance is SWE-bench format (repo @ base_commit + a GitHub issue). The worker + * is a real sandbox rollout (opencode clones the repo, fixes the source, writes a diff + * to a file we read back — a large diff truncates in the chat stream). We score each + * candidate patch with CL-Bench's OWN scorer via `clbench_codebase_judge.py` (run in + * CL-Bench's venv) — verified to self-check (gold patch passes, empty fails). 
Two + * paired arms over the same instances: + * random@K — K identical-issue rollouts (the compute control) + * diverse@K — K rollouts, the i-th with a strategy lens prepended (composeStrategies) + * verifierGroundedSelect picks by pytest-pass; we report blind / random@k / diverse@k / + * oracle@k with paired-bootstrap CIs, and write a corpus RunRecord/task the existing + * `corpus-replay --selector=verifier` + `corpus-report` consume unchanged. Fail loud: + * an infra-errored rollout is excluded (infraError), never scored 0. + * + * dotenvx run -f … -- env N=4 K=3 WORKER_MODEL=deepseek-chat CONCURRENCY=2 \ + * CLBENCH_DIR=/tmp/clbench-continual CORPUS=/tmp/clbench-codebase.jsonl \ + * tsx src/clbench-codebase-gate.mts + */ + +import { execFile } from 'node:child_process' +import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { promisify } from 'node:util' +import { acquireSandbox } from '@tangle-network/agent-runtime/loops' +import { Sandbox } from '@tangle-network/sandbox' +import { composeStrategies } from './directives' +import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus' +import { verifierGroundedSelect } from './selector' + +const execFileAsync = promisify(execFile) +const PATCH_PATH = '/tmp/solution.patch' +const randomSuffix = () => Math.random().toString(36).slice(2, 10) + +function must(name: string): string { + const v = process.env[name] + if (!v) throw new Error(`env ${name} is required`) + return v +} + +interface Instance { + instanceId: string + repo: string + baseCommit: string + problemStatement: string +} + +/** Load CL-Bench codebase_adaptation instances (SWE-bench format) from the cloned + * repo's final-dataset.jsonl. Fail loud on a malformed/short record. 
/**
 * Load codebase_adaptation instances from `<clbenchDir>/data/codebase_adaptation/final-dataset.jsonl`
 * and return the window `[offset, offset + limit)`.
 *
 * @param clbenchDir root of the cloned CL-Bench repository
 * @param limit      number of instances wanted (positive integer)
 * @param offset     index of the first instance (non-negative integer)
 * @throws Error on an invalid `limit`/`offset`, an unparsable or incomplete record,
 *         an empty dataset, or an offset past the end. A NaN or negative offset
 *         would otherwise slice to an empty or misplaced window without any error.
 */
function loadInstances(clbenchDir: string, limit: number, offset: number): Instance[] {
  if (!Number.isInteger(limit) || limit < 1) throw new Error(`loadInstances: limit must be a positive integer, got ${limit}`)
  if (!Number.isInteger(offset) || offset < 0) throw new Error(`loadInstances: offset must be a non-negative integer, got ${offset}`)
  const path = join(clbenchDir, 'data/codebase_adaptation/final-dataset.jsonl')
  const out: Instance[] = []
  for (const line of readFileSync(path, 'utf8').split('\n')) {
    if (line.trim() === '') continue
    let d: { instance_id?: unknown; repo?: unknown; base_commit?: unknown; problem_statement?: unknown } | null
    try {
      d = JSON.parse(line)
    } catch {
      // Report the offending record instead of a bare SyntaxError with no context.
      throw new Error(`malformed codebase_adaptation record: ${line.slice(0, 120)}`)
    }
    const isText = (v: unknown): v is string => typeof v === 'string' && v !== ''
    if (d === null || typeof d !== 'object' || !isText(d.instance_id) || !isText(d.repo) || !isText(d.base_commit) || !isText(d.problem_statement)) {
      throw new Error(`malformed codebase_adaptation record: ${line.slice(0, 120)}`)
    }
    out.push({ instanceId: d.instance_id, repo: d.repo, baseCommit: d.base_commit, problemStatement: d.problem_statement })
  }
  if (out.length === 0) throw new Error('codebase_adaptation parsed to 0 instances')
  if (offset >= out.length) throw new Error(`OFFSET ${offset} >= dataset size ${out.length}`)
  return out.slice(offset, offset + limit)
}

/**
 * Build the rollout prompt for one instance: clone + checkout, the issue text, and
 * the exact command that writes the patch to `PATCH_PATH`.
 *
 * @param inst the instance to solve
 * @param lens optional strategy lens placed before the instructions; omitted when
 *             undefined or empty
 */
function rolloutPrompt(inst: Instance, lens: string | undefined): string {
  // The '' entries are deliberate blank-line separators around the issue text, so
  // only the optional lens is conditional — nothing is filtered out afterwards.
  const sections = [
    `Clone https://github.com/${inst.repo} into /work, then \`cd /work && git checkout ${inst.baseCommit}\`.`,
    '',
    'Resolve this GitHub issue by editing the SOURCE only (never the tests — the evaluation re-runs its own hidden tests on a fresh clone):',
    '',
    inst.problemStatement,
    '',
    'Work iteratively: install the package editable (`pip install -e .`), reproduce the issue, implement the fix, and re-run the existing tests until they pass.',
    `When done, from /work run EXACTLY:`,
    ` git add -A && git diff --cached -- . ':(exclude)*/test*' > ${PATCH_PATH}`,
    `Then stop. The patch file is the only deliverable — do NOT paste the diff in your reply.`,
  ]
  return (lens ? [`${lens}\n`, ...sections] : sections).join('\n')
}

/** Endpoints, credentials and limits shared by every sandbox rollout. */
interface ShotCfg {
  sandboxBaseUrl: string
  routerBaseUrl: string
  routerKey: string
  model: string
  timeoutMs: number
}

interface Shot {
  patch: string
  /** rollout completed and produced a (possibly empty) patch; false ⇒ infra error (excluded). */
  ran: boolean
  detail?: string
}
/**
 * Score one candidate patch by running the bridge script
 * `scripts/clbench_codebase_judge.py` with `<clbenchDir>/.venv/bin/python`, using
 * `clbenchDir` as the working directory.
 *
 * @param inst       the instance the patch targets
 * @param patch      candidate unified diff
 * @param clbenchDir root of the CL-Bench checkout
 * @returns 1 on pass, 0 on fail (an empty patch is a fail without running anything),
 *          or null on any infra/judge failure so the caller can exclude the shot.
 *          A last output line that carries no boolean `success` counts as an infra
 *          failure, never as a failed test.
 */
async function judgePatch(inst: Instance, patch: string, clbenchDir: string): Promise<0 | 1 | null> {
  if (patch.trim() === '') return 0 // empty patch is a legitimate fail, not an infra error
  const dir = mkdtempSync(join(tmpdir(), 'clbench-cb-'))
  try {
    // Written inside the try so a failed write is reported as an infra error and
    // the temp directory is still removed.
    const patchFile = join(dir, 'candidate.patch')
    writeFileSync(patchFile, patch)
    const { stdout } = await execFileAsync(
      join(clbenchDir, '.venv/bin/python'),
      [
        join(process.cwd(), 'scripts/clbench_codebase_judge.py'),
        '--dataset',
        join(clbenchDir, 'data/codebase_adaptation/final-dataset.jsonl'),
        '--instance-id',
        inst.instanceId,
        '--patch-file',
        patchFile,
      ],
      { cwd: clbenchDir, maxBuffer: 8 * 1024 * 1024, timeout: 600_000 },
    )
    // The verdict is the LAST stdout line; anything printed before it is ignored.
    const lines = stdout.trim().split('\n')
    const last = lines[lines.length - 1] ?? ''
    const verdict: unknown = JSON.parse(last)
    if (typeof verdict !== 'object' || verdict === null || typeof (verdict as { success?: unknown }).success !== 'boolean') {
      throw new Error(`judge emitted no verdict: ${last.slice(0, 120)}`)
    }
    return (verdict as { success: boolean }).success ? 1 : 0
  } catch (err) {
    console.error(` judge infra error ${inst.instanceId}: ${(err instanceof Error ? err.message : String(err)).slice(0, 160)}`)
    return null
  } finally {
    rmSync(dir, { recursive: true, force: true })
  }
}

/**
 * Map `fn` over `items` with at most `limit` calls in flight, preserving input order
 * in the result.
 *
 * @param items work items
 * @param limit maximum concurrency; values below 1 are treated as 1
 * @param fn    async worker receiving the item and its index
 * @returns results aligned with `items`
 * @throws Error (as a rejection) when `limit` is NaN — an unparsable concurrency
 *         setting would otherwise start zero workers and return only empty slots
 */
async function pool<T, R>(items: T[], limit: number, fn: (item: T, idx: number) => Promise<R>): Promise<R[]> {
  if (Number.isNaN(limit)) throw new Error('pool: limit must be a number, got NaN')
  const results = new Array<R>(items.length)
  let next = 0
  const worker = async (): Promise<void> => {
    for (;;) {
      // Claim the next index synchronously so two workers never take the same item.
      const idx = next
      next += 1
      if (idx >= items.length) return
      results[idx] = await fn(items[idx] as T, idx)
    }
  }
  const width = Math.max(1, Math.min(limit, items.length))
  await Promise.all(Array.from({ length: width }, () => worker()))
  return results
}

/** Deterministic seeded PRNG (mulberry32-style mixing) returning floats in [0, 1). */
function makeRng(seed: number): () => number {
  let s = seed | 0
  return () => {
    s = (s + 0x6d2b79f5) | 0
    let t = Math.imul(s ^ (s >>> 15), 1 | s)
    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296
  }
}

interface PairedLift {
  /** mean of the per-task deltas (treatment − baseline). */
  point: number
  /** lower bound of the 95% bootstrap interval. */
  low: number
  /** upper bound of the 95% bootstrap interval. */
  high: number
  /** number of paired tasks. */
  pairs: number
  /** number of tasks whose two arms differ. */
  discordant: number
}
Number.NaN, pairs: n, discordant } +} + +const pct = (x: number) => `${(x * 100).toFixed(1)}%` +const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp` + +async function main(): Promise { + const n = Number(process.env.N ?? 4) + const k = Number(process.env.K ?? 3) + const offset = Number(process.env.OFFSET ?? 0) + const model = process.env.WORKER_MODEL ?? 'deepseek-chat' + const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1' + const routerKey = must('TANGLE_API_KEY') + const sandboxBaseUrl = process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools' + const clbenchDir = process.env.CLBENCH_DIR ?? '/tmp/clbench-continual' + const rolloutConc = Number(process.env.CONCURRENCY ?? 3) + const judgeConc = Number(process.env.JUDGE_CONCURRENCY ?? 2) + const timeoutMs = process.env.SHOT_TIMEOUT_MS ? Number(process.env.SHOT_TIMEOUT_MS) : 900_000 + const corpusPath = process.env.CORPUS ?? '/tmp/clbench-codebase.jsonl' + if (!Number.isInteger(n) || n < 1) throw new Error(`N must be a positive integer, got ${process.env.N}`) + if (!Number.isInteger(k) || k < 1) throw new Error(`K must be a positive integer, got ${process.env.K}`) + + const cfg: ShotCfg = { sandboxBaseUrl, routerBaseUrl, routerKey, model, timeoutMs } + console.log(`=== CL-Bench Codebase Adaptation selector gate · N=${n} K=${k} offset=${offset} model=${model} ===`) + console.log(` sandbox=${sandboxBaseUrl} judge=CL-Bench pytest-in-Docker (deployable) clbench=${clbenchDir}`) + + const instances = loadInstances(clbenchDir, n, offset) + console.log(`loaded ${instances.length} instance(s): ${instances.map((i) => i.instanceId).join(', ')}`) + + type Unit = { instIdx: number; arm: 'random' | 'diverse'; shot: number; lens: string | undefined } + const units: Unit[] = [] + for (let ii = 0; ii < instances.length; ii += 1) { + const lenses = composeStrategies('', k) // lens prefixes only ('' base ⇒ "\n\n") + for (let s = 0; s < k; s += 1) { + units.push({ instIdx: ii, arm: 
'random', shot: s, lens: undefined }) + units.push({ instIdx: ii, arm: 'diverse', shot: s, lens: (lenses[s] as string).trim() }) + } + } + console.log(`\n▶ phase 1: ${units.length} rollouts (${instances.length}×${k}×2 arms) via sandbox, conc=${rolloutConc}`) + const shots = await pool(units, rolloutConc, async (u) => { + const inst = instances[u.instIdx] as Instance + const s = await runRollout(inst, u.lens, cfg) + console.log(` rollout ${inst.instanceId} ${u.arm}#${u.shot}: ${s.ran ? `patch ${s.patch.length}B` : `INFRA (${s.detail})`}`) + return s + }) + + console.log(`\n▶ phase 2: judging ${shots.length} patches with the deployable checker, conc=${judgeConc}`) + const passes = await pool(units, judgeConc, async (u, i) => { + const shot = shots[i] as Shot + if (!shot.ran) return null // infra error ⇒ excluded + const inst = instances[u.instIdx] as Instance + const p = await judgePatch(inst, shot.patch, clbenchDir) + console.log(` judge ${inst.instanceId} ${u.arm}#${u.shot}: ${p === null ? 'INFRA' : p ? 'PASS' : 'fail'}`) + return p + }) + + // Regroup; an attempt with a null pass (infra) is dropped from its arm. + const byInst = instances.map(() => ({ random: [] as (number | null)[], diverse: [] as (number | null)[], rPatch: [] as string[], dPatch: [] as string[] })) + units.forEach((u, i) => { + const grp = byInst[u.instIdx] as { random: (number | null)[]; diverse: (number | null)[]; rPatch: string[]; dPatch: string[] } + const pass = passes[i] as number | null + const patch = (shots[i] as Shot).patch + if (u.arm === 'random') { grp.random[u.shot] = pass; grp.rPatch[u.shot] = patch } else { grp.diverse[u.shot] = pass; grp.dPatch[u.shot] = patch } + }) + + // Per-instance {0,1} outcomes; instances with no valid attempt in BOTH arms are excluded. 
+ const blind: number[] = [] + const randomAtK: number[] = [] + const diverseAtK: number[] = [] + const oracleAtK: number[] = [] + let excluded = 0 + for (const grp of byInst) { + const rValid = grp.random.filter((p): p is number => p !== null && p !== undefined) + const dValid = grp.diverse.filter((p): p is number => p !== null && p !== undefined) + if (rValid.length === 0 || dValid.length === 0) { excluded += 1; continue } + blind.push(rValid[0] as number) + randomAtK.push(rValid[verifierGroundedSelect(rValid)] as number) + diverseAtK.push(dValid[verifierGroundedSelect(dValid)] as number) + oracleAtK.push(dValid.some((p) => p > 0) ? 1 : 0) + } + const rate = (xs: number[]) => (xs.length === 0 ? 0 : xs.reduce((s, x) => s + x, 0) / xs.length) + + console.log(`\n${'='.repeat(78)}`) + console.log(`RESULTS · CL-Bench Codebase Adaptation · n=${blind.length} scored (${excluded} excluded) · k=${k} · model=${model}`) + console.log('='.repeat(78)) + console.log(` blind pass@1 ${pct(rate(blind))}`) + console.log(` random@k (verifier-pick) ${pct(rate(randomAtK))}`) + console.log(` diverse@k (verifier-pick) ${pct(rate(diverseAtK))}`) + console.log(` oracle@k (diverse, any) ${pct(rate(oracleAtK))}`) + if (blind.length >= 2) { + const row = (label: string, l: PairedLift) => + console.log(` ${label.padEnd(34)} ${pp(l.point).padStart(7)} CI [${pp(l.low)}, ${pp(l.high)}] (paired ${l.pairs}, discordant ${l.discordant})`) + console.log(`\n PAIRED LIFTS (95% bootstrap CI):`) + row('random@k − blind (compute)', pairedLift(blind, randomAtK)) + row('diverse@k − random@k (verifier)', pairedLift(randomAtK, diverseAtK)) + row('diverse@k − blind (total)', pairedLift(blind, diverseAtK)) + } else { + console.log('\n (n<2 scored — paired CIs need ≥2; this is a plumbing smoke, not a signal)') + } + + // Corpus: random@k arm, ranked by the deployable pytest verifier. 
+ for (let ii = 0; ii < instances.length; ii += 1) { + const inst = instances[ii] as Instance + const grp = byInst[ii] as { random: (number | null)[]; rPatch: string[] } + const attempts: AttemptRecord[] = grp.random.map((p, round) => ({ + round, + prompt: 'clbench-codebase-rollout', + output: (grp.rPatch[round] ?? '').slice(0, 4000), + ...(p === null ? { error: 'infra' } : { valid: p > 0, score: p }), + eventCount: 1, + eventTypes: { 'sandbox.stream': 1 }, + traceTail: (grp.rPatch[round] ?? '').slice(-600), + })) + const validPasses = grp.random.filter((p): p is number => p !== null && p !== undefined) + const record: RunRecord = { + ts: new Date().toISOString(), + benchmark: 'clbench-codebase', + instanceId: inst.instanceId, + condition: `random@${k}`, + model, + blindResolved: validPasses[0] === 1, + resolved: validPasses.some((p) => p > 0), + attempts, + infraError: validPasses.length === 0, + } + await appendRunRecord(corpusPath, record) + } + console.log(`\n=== wrote ${instances.length} task(s) → ${corpusPath} · gate: tsx src/corpus-replay.mts ${corpusPath} --selector=verifier ===`) +} + +main().catch((err) => { + console.error(`clbench-codebase-gate: ${err instanceof Error ? err.message : String(err)}`) + process.exit(1) +}) From 2e6d9cb44f7f951399a574541435f0d43dd2fa34 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 6 Jun 2026 11:57:43 -0600 Subject: [PATCH 3/3] fix(bench): in-box opencode worker uses openai-compat provider so cheap router models resolve MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The in-box opencode agent validates `openai`/`anthropic` models against its registry, so `openai/deepseek-chat` failed with "Model not found" (an empty-patch every rollout). The `openai-compat` provider is the generic passthrough — it does NOT validate the model name — so router-served cheap models (deepseek-chat, moonshotai/kimi-k2.6, glm) resolve in-box. 
Default the worker to openai-compat (override via WORKER_PROVIDER); verified both deepseek and kimi write a file in a live sandbox rollout. --- bench/src/clbench-codebase-gate.mts | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/bench/src/clbench-codebase-gate.mts b/bench/src/clbench-codebase-gate.mts index ad8f475..eef179b 100644 --- a/bench/src/clbench-codebase-gate.mts +++ b/bench/src/clbench-codebase-gate.mts @@ -94,6 +94,11 @@ interface ShotCfg { routerBaseUrl: string routerKey: string model: string + /** in-box opencode provider. `openai-compat` (default) is the generic passthrough — + * it does NOT validate the model against opencode's registry, so router-served cheap + * models (deepseek-chat, moonshotai/kimi-k2.6, glm) work; `openai`/`anthropic` only + * accept their registered model names (e.g. gpt-4.1). */ + provider: string timeoutMs: number } @@ -114,7 +119,7 @@ async function runRollout(inst: Instance, lens: string | undefined, cfg: ShotCfg name: `clbench-cb-${inst.instanceId}-${randomSuffix()}`.replace(/[^a-zA-Z0-9_.-]/g, '_').slice(0, 60), environment: 'universal', env: { OPENAI_API_KEY: cfg.routerKey, OPENAI_BASE_URL: cfg.routerBaseUrl }, - backend: { type: 'opencode', model: { provider: 'openai', model: cfg.model, baseUrl: cfg.routerBaseUrl, apiKey: cfg.routerKey } }, + backend: { type: 'opencode', model: { provider: cfg.provider, model: cfg.model, baseUrl: cfg.routerBaseUrl, apiKey: cfg.routerKey } }, }) const signal = cfg.timeoutMs > 0 ? AbortSignal.timeout(cfg.timeoutMs) : undefined for await (const _ev of box.streamPrompt(rolloutPrompt(inst, lens), signal ? { signal } : {})) { @@ -232,6 +237,8 @@ async function main(): Promise { const k = Number(process.env.K ?? 3) const offset = Number(process.env.OFFSET ?? 0) const model = process.env.WORKER_MODEL ?? 'deepseek-chat' + // openai-compat = generic passthrough so cheap router models resolve in-box (see ShotCfg). + const provider = process.env.WORKER_PROVIDER ?? 
'openai-compat' const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1' const routerKey = must('TANGLE_API_KEY') const sandboxBaseUrl = process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools' @@ -243,8 +250,8 @@ async function main(): Promise { if (!Number.isInteger(n) || n < 1) throw new Error(`N must be a positive integer, got ${process.env.N}`) if (!Number.isInteger(k) || k < 1) throw new Error(`K must be a positive integer, got ${process.env.K}`) - const cfg: ShotCfg = { sandboxBaseUrl, routerBaseUrl, routerKey, model, timeoutMs } - console.log(`=== CL-Bench Codebase Adaptation selector gate · N=${n} K=${k} offset=${offset} model=${model} ===`) + const cfg: ShotCfg = { sandboxBaseUrl, routerBaseUrl, routerKey, model, provider, timeoutMs } + console.log(`=== CL-Bench Codebase Adaptation selector gate · N=${n} K=${k} offset=${offset} model=${provider}/${model} ===`) console.log(` sandbox=${sandboxBaseUrl} judge=CL-Bench pytest-in-Docker (deployable) clbench=${clbenchDir}`) const instances = loadInstances(clbenchDir, n, offset)