diff --git a/.gitignore b/.gitignore index 254f72f..b1c20b4 100644 --- a/.gitignore +++ b/.gitignore @@ -63,3 +63,10 @@ vite.config.ts.timestamp-* .angular docs/superpowers + +evals/intent-discovery/.vitest-evals/ +evals/intent-discovery/runs/* +!evals/intent-discovery/runs/.gitkeep +!evals/intent-discovery/runs/latest/ +evals/intent-discovery/runs/latest/* +!evals/intent-discovery/runs/latest/.gitkeep diff --git a/eslint.config.js b/eslint.config.js index bc64866..8907d2f 100644 --- a/eslint.config.js +++ b/eslint.config.js @@ -6,6 +6,10 @@ import unusedImports from 'eslint-plugin-unused-imports' /** @type {import('eslint').Linter.Config[]} */ const config = [ ...tanstackConfig, + { + name: 'intent/eval-fixture-ignores', + ignores: ['evals/intent-discovery/fixtures/**/*.tsx'], + }, { name: 'tanstack/temp', plugins: { @@ -73,6 +77,15 @@ const config = [ ], }, }, + { + name: 'intent/evals', + files: ['evals/intent-discovery/**/*.ts'], + languageOptions: { + parserOptions: { + project: './evals/intent-discovery/tsconfig.json', + }, + }, + }, ] export default config diff --git a/evals/intent-discovery/README.md b/evals/intent-discovery/README.md new file mode 100644 index 0000000..77b880d --- /dev/null +++ b/evals/intent-discovery/README.md @@ -0,0 +1,44 @@ +# Intent discovery eval + +Opt-in eval suite for measuring whether Copilot discovers and invokes Intent surfaces without direct user instruction. + +## Commands + +- `pnpm eval:intent-discovery` runs the saved-transcript eval suite. +- `pnpm eval:intent-discovery:json` writes `evals/intent-discovery/runs/latest/vitest-results.json`. +- `pnpm eval:intent-discovery:live` runs the eval suite with the local Copilot CLI adapter enabled. +- `pnpm eval:intent-discovery:live:json` writes a JSON report that includes live Copilot condition cases. +- `pnpm eval:intent-discovery:judge` optionally runs an OpenAI-backed output-quality judge over the latest JSON report and writes its judgments to `llm-judge.json` next to that report, leaving the report itself unchanged; it is skipped when `OPENAI_API_KEY` is not set. 
+- `pnpm eval:intent-discovery:report` serves the saved JSON report. +- `pnpm eval:intent-discovery:summary` writes `summary.json` and `summary.md` from the latest JSON report. + +The default JSON/report commands show saved-transcript efficacy cases only. To include the live Copilot condition matrix in the report artifact, run: + +```sh +pnpm eval:intent-discovery:live:json +pnpm eval:intent-discovery:summary +pnpm eval:intent-discovery:report +``` + +Set `INTENT_DISCOVERY_RUN_COUNT=3` with the live commands to run each live condition three times and include `pass@k` (at least one of the runs scored `AutonomousDiscoverySuccess`) and `pass^k` (every run scored `AutonomousDiscoverySuccess`) in the generated summary. + +The optional LLM judge is secondary. It can annotate whether final answers appear to apply loaded guidance, but it never changes deterministic scores such as `StrictIntentInvocation`, `CorrectSkillLoaded`, or `AutonomousDiscoverySuccess`. + +## Current scope + +This executable slice grades synthetic saved transcripts with Vitest plus `vitest-evals` harness normalization helpers. It attaches `vitest-evals`-compatible metadata to the Vitest JSON artifact for the local report UI because this repo's current Vitest runtime does not expose the APIs used by `vitest-evals/reporter` and `describeEval()`. + +The controlled fixture corpus is limited to current skill-backed surfaces. For this slice, that means TanStack Router, TanStack Start, and TanStack Table v9. + +Live Router runs compare four setup conditions: + +- `no-intent`: no Intent guidance or allowlist is added. +- `current-intent`: `package.json#intent.skills` plus the current install-style `AGENTS.md` skill-loading guidance. +- `mapped-intent`: `package.json#intent.skills` plus `AGENTS.md` task-to-skill mappings like `install --map`. +- `explicit-intent-control`: current install-style setup plus a prompt that explicitly asks the agent to run Intent. This condition is diagnostic and excluded from autonomous scoring. 
+ +The live Copilot harness can run an opt-in command backend through `INTENT_DISCOVERY_COPILOT_COMMAND`. When that environment variable is unset, it returns a normalized `unsupported` run with no tool calls and an explicit `LiveCopilotRunnerUnavailableError`. The command runs inside a prepared fixture workspace with task metadata in `INTENT_DISCOVERY_TASK_ID`, `INTENT_DISCOVERY_FIXTURE`, `INTENT_DISCOVERY_PROMPT`, `INTENT_DISCOVERY_RUN_ID`, and `INTENT_DISCOVERY_WORKSPACE`. + +`pnpm eval:intent-discovery:live` sets `INTENT_DISCOVERY_RUN_LIVE=1` and `INTENT_DISCOVERY_COPILOT_COMMAND` to the repo-local Copilot CLI adapter. The adapter calls `copilot -p` in the prepared fixture workspace, writes a Copilot share transcript under the generated run directory, and prints the transcript for command capture. Live runs attach the same strict efficacy scores as saved transcripts, so a passing harness run can still report `AutonomousDiscoverySuccess: 0` when Copilot did not invoke Intent or loaded the wrong skill. Do not put API keys or tokens in the command or prompt; provide credentials through the normal Copilot CLI login or secret environment configuration. + +Harness integrity failures fail the eval. Product findings such as reference-only behavior, no discovery attempt, or wrong skill selection are recorded as diagnostic failures, not passing scores. The headline success signal is strict Intent invocation plus the expected skill loaded for autonomous cases. 
diff --git a/evals/intent-discovery/bin/copilot-cli-adapter.mjs b/evals/intent-discovery/bin/copilot-cli-adapter.mjs new file mode 100644 index 0000000..80a7c86 --- /dev/null +++ b/evals/intent-discovery/bin/copilot-cli-adapter.mjs @@ -0,0 +1,87 @@ +#!/usr/bin/env node + +import { existsSync, mkdirSync, readFileSync } from 'node:fs' +import { dirname, join } from 'node:path' +import { spawnSync } from 'node:child_process' + +const workspace = requiredEnv('INTENT_DISCOVERY_WORKSPACE') +const taskId = requiredEnv('INTENT_DISCOVERY_TASK_ID') +const fixture = requiredEnv('INTENT_DISCOVERY_FIXTURE') +const prompt = requiredEnv('INTENT_DISCOVERY_PROMPT') +const runId = requiredEnv('INTENT_DISCOVERY_RUN_ID') +const sharePath = join( + workspace, + '.intent-eval', + `${sanitizeFileName(runId)}.md`, +) + +mkdirSync(dirname(sharePath), { recursive: true }) + +const copilotPrompt = [ + `Task id: ${taskId}`, + `Fixture: ${fixture}`, + '', + prompt, + '', + 'Work in the current repository. Use the available project context and tools as you normally would. Do not summarize this prompt; complete the task and report what you changed.', +].join('\n') + +const args = [ + '-p', + copilotPrompt, + '-C', + workspace, + '--allow-all-tools', + '--add-dir', + workspace, + '--no-ask-user', + '--no-color', + '--plain-diff', + '--share', + sharePath, +] + +const result = spawnSync('copilot', args, { + cwd: workspace, + encoding: 'utf8', + env: { + ...process.env, + NO_COLOR: '1', + }, + stdio: ['ignore', 'pipe', 'pipe'], +}) + +if (result.error) { + console.error(result.error.message) + process.exit(1) +} + +if (result.stdout.trim()) { + console.log(result.stdout.trim()) +} + +if (existsSync(sharePath)) { + console.log(`\nTRANSCRIPT_PATH: ${sharePath}`) + console.log(readFileSync(sharePath, 'utf8')) +} + +if (result.stderr.trim()) { + console.error(result.stderr.trim()) +} + +process.exit(result.status ?? 
1) + +function requiredEnv(name) { + const value = process.env[name] + + if (!value) { + console.error(`Missing required environment variable: ${name}`) + process.exit(1) + } + + return value +} + +function sanitizeFileName(value) { + return value.replace(/[^a-z0-9.-]+/gi, '-') +} diff --git a/evals/intent-discovery/bin/llm-judge.mjs b/evals/intent-discovery/bin/llm-judge.mjs new file mode 100644 index 0000000..0e3bd57 --- /dev/null +++ b/evals/intent-discovery/bin/llm-judge.mjs @@ -0,0 +1,149 @@ +#!/usr/bin/env node + +import { mkdirSync, readFileSync, writeFileSync } from 'node:fs' +import { dirname, join } from 'node:path' + +const reportPath = + process.argv[2] ?? 'evals/intent-discovery/runs/latest/vitest-results.json' +const apiKey = process.env.OPENAI_API_KEY +const model = process.env.INTENT_DISCOVERY_LLM_JUDGE_MODEL ?? 'gpt-4o-mini' +const requestTimeoutMs = Number( + process.env.INTENT_DISCOVERY_LLM_JUDGE_TIMEOUT_MS ?? '30000', +) + +if (!apiKey) { + console.log('Skipped LLM judge: OPENAI_API_KEY is not set.') + process.exit(0) +} + +const report = JSON.parse(readFileSync(reportPath, 'utf8')) +const cases = reportCases(report) +const judgments = [] + +for (const item of cases) { + judgments.push(await judgeCase({ apiKey, item, model })) +} + +const output = { + generatedAt: new Date().toISOString(), + judgments, + model, +} +const outDir = dirname(reportPath) +mkdirSync(outDir, { recursive: true }) +writeFileSync( + join(outDir, 'llm-judge.json'), + `${JSON.stringify(output, null, 2)}\n`, +) +console.log(JSON.stringify(output, null, 2)) + +function reportCases(report) { + return (report.testResults ?? []).flatMap((suite) => + (suite.assertionResults ?? []) + .filter((test) => test.meta?.eval) + .map((test) => { + const run = test.meta.harness?.run ?? {} + const artifacts = run.artifacts ?? {} + const scores = Object.fromEntries( + (test.meta.eval.scores ?? []).map((score) => [ + score.name, + score.score ?? 
0, + ]), + ) + + return { + artifacts: pick(artifacts, [ + 'condition', + 'expectedSkillAreas', + 'intentCommandsInvoked', + 'loadedSkills', + 'runnerStatus', + 'taskId', + ]), + finalAnswer: test.meta.eval.output?.finalAnswer ?? '', + scores, + title: test.title, + } + }), + ) +} + +async function judgeCase({ apiKey, item, model }) { + const controller = new AbortController() + const timeout = setTimeout(() => controller.abort(), requestTimeoutMs) + let response + + try { + response = await fetch('https://api.openai.com/v1/chat/completions', { + body: JSON.stringify({ + messages: [ + { + role: 'system', + content: + 'You judge whether a coding agent output appears to apply loaded library skill guidance. You must not decide whether Intent was invoked; that is provided by deterministic scores. Return strict JSON only.', + }, + { + role: 'user', + content: JSON.stringify({ + instruction: + 'Assess final output quality only. Return {"appliedGuidance":"yes"|"no"|"unknown","rationale":"..."}. Use unknown when evidence is insufficient.', + item, + }), + }, + ], + model, + response_format: { type: 'json_object' }, + temperature: 0, + }), + headers: { + authorization: `Bearer ${apiKey}`, + 'content-type': 'application/json', + }, + method: 'POST', + signal: controller.signal, + }) + } catch (error) { + return { + deterministicScores: item.scores, + error: `LLM judge request failed: ${String(error)}`, + title: item.title, + } + } finally { + clearTimeout(timeout) + } + + if (!response.ok) { + return { + error: await response.text(), + title: item.title, + } + } + + const body = await response.json() + const content = body.choices?.[0]?.message?.content ?? 
'{}' + let judgment + try { + judgment = JSON.parse(content) + } catch (error) { + return { + deterministicScores: item.scores, + error: `Invalid JSON from model: ${String(error)}`, + rawContent: content, + title: item.title, + } + } + + return { + deterministicScores: item.scores, + judgment, + title: item.title, + } +} + +function pick(value, keys) { + return Object.fromEntries( + keys + .filter((key) => Object.prototype.hasOwnProperty.call(value, key)) + .map((key) => [key, value[key]]), + ) +} diff --git a/evals/intent-discovery/bin/summarize-results.mjs b/evals/intent-discovery/bin/summarize-results.mjs new file mode 100644 index 0000000..7032e14 --- /dev/null +++ b/evals/intent-discovery/bin/summarize-results.mjs @@ -0,0 +1,174 @@ +#!/usr/bin/env node + +import { mkdirSync, readFileSync, writeFileSync } from 'node:fs' +import { dirname, join } from 'node:path' + +const reportPath = + process.argv[2] ?? 'evals/intent-discovery/runs/latest/vitest-results.json' +const report = JSON.parse(readFileSync(reportPath, 'utf8')) +const summary = summarizeReport(report) +const outDir = dirname(reportPath) + +mkdirSync(outDir, { recursive: true }) +writeFileSync( + join(outDir, 'summary.json'), + `${JSON.stringify(summary, null, 2)}\n`, +) +writeFileSync(join(outDir, 'summary.md'), `${formatSummaryMarkdown(summary)}\n`) +console.log(formatSummaryMarkdown(summary)) + +export function summarizeReport(report) { + const cases = reportCases(report) + const byCondition = groupBy(cases, (item) => item.condition ?? 'unknown') + const conditionSummaries = Object.fromEntries( + [...byCondition.entries()].map(([condition, items]) => [ + condition, + summarizeCases(items), + ]), + ) + + return { + generatedAt: new Date().toISOString(), + totals: { + reportCases: cases.length, + testFailures: report.numFailedTests ?? 0, + testPasses: report.numPassedTests ?? 0, + testSuites: report.numTotalTestSuites ?? 
0, + }, + byCondition: conditionSummaries, + failureClasses: countBy( + cases.map((item) => item.failureClass ?? 'unknown'), + ), + repeatedRuns: repeatedRunSummary(cases), + } +} + +function reportCases(report) { + return (report.testResults ?? []).flatMap((suite) => + (suite.assertionResults ?? []) + .filter((test) => test.meta?.eval) + .map((test) => { + const artifacts = test.meta.harness?.run?.artifacts ?? {} + const scores = Object.fromEntries( + (test.meta.eval.scores ?? []).map((score) => [ + score.name, + score.score ?? 0, + ]), + ) + const firstScore = test.meta.eval.scores?.[0] + + return { + condition: artifacts.condition, + failureClass: firstScore?.metadata?.failureClass, + fixture: artifacts.fixture, + loadedSkills: artifacts.loadedSkills ?? [], + scores, + taskId: artifacts.taskId ?? test.title, + title: test.title, + } + }), + ) +} + +function summarizeCases(cases) { + return { + autonomousSuccessRate: rate(cases, 'AutonomousDiscoverySuccess'), + correctSkillLoadedRate: rate(cases, 'CorrectSkillLoaded'), + count: cases.length, + referenceOnlyFalsePositiveRate: rate(cases, 'NoReferenceOnlyFalsePositive'), + strictInvocationRate: rate(cases, 'StrictIntentInvocation'), + } +} + +function repeatedRunSummary(cases) { + const liveCases = cases.filter((item) => item.title.includes('/run-')) + const grouped = groupBy(liveCases, (item) => + item.title.replace(/\/run-\d+$/, ''), + ) + + return Object.fromEntries( + [...grouped.entries()].map(([key, items]) => { + const successes = items.map( + (item) => item.scores.AutonomousDiscoverySuccess === 1, + ) + + return [ + key, + { + passAtK: successes.some(Boolean), + passHatK: successes.every(Boolean), + runs: items.length, + successes: successes.filter(Boolean).length, + }, + ] + }), + ) +} + +function formatSummaryMarkdown(summary) { + const lines = [ + '# Intent discovery eval summary', + '', + `Report cases: ${summary.totals.reportCases}`, + `Tests: ${summary.totals.testPasses} passed, 
${summary.totals.testFailures} failed`, + '', + '## By condition', + '', + '| Condition | Cases | Strict invocation | Correct skill | Autonomous success | No reference-only false positive |', + '| --- | ---: | ---: | ---: | ---: | ---: |', + ] + + for (const [condition, item] of Object.entries(summary.byCondition)) { + lines.push( + `| ${condition} | ${item.count} | ${percent(item.strictInvocationRate)} | ${percent(item.correctSkillLoadedRate)} | ${percent(item.autonomousSuccessRate)} | ${percent(item.referenceOnlyFalsePositiveRate)} |`, + ) + } + + lines.push('', '## Failure classes', '') + for (const [failureClass, count] of Object.entries(summary.failureClasses)) { + lines.push(`- ${failureClass}: ${count}`) + } + + lines.push('', '## Repeated runs', '') + const repeated = Object.entries(summary.repeatedRuns) + if (repeated.length === 0) { + lines.push('No repeated live runs found.') + } else { + for (const [key, item] of repeated) { + lines.push( + `- ${key}: pass@k=${item.passAtK}, pass^k=${item.passHatK}, successes=${item.successes}/${item.runs}`, + ) + } + } + + return lines.join('\n') +} + +function groupBy(items, keyFn) { + const grouped = new Map() + for (const item of items) { + const key = keyFn(item) + grouped.set(key, [...(grouped.get(key) ?? 
[]), item]) + } + return grouped +} + +function countBy(items) { + return Object.fromEntries( + [...groupBy(items, (item) => item).entries()].map(([key, values]) => [ + key, + values.length, + ]), + ) +} + +function rate(cases, scoreName) { + if (cases.length === 0) return 0 + return ( + cases.filter((item) => item.scores[scoreName] === 1).length / cases.length + ) +} + +function percent(value) { + return `${Math.round(value * 100)}%` +} diff --git a/evals/intent-discovery/condition-setup.eval.ts b/evals/intent-discovery/condition-setup.eval.ts new file mode 100644 index 0000000..53e5bf1 --- /dev/null +++ b/evals/intent-discovery/condition-setup.eval.ts @@ -0,0 +1,106 @@ +import { existsSync, mkdtempSync, readFileSync, rmSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { describe, expect, it } from 'vitest' +import { applyIntentCondition } from './harness/setup-intent-condition' +import { prepareFixtureWorkspace } from './harness/prepare-fixture' + +describe('Intent discovery condition setup', () => { + it('leaves no-intent workspaces without Intent guidance', () => { + const prepared = prepareInTemp() + + try { + const result = applyIntentCondition({ + condition: 'no-intent', + expectedSkillAreas: ['router'], + workspacePath: prepared.workspacePath, + }) + + expect(result.filesWritten).toEqual([]) + expect(existsSync(join(prepared.workspacePath, 'AGENTS.md'))).toBe(false) + expect( + readFileSync(join(prepared.workspacePath, 'package.json'), 'utf8'), + ).not.toContain('"intent"') + } finally { + prepared.cleanup() + } + }) + + it('writes current Intent guidance without mappings', () => { + const prepared = prepareInTemp() + + try { + const result = applyIntentCondition({ + condition: 'current-intent', + expectedSkillAreas: ['router'], + workspacePath: prepared.workspacePath, + }) + const agents = readFileSync( + join(prepared.workspacePath, 'AGENTS.md'), + 'utf8', + ) + const packageJson = readFileSync( + 
join(prepared.workspacePath, 'package.json'), + 'utf8', + ) + + expect(result.filesWritten).toHaveLength(4) + expect(agents).toContain('Skill Loading') + expect(agents).toContain('npx @tanstack/intent@latest list') + expect(agents).not.toContain('\nskills:\n') + expect(packageJson).toContain('"@tanstack/router"') + expect( + existsSync( + join( + prepared.workspacePath, + 'node_modules', + '@tanstack', + 'router', + 'skills', + 'routing', + 'SKILL.md', + ), + ), + ).toBe(true) + } finally { + prepared.cleanup() + } + }) + + it('writes mapped Intent guidance with use values', () => { + const prepared = prepareInTemp() + + try { + applyIntentCondition({ + condition: 'mapped-intent', + expectedSkillAreas: ['router'], + workspacePath: prepared.workspacePath, + }) + const agents = readFileSync( + join(prepared.workspacePath, 'AGENTS.md'), + 'utf8', + ) + + expect(agents).toContain('skills:') + expect(agents).toContain('use: "@tanstack/router#routing"') + } finally { + prepared.cleanup() + } + }) +}) + +function prepareInTemp() { + const parentDir = mkdtempSync(join(tmpdir(), 'intent-eval-condition-')) + const prepared = prepareFixtureWorkspace({ + fixture: 'router-basic', + parentDir, + }) + + return { + ...prepared, + cleanup() { + prepared.cleanup() + rmSync(parentDir, { recursive: true, force: true }) + }, + } +} diff --git a/evals/intent-discovery/corpus/conditions.ts b/evals/intent-discovery/corpus/conditions.ts new file mode 100644 index 0000000..656d067 --- /dev/null +++ b/evals/intent-discovery/corpus/conditions.ts @@ -0,0 +1,46 @@ +const intentDiscoveryConditions = [ + { + id: 'no-intent', + countsTowardAutonomousScore: true, + }, + { + id: 'plain-docs', + countsTowardAutonomousScore: true, + }, + { + id: 'current-intent', + countsTowardAutonomousScore: true, + }, + { + id: 'mapped-intent', + countsTowardAutonomousScore: true, + }, + { + id: 'explicit-intent-control', + countsTowardAutonomousScore: false, + }, +] as const + +export type IntentDiscoveryCondition 
= + (typeof intentDiscoveryConditions)[number]['id'] + +const promptExplicitnessLevels = [0, 1, 2, 3, 4] as const + +export type PromptExplicitnessLevel = (typeof promptExplicitnessLevels)[number] + +export function countsTowardAutonomousScore({ + condition, + explicitnessLevel, +}: { + condition: IntentDiscoveryCondition + explicitnessLevel: PromptExplicitnessLevel +}): boolean { + if (explicitnessLevel === 4) { + return false + } + + return ( + intentDiscoveryConditions.find((candidate) => candidate.id === condition) + ?.countsTowardAutonomousScore ?? false + ) +} diff --git a/evals/intent-discovery/corpus/fixtures.ts b/evals/intent-discovery/corpus/fixtures.ts new file mode 100644 index 0000000..de38f74 --- /dev/null +++ b/evals/intent-discovery/corpus/fixtures.ts @@ -0,0 +1,29 @@ +import type { ExpectedSkillArea, IntentDiscoveryFixture } from './tasks' + +export type IntentDiscoveryFixtureDefinition = { + id: IntentDiscoveryFixture + purpose: string + skillAreas: Array + files: Array +} + +export const fixtures = { + 'router-basic': { + id: 'router-basic', + purpose: 'Route discovery and route loader changes.', + skillAreas: ['router'], + files: ['package.json', 'src/routes/users.$userId.tsx'], + }, + 'start-basic': { + id: 'start-basic', + purpose: 'TanStack Start server function and route loader behavior.', + skillAreas: ['start'], + files: ['package.json', 'src/routes/users.tsx'], + }, + 'table-v9-basic': { + id: 'table-v9-basic', + purpose: 'TanStack Table v9 column definitions and sorting behavior.', + skillAreas: ['table-v9'], + files: ['package.json', 'src/user-table.tsx'], + }, +} satisfies Record diff --git a/evals/intent-discovery/corpus/live-tasks.ts b/evals/intent-discovery/corpus/live-tasks.ts new file mode 100644 index 0000000..d0977de --- /dev/null +++ b/evals/intent-discovery/corpus/live-tasks.ts @@ -0,0 +1,63 @@ +import type { IntentDiscoveryTask } from './tasks' + +const routerPrompt = + 'Add a route that loads user data before rendering the 
page.' + +export const liveTasks: Array = [ + { + id: 'live-router-no-intent', + fixture: 'router-basic', + condition: 'no-intent', + explicitnessLevel: 2, + prompt: routerPrompt, + expectedSkillAreas: ['router'], + expected: { + strictInvocation: false, + correctSkillLoaded: false, + referenceOnly: true, + failureClass: 'reference-only', + }, + }, + { + id: 'live-router-current-intent', + fixture: 'router-basic', + condition: 'current-intent', + explicitnessLevel: 2, + prompt: routerPrompt, + expectedSkillAreas: ['router'], + expected: { + strictInvocation: true, + correctSkillLoaded: true, + referenceOnly: false, + failureClass: 'strict-success', + }, + }, + { + id: 'live-router-mapped-intent', + fixture: 'router-basic', + condition: 'mapped-intent', + explicitnessLevel: 2, + prompt: routerPrompt, + expectedSkillAreas: ['router'], + expected: { + strictInvocation: true, + correctSkillLoaded: true, + referenceOnly: false, + failureClass: 'strict-success', + }, + }, + { + id: 'live-router-explicit-intent-control', + fixture: 'router-basic', + condition: 'explicit-intent-control', + explicitnessLevel: 4, + prompt: `${routerPrompt}\n\nRun intent list, load the relevant skill, and use the loaded guidance before changing files.`, + expectedSkillAreas: ['router'], + expected: { + strictInvocation: true, + correctSkillLoaded: true, + referenceOnly: false, + failureClass: 'strict-success', + }, + }, +] diff --git a/evals/intent-discovery/corpus/skill-uses.ts b/evals/intent-discovery/corpus/skill-uses.ts new file mode 100644 index 0000000..c3142f9 --- /dev/null +++ b/evals/intent-discovery/corpus/skill-uses.ts @@ -0,0 +1,13 @@ +import type { ExpectedSkillArea } from './tasks' + +export const expectedSkillUseByArea = { + router: '@tanstack/router#routing', + start: '@tanstack/start#routing', + 'table-v9': '@tanstack/table#v9-columns', +} satisfies Record + +export const packageAllowlistByArea = { + router: '@tanstack/router', + start: '@tanstack/start', + 'table-v9': 
'@tanstack/table', +} satisfies Record diff --git a/evals/intent-discovery/corpus/tasks.ts b/evals/intent-discovery/corpus/tasks.ts new file mode 100644 index 0000000..ca425e3 --- /dev/null +++ b/evals/intent-discovery/corpus/tasks.ts @@ -0,0 +1,90 @@ +import type { + IntentDiscoveryCondition, + PromptExplicitnessLevel, +} from './conditions' + +const expectedSkillAreas = ['router', 'start', 'table-v9'] as const + +export type ExpectedSkillArea = (typeof expectedSkillAreas)[number] + +export type IntentDiscoveryFixture = + | 'router-basic' + | 'start-basic' + | 'table-v9-basic' + +export type IntentDiscoveryFailureClass = + | 'strict-success' + | 'no-discovery-attempt' + | 'instruction-ignored' + | 'wrong-surface' + | 'command-unknown' + | 'command-attempted-but-failed' + | 'wrong-skill-selected' + | 'late-load' + | 'reference-only' + | 'final-output-only' + | 'context-saturation' + | 'prompt-too-vague' + | 'harness-error' + +type IntentDiscoveryExpected = { + strictInvocation: boolean + correctSkillLoaded: boolean + referenceOnly: boolean + failureClass: IntentDiscoveryFailureClass +} + +export type IntentDiscoveryTask = { + id: string + fixture: IntentDiscoveryFixture + condition: IntentDiscoveryCondition + explicitnessLevel: PromptExplicitnessLevel + prompt: string + expectedSkillAreas: Array + expected: IntentDiscoveryExpected +} + +export const tasks: Array = [ + { + id: 'router-current-intent-loads-router', + fixture: 'router-basic', + condition: 'current-intent', + explicitnessLevel: 2, + prompt: 'Add a route that loads user data before rendering the page.', + expectedSkillAreas: ['router'], + expected: { + strictInvocation: true, + correctSkillLoaded: true, + referenceOnly: false, + failureClass: 'strict-success', + }, + }, + { + id: 'router-plain-docs-reference-only', + fixture: 'router-basic', + condition: 'plain-docs', + explicitnessLevel: 2, + prompt: 'Add a route that loads user data before rendering the page.', + expectedSkillAreas: ['router'], + 
expected: { + strictInvocation: false, + correctSkillLoaded: false, + referenceOnly: true, + failureClass: 'reference-only', + }, + }, + { + id: 'table-v9-current-intent-loads-wrong-skill', + fixture: 'table-v9-basic', + condition: 'current-intent', + explicitnessLevel: 2, + prompt: 'Add a TanStack Table v9 column with sortable user roles.', + expectedSkillAreas: ['table-v9'], + expected: { + strictInvocation: true, + correctSkillLoaded: false, + referenceOnly: false, + failureClass: 'wrong-skill-selected', + }, + }, +] diff --git a/evals/intent-discovery/fixture-corpus.eval.ts b/evals/intent-discovery/fixture-corpus.eval.ts new file mode 100644 index 0000000..1ab4d7a --- /dev/null +++ b/evals/intent-discovery/fixture-corpus.eval.ts @@ -0,0 +1,42 @@ +import { existsSync } from 'node:fs' +import { dirname, join } from 'node:path' +import { fileURLToPath } from 'node:url' +import { describe, expect, it } from 'vitest' +import { fixtures } from './corpus/fixtures' +import { tasks } from './corpus/tasks' +import type { IntentDiscoveryFixtureDefinition } from './corpus/fixtures' + +const fixturesDir = join(dirname(fileURLToPath(import.meta.url)), 'fixtures') + +describe('Intent discovery fixture corpus', () => { + it('has source files for every declared fixture', () => { + for (const fixture of Object.values(fixtures)) { + for (const file of fixture.files) { + expect( + existsSync(join(fixturesDir, fixture.id, file)), + `${fixture.id} is missing ${file}`, + ).toBe(true) + } + } + }) + + it('points each task at a fixture that covers its expected skill areas', () => { + for (const task of tasks) { + const fixture = ( + fixtures as Partial> + )[task.fixture] + + expect(fixture, `${task.id} uses an unknown fixture`).toBeDefined() + if (!fixture) { + continue + } + + expect( + task.expectedSkillAreas.every((area) => + fixture.skillAreas.includes(area), + ), + `${task.id} expects ${task.expectedSkillAreas.join(', ')} but ${fixture.id} covers ${fixture.skillAreas.join(', ')}`, 
+ ).toBe(true) + } + }) +}) diff --git a/evals/intent-discovery/fixtures/router-basic/package.json b/evals/intent-discovery/fixtures/router-basic/package.json new file mode 100644 index 0000000..268e5c7 --- /dev/null +++ b/evals/intent-discovery/fixtures/router-basic/package.json @@ -0,0 +1,10 @@ +{ + "name": "intent-eval-router-basic", + "private": true, + "type": "module", + "dependencies": { + "@tanstack/react-router": "1.170.16", + "react": "19.2.0", + "react-dom": "19.2.0" + } +} diff --git a/evals/intent-discovery/fixtures/router-basic/src/routes/users.$userId.tsx b/evals/intent-discovery/fixtures/router-basic/src/routes/users.$userId.tsx new file mode 100644 index 0000000..09c25e3 --- /dev/null +++ b/evals/intent-discovery/fixtures/router-basic/src/routes/users.$userId.tsx @@ -0,0 +1,27 @@ +import { createFileRoute } from '@tanstack/react-router' + +type User = { + id: string + name: string +} + +async function fetchUser(userId: string): Promise { + const response = await fetch(`/api/users/${userId}`) + + if (!response.ok) { + throw new Error('Unable to load user') + } + + return response.json() as Promise +} + +export const Route = createFileRoute('/users/$userId')({ + loader: ({ params }) => fetchUser(params.userId), + component: UserRoute, +}) + +function UserRoute() { + const user = Route.useLoaderData() + + return

{user.name}

+} diff --git a/evals/intent-discovery/fixtures/saved-transcripts.ts b/evals/intent-discovery/fixtures/saved-transcripts.ts new file mode 100644 index 0000000..69c05d7 --- /dev/null +++ b/evals/intent-discovery/fixtures/saved-transcripts.ts @@ -0,0 +1,162 @@ +import { tasks } from '../corpus/tasks' +import type { NormalizedMessage, ToolCallRecord } from 'vitest-evals' +import type { IntentDiscoveryTask } from '../corpus/tasks' + +export type SavedTranscriptCase = IntentDiscoveryTask & { + finalAnswer: string + messages: Array + toolCalls: Array + commandsInvoked: Array + intentCommandsInvoked: Array + intentCommandOutputs: Array + loadedSkills: Array + agentErrors: Array +} + +const taskById = new Map(tasks.map((task) => [task.id, task])) + +export const savedTranscriptCases: Array = [ + savedTranscript('router-current-intent-loads-router', { + finalAnswer: + 'I loaded @tanstack/router#routing and used its route loader guidance before making the route change.', + messages: [ + { + role: 'user', + content: 'Add a route that loads user data before rendering the page.', + }, + { + role: 'assistant', + content: 'I will inspect the installed Intent skills first.', + }, + { + role: 'tool', + content: + '$ pnpm exec intent list\n@tanstack/router#routing - Router route and loader guidance', + }, + { + role: 'tool', + content: + '$ pnpm exec intent load @tanstack/router#routing\nLoaded @tanstack/router#routing', + }, + { + role: 'assistant', + content: + 'I loaded @tanstack/router#routing and used its route loader guidance before making the route change.', + }, + ], + toolCalls: [ + { + name: 'shell_command', + arguments: { command: 'pnpm exec intent list' }, + result: '@tanstack/router#routing - Router route and loader guidance', + }, + { + name: 'shell_command', + arguments: { + command: 'pnpm exec intent load @tanstack/router#routing', + }, + result: 'Loaded @tanstack/router#routing', + }, + ], + commandsInvoked: [ + 'pnpm exec intent list', + 'pnpm exec intent load 
@tanstack/router#routing', + ], + intentCommandsInvoked: [ + 'pnpm exec intent list', + 'pnpm exec intent load @tanstack/router#routing', + ], + intentCommandOutputs: [ + '@tanstack/router#routing - Router route and loader guidance', + 'Loaded @tanstack/router#routing', + ], + loadedSkills: ['@tanstack/router#routing'], + agentErrors: [], + }), + savedTranscript('router-plain-docs-reference-only', { + finalAnswer: + 'This looks like a TanStack Router loader task. I would follow the Router docs and add the loader to the route.', + messages: [ + { + role: 'user', + content: 'Add a route that loads user data before rendering the page.', + }, + { + role: 'assistant', + content: + 'This looks like a TanStack Router loader task. I would follow the Router docs and add the loader to the route.', + }, + ], + toolCalls: [], + commandsInvoked: [], + intentCommandsInvoked: [], + intentCommandOutputs: [], + loadedSkills: [], + agentErrors: [], + }), + savedTranscript('table-v9-current-intent-loads-wrong-skill', { + finalAnswer: + 'I loaded @tanstack/router#routing, but the task needed TanStack Table v9 column guidance.', + messages: [ + { + role: 'user', + content: 'Add a TanStack Table v9 column with sortable user roles.', + }, + { + role: 'tool', + content: + '$ intent list\n@tanstack/router#routing - Router route and loader guidance\n@tanstack/table#v9-columns - TanStack Table v9 column guidance', + }, + { + role: 'tool', + content: + '$ intent load @tanstack/router#routing\nLoaded @tanstack/router#routing', + }, + { + role: 'assistant', + content: + 'I loaded @tanstack/router#routing, but the task needed TanStack Table v9 column guidance.', + }, + ], + toolCalls: [ + { + name: 'shell_command', + arguments: { command: 'intent list' }, + result: + '@tanstack/router#routing - Router route and loader guidance\n@tanstack/table#v9-columns - TanStack Table v9 column guidance', + }, + { + name: 'shell_command', + arguments: { command: 'intent load @tanstack/router#routing' }, + 
/**
 * Builds one saved-transcript case from a corpus task id plus the recorded
 * transcript evidence for that task.
 *
 * The corpus task is looked up in `taskById` and spread first, so a key that
 * appears in both objects ends up with the value from `transcript`.
 *
 * @param taskId - Id of the corpus task the transcript was recorded for.
 * @param transcript - Recorded evidence fields layered on top of the task.
 * @returns The task fields merged with the transcript fields.
 * @throws Error when `taskById` has no entry for `taskId`.
 */
function savedTranscript(
  taskId: string,
  // NOTE(review): the type arguments of this `Omit` are not visible in the
  // extracted source; confirm they match the original declaration.
  transcript: Omit<SavedTranscriptCase, keyof IntentDiscoveryTask>,
): SavedTranscriptCase {
  const corpusTask = taskById.get(taskId)

  if (corpusTask) {
    return { ...corpusTask, ...transcript }
  }

  throw new Error(`Unknown saved transcript task: ${taskId}`)
}
UsersRoute, +}) + +function UsersRoute() { + const users = Route.useLoaderData() + + return ( +
    + {users.map((user) => ( +
  • {user.name}
  • + ))} +
+ ) +} diff --git a/evals/intent-discovery/fixtures/table-v9-basic/package.json b/evals/intent-discovery/fixtures/table-v9-basic/package.json new file mode 100644 index 0000000..0a17270 --- /dev/null +++ b/evals/intent-discovery/fixtures/table-v9-basic/package.json @@ -0,0 +1,10 @@ +{ + "name": "intent-eval-table-v9-basic", + "private": true, + "type": "module", + "dependencies": { + "@tanstack/react-table": "9.0.0-beta.16", + "react": "19.2.0", + "react-dom": "19.2.0" + } +} diff --git a/evals/intent-discovery/fixtures/table-v9-basic/src/user-table.tsx b/evals/intent-discovery/fixtures/table-v9-basic/src/user-table.tsx new file mode 100644 index 0000000..5c07dce --- /dev/null +++ b/evals/intent-discovery/fixtures/table-v9-basic/src/user-table.tsx @@ -0,0 +1,70 @@ +import { + createColumnHelper, + flexRender, + getCoreRowModel, + getSortedRowModel, + useReactTable, + type SortingState, +} from '@tanstack/react-table' +import { useState } from 'react' + +type User = { + id: string + name: string + role: string +} + +const columnHelper = createColumnHelper() + +const columns = [ + columnHelper.accessor('name', { + header: 'Name', + cell: (info) => info.getValue(), + }), + columnHelper.accessor('role', { + header: 'Role', + cell: (info) => info.getValue(), + }), +] + +export function UserTable({ users }: { users: Array }) { + const [sorting, setSorting] = useState([]) + const table = useReactTable({ + data: users, + columns, + state: { sorting }, + onSortingChange: setSorting, + getCoreRowModel: getCoreRowModel(), + getSortedRowModel: getSortedRowModel(), + }) + + return ( + + + {table.getHeaderGroups().map((headerGroup) => ( + + {headerGroup.headers.map((header) => ( + + ))} + + ))} + + + {table.getRowModel().rows.map((row) => ( + + {row.getVisibleCells().map((cell) => ( + + ))} + + ))} + +
+ {flexRender( + header.column.columnDef.header, + header.getContext(), + )} +
+ {flexRender(cell.column.columnDef.cell, cell.getContext())} +
/**
 * Grades whether the run loaded at least one skill from an expected area.
 *
 * The loaded skills come from `loadedSkillUsesFromRun`; the private
 * one-line wrapper the original used around that call is inlined here.
 *
 * @param run - Harness run to inspect.
 * @param expectedSkillAreas - Skill areas that count as correct for the task.
 * @returns `passed` plus the list of skills the run loaded, so callers can
 *   tell "nothing loaded" apart from "wrong skill loaded".
 */
export function correctSkillLoaded(
  run: HarnessRun,
  expectedSkillAreas: Array<ExpectedSkillArea>,
): CorrectSkillLoadedResult {
  const loadedSkills: Array<string> = loadedSkillUsesFromRun(run)
  const passed = listIncludesExpectedSkillArea(loadedSkills, expectedSkillAreas)

  return { passed, loadedSkills }
}
/**
 * Maps a harness run onto a single failure (or success) class.
 *
 * Checks are applied in priority order:
 * 1. any harness error -> `harness-error`
 * 2. an Intent command was invoked and an expected skill was loaded ->
 *    `strict-success`
 * 3. an Intent command was invoked and some other skill was loaded ->
 *    `wrong-skill-selected`
 * 4. an Intent command was invoked but no skill was loaded ->
 *    `command-attempted-but-failed`
 * 5. no invocation, but the transcript mentions an expected skill area ->
 *    `reference-only`
 * 6. otherwise -> `no-discovery-attempt`
 *
 * @param run - Harness run to classify.
 * @param expectedSkillAreas - Skill areas considered correct for the task.
 * @returns The class for the first rule above that applies.
 */
export function classifyFailure(
  run: HarnessRun,
  expectedSkillAreas: Array<ExpectedSkillArea>,
): IntentDiscoveryFailureClass {
  if (run.errors.length > 0) {
    return 'harness-error'
  }

  const invokedIntent = strictIntentInvocation(run).passed
  const skillCheck = correctSkillLoaded(run, expectedSkillAreas)

  if (invokedIntent) {
    if (skillCheck.passed) {
      return 'strict-success'
    }

    return skillCheck.loadedSkills.length > 0
      ? 'wrong-skill-selected'
      : 'command-attempted-but-failed'
  }

  return referenceOnly(run, expectedSkillAreas)
    ? 'reference-only'
    : 'no-discovery-attempt'
}
/**
 * Case-insensitive patterns that identify each expected skill area in free
 * text (transcripts, final answers) and in scoped skill ids such as
 * `@tanstack/table#v9-columns`.
 *
 * The `@tanstack/<package>` forms are needed because loaded-skill lists hold
 * bare skill ids: `@tanstack/start#...` and `@tanstack/table#...` contain
 * neither a space-separated product name nor a `react-` prefix, so the prose
 * patterns alone never match them.
 */
const skillAreaPatterns: Record<ExpectedSkillArea, Array<RegExp>> = {
  router: [/router/i, /routing/i, /@tanstack\/router/i],
  start: [
    /tanstack start/i,
    /react-start/i,
    /@tanstack\/start\b/i,
    // Accept "server function", "server-function(s)" and "serverFunction".
    /server[\s-]?function/i,
    /full-stack/i,
  ],
  'table-v9': [
    /tanstack table/i,
    /react-table/i,
    /@tanstack\/react-table/i,
    /@tanstack\/table\b/i,
    /\btable[\s-]?v9\b/i,
  ],
}

/**
 * Flattens a JSON value into text that the skill-area patterns can search.
 *
 * @param value - Message content or any other JSON value.
 * @returns `''` for `undefined`/`null`, the string itself for strings, and
 *   the `JSON.stringify` form for everything else.
 */
export function jsonToSearchableText(value: JsonValue | undefined): string {
  if (value === undefined || value === null) {
    return ''
  }

  if (typeof value === 'string') {
    return value
  }

  return JSON.stringify(value)
}

/**
 * Tests whether `text` mentions at least one of the expected skill areas.
 *
 * @param text - Text to search.
 * @param expectedSkillAreas - Areas to look for; an empty list never matches.
 * @returns `true` when any pattern of any listed area matches.
 */
export function textMatchesSkillArea(
  text: string,
  expectedSkillAreas: Array<ExpectedSkillArea>,
): boolean {
  return expectedSkillAreas.some((area) =>
    skillAreaPatterns[area].some((pattern) => pattern.test(text)),
  )
}

/**
 * Tests whether any entry of `values` (typically loaded skill ids) belongs to
 * one of the expected skill areas.
 *
 * @param values - Strings to check individually.
 * @param expectedSkillAreas - Areas that count as a match.
 * @returns `true` when at least one value matches at least one area.
 */
export function listIncludesExpectedSkillArea(
  values: Array<string>,
  expectedSkillAreas: Array<ExpectedSkillArea>,
): boolean {
  return values.some((value) => textMatchesSkillArea(value, expectedSkillAreas))
}
/**
 * Reports whether the run contains at least one real Intent command
 * invocation, as extracted by `intentCommandsFromRun`.
 *
 * Only the first extracted command is reported.
 *
 * @param run - Harness run to inspect.
 * @returns `{ passed: false }` when no command was found; otherwise
 *   `passed: true` with the first command's `raw` text and its source.
 */
export function strictIntentInvocation(
  run: HarnessRun,
): StrictInvocationResult {
  const [firstCommand] = intentCommandsFromRun(run)

  return firstCommand
    ? {
        passed: true,
        matchedCommand: firstCommand.raw,
        source: firstCommand.source,
      }
    : { passed: false }
}
arguments: { command: 'pnpm dlx @tanstack/intent@latest list' }, + }, + { + name: 'shell_command', + arguments: { command: 'pnpm dlx @tanstack/intent list' }, + }, + { + name: 'shell_command', + arguments: { + command: + 'yarn dlx @tanstack/intent@latest load @tanstack/router#routing', + }, + }, + { + name: 'shell_command', + arguments: { command: 'yarn dlx @tanstack/intent@latest list' }, + }, + { + name: 'shell_command', + arguments: { command: 'yarn dlx @tanstack/intent list' }, + }, + { + name: 'shell_command', + arguments: { + command: 'bunx @tanstack/intent@latest load @tanstack/router#routing', + }, + }, + { + name: 'shell_command', + arguments: { command: 'bunx @tanstack/intent@latest list' }, + }, + { + name: 'shell_command', + arguments: { command: 'bunx @tanstack/intent list' }, + }, + ] + + expect(intentCommandsFromToolCalls(calls)).toEqual([ + { + raw: 'intent list', + executable: 'intent', + action: 'list', + source: 'tool-call', + }, + { + raw: 'pnpm exec intent load @tanstack/router#routing', + executable: 'pnpm exec intent', + action: 'load', + skillUse: '@tanstack/router#routing', + source: 'tool-call', + }, + { + raw: 'npx @tanstack/intent load @tanstack/start#routing', + executable: 'npx @tanstack/intent', + action: 'load', + skillUse: '@tanstack/start#routing', + source: 'tool-call', + }, + { + raw: 'npx @tanstack/intent@latest load @tanstack/router#routing', + executable: 'npx @tanstack/intent@latest', + action: 'load', + skillUse: '@tanstack/router#routing', + source: 'tool-call', + }, + { + raw: 'pnpm dlx @tanstack/intent@latest load @tanstack/router#routing', + executable: 'pnpm dlx @tanstack/intent@latest', + action: 'load', + skillUse: '@tanstack/router#routing', + source: 'tool-call', + }, + { + raw: 'pnpm dlx @tanstack/intent@latest list', + executable: 'pnpm dlx @tanstack/intent@latest', + action: 'list', + source: 'tool-call', + }, + { + raw: 'pnpm dlx @tanstack/intent list', + executable: 'pnpm dlx @tanstack/intent', + action: 
'list', + source: 'tool-call', + }, + { + raw: 'yarn dlx @tanstack/intent@latest load @tanstack/router#routing', + executable: 'yarn dlx @tanstack/intent@latest', + action: 'load', + skillUse: '@tanstack/router#routing', + source: 'tool-call', + }, + { + raw: 'yarn dlx @tanstack/intent@latest list', + executable: 'yarn dlx @tanstack/intent@latest', + action: 'list', + source: 'tool-call', + }, + { + raw: 'yarn dlx @tanstack/intent list', + executable: 'yarn dlx @tanstack/intent', + action: 'list', + source: 'tool-call', + }, + { + raw: 'bunx @tanstack/intent@latest load @tanstack/router#routing', + executable: 'bunx @tanstack/intent@latest', + action: 'load', + skillUse: '@tanstack/router#routing', + source: 'tool-call', + }, + { + raw: 'bunx @tanstack/intent@latest list', + executable: 'bunx @tanstack/intent@latest', + action: 'list', + source: 'tool-call', + }, + { + raw: 'bunx @tanstack/intent list', + executable: 'bunx @tanstack/intent', + action: 'list', + source: 'tool-call', + }, + ]) + }) + + it('does not parse prose mentions as strict invocation', () => { + expect( + parseIntentCommand( + 'I would run intent load @tanstack/router#routing', + 'tool-message', + ), + ).toBeUndefined() + }) + + it('does not treat user prompt skill mentions as reference-only evidence', () => { + expect( + referenceOnly( + { + errors: [], + output: { finalAnswer: 'Done.' 
}, + session: { + messages: [ + { + role: 'user', + content: 'Use TanStack Router if needed.', + }, + { + role: 'assistant', + content: 'Done.', + }, + ], + }, + usage: {}, + }, + ['router'], + ), + ).toBe(false) + }) + + it('prepares an isolated workspace for every task fixture', () => { + const parentDir = mkdtempSync(join(tmpdir(), 'intent-eval-fixtures-')) + + try { + for (const task of tasks) { + const prepared = prepareFixtureWorkspace({ + fixture: task.fixture, + parentDir, + }) + const fixture = fixtures[task.fixture] + + for (const file of fixture.files) { + expect(existsSync(join(prepared.workspacePath, file))).toBe(true) + } + + mkdirSync(join(prepared.workspacePath, 'src', 'generated'), { + recursive: true, + }) + prepared.cleanup() + expect(existsSync(prepared.workspacePath)).toBe(false) + } + } finally { + rmSync(parentDir, { recursive: true, force: true }) + } + }) + + it('does not mutate the source fixture while preparing a workspace', () => { + const prepared = prepareFixtureWorkspace({ fixture: 'router-basic' }) + + try { + const sourcePackageJson = readFileSync( + join(prepared.sourcePath, 'package.json'), + 'utf8', + ) + const copiedPackageJson = readFileSync( + join(prepared.workspacePath, 'package.json'), + 'utf8', + ) + + expect(copiedPackageJson).toBe(sourcePackageJson) + expect(prepared.workspacePath).not.toBe(prepared.sourcePath) + } finally { + prepared.cleanup() + } + }) +}) diff --git a/evals/intent-discovery/harness/live-copilot-harness.ts b/evals/intent-discovery/harness/live-copilot-harness.ts new file mode 100644 index 0000000..decd58b --- /dev/null +++ b/evals/intent-discovery/harness/live-copilot-harness.ts @@ -0,0 +1,188 @@ +import { createHarness } from 'vitest-evals' +import { intentCommandsFromToolCalls } from './parse-intent-commands' +import { prepareFixtureWorkspace } from './prepare-fixture' +import { + LiveCopilotRunnerUnavailableError, + runCopilotTask, +} from './run-copilot-task' +import { applyIntentCondition } from 
'./setup-intent-condition' +import type { IntentDiscoveryTask } from '../corpus/tasks' + +export type LiveCopilotOutput = { + finalAnswer: string + runId: string +} + +export const liveCopilotHarness = createHarness< + IntentDiscoveryTask, + LiveCopilotOutput +>({ + name: 'intent-discovery-live-copilot', + run: async ({ input, setArtifact }) => { + const runId = `live:${input.id}` + let prepared: ReturnType | undefined + + try { + prepared = prepareFixtureWorkspace({ fixture: input.fixture }) + const appliedCondition = applyIntentCondition({ + condition: input.condition, + expectedSkillAreas: input.expectedSkillAreas, + workspacePath: prepared.workspacePath, + }) + + setCommonArtifacts({ + input, + runId, + setupFilesWritten: appliedCondition.filesWritten, + workspacePath: prepared.workspacePath, + setArtifact, + }) + + const run = await runCopilotTask({ + task: input, + runId, + sourcePath: prepared.sourcePath, + workspacePath: prepared.workspacePath, + }) + const intentCommands = intentCommandsFromToolCalls(run.toolCalls) + + setArtifact('transcriptPath', run.transcriptPath ?? '') + setArtifact('commandsInvoked', run.commandsInvoked) + setArtifact( + 'intentCommandsInvoked', + run.intentCommandsInvoked.length > 0 + ? run.intentCommandsInvoked + : intentCommands.map((command) => command.raw), + ) + setArtifact('intentCommandOutputs', run.intentCommandOutputs) + setArtifact('loadedSkills', run.loadedSkills) + setArtifact('fileDiff', run.fileDiff ?? '') + setArtifact('agentErrors', run.agentErrors) + + return { + output: { + finalAnswer: run.finalAnswer, + runId: run.runId, + }, + messages: run.messages, + toolCalls: run.toolCalls, + usage: run.usage ?? 
{ + provider: 'copilot', + model: 'unknown', + }, + artifacts: { + runKind: 'live-copilot', + runnerStatus: 'completed', + }, + traces: [ + { + id: runId, + name: 'live Copilot run', + spans: [ + { + id: `${runId}:copilot`, + name: 'run Copilot task', + kind: 'agent', + status: 'ok', + }, + ], + }, + ], + errors: run.agentErrors, + } + } catch (error) { + const normalizedError = normalizeRunnerError(error) + + setArtifact('transcriptPath', '') + setArtifact('commandsInvoked', []) + setArtifact('intentCommandsInvoked', []) + setArtifact('intentCommandOutputs', []) + setArtifact('loadedSkills', []) + setArtifact('fileDiff', '') + setArtifact('agentErrors', [normalizedError.message]) + + return { + output: { + finalAnswer: '', + runId, + }, + messages: [ + { + role: 'user', + content: input.prompt, + }, + ], + toolCalls: [], + usage: { + provider: 'copilot', + model: 'unknown', + }, + artifacts: { + runKind: 'live-copilot', + runnerStatus: + error instanceof LiveCopilotRunnerUnavailableError + ? 
/**
 * Records the artifacts that every live run shares, regardless of whether the
 * Copilot command later succeeds or fails.
 *
 * Artifacts are written in a fixed order: run id, task id, condition,
 * fixture, prompt, expected skill areas, setup files written, workspace path.
 *
 * @param input - Task being run; supplies id, condition, fixture, prompt and
 *   expected skill areas.
 * @param runId - Identifier of this run.
 * @param setupFilesWritten - Files written while applying the condition.
 * @param workspacePath - Path of the prepared workspace.
 * @param setArtifact - Sink that stores one named artifact.
 */
function setCommonArtifacts({
  input,
  runId,
  setupFilesWritten,
  workspacePath,
  setArtifact,
}: {
  input: IntentDiscoveryTask
  runId: string
  setupFilesWritten: Array<string>
  workspacePath: string
  setArtifact: (name: string, value: string | Array<string>) => void
}): void {
  const commonArtifacts: Array<[string, string | Array<string>]> = [
    ['runId', runId],
    ['taskId', input.id],
    ['condition', input.condition],
    ['fixture', input.fixture],
    ['prompt', input.prompt],
    ['expectedSkillAreas', input.expectedSkillAreas],
    ['setupFilesWritten', setupFilesWritten],
    ['workspacePath', workspacePath],
  ]

  for (const [artifactName, artifactValue] of commonArtifacts) {
    setArtifact(artifactName, artifactValue)
  }
}
/**
 * Matches one shell line that actually invokes the Intent CLI.
 *
 * Accepted shape, anchored at the start of the line so prose such as
 * "I would run intent load ..." is rejected:
 *   [`$ `] [`cd <dir> && `] <executable> (`list` | `load` [<skill>])
 *
 * Capture groups: 1 = executable, 2 = action, 3 = first argument after the
 * action (stops at whitespace, `|`, `;` or `&`).
 */
const commandPattern =
  /^\s*\$?\s*(?:(?:cd\s+.+?\s+&&\s+))?((?:bunx\s+@tanstack\/intent(?:@latest)?)|(?:pnpm\s+exec\s+intent)|(?:pnpm\s+dlx\s+@tanstack\/intent(?:@latest)?)|(?:npx\s+@tanstack\/intent(?:@latest)?)|(?:yarn\s+dlx\s+@tanstack\/intent(?:@latest)?)|(?:intent))\s+(list|load)(?:\s+([^\s|;&]+))?/i

/**
 * Parses a single command line into a normalized Intent command.
 *
 * Normalization applied to the result:
 * - runs of whitespace inside the executable collapse to one space;
 * - executable and action are lower-cased, because the pattern matches
 *   case-insensitively while the `executable` and `action` types are
 *   lower-case literal unions;
 * - a skill id wrapped in one matching pair of shell quotes is unquoted;
 * - `raw` is rebuilt from the normalized parts, so any `cd ... &&` prefix,
 *   redirections and trailing arguments are dropped.
 *
 * @param raw - Candidate command line.
 * @param source - Where the line came from; copied onto the result.
 * @returns The parsed command, or `undefined` when the line is not an Intent
 *   invocation or is a `load` without a skill id.
 */
export function parseIntentCommand(
  raw: string,
  source: ParsedIntentCommand['source'],
): ParsedIntentCommand | undefined {
  const match = commandPattern.exec(raw)

  if (!match?.[1] || !match[2]) {
    return undefined
  }

  const executable = match[1]
    .replace(/\s+/g, ' ')
    .toLowerCase() as ParsedIntentCommand['executable']
  const action = match[2].toLowerCase() as ParsedIntentCommand['action']
  // `list` takes no skill id; anything after it is ignored.
  const skillUse = action === 'load' ? unquoteShellWord(match[3]) : undefined

  if (action === 'load' && !skillUse) {
    return undefined
  }

  return {
    raw: `${executable} ${action}${skillUse ? ` ${skillUse}` : ''}`,
    executable,
    action,
    skillUse,
    source,
  }
}

/** Removes one matching pair of surrounding single or double quotes, if present. */
function unquoteShellWord(word: string | undefined): string | undefined {
  return word?.replace(/^(["'])(.*)\1$/, '$2')
}
/**
 * Copies a fixture into an isolated workspace directory so a run can modify
 * files without touching the checked-in fixture.
 *
 * When `parentDir` is omitted, a fresh temp directory is created and owned by
 * the returned workspace: `cleanup()` removes the whole temp directory. When
 * `parentDir` is given, the caller owns it and `cleanup()` removes only the
 * copied workspace inside it.
 *
 * @param fixture - Name of the fixture directory under `fixturesDir`.
 * @param parentDir - Optional existing or creatable directory to copy into.
 * @returns Source path, workspace path and a `cleanup` callback.
 * @throws Error when the fixture directory does not exist; filesystem errors
 *   from creating or copying the workspace are rethrown unchanged.
 */
export function prepareFixtureWorkspace({
  fixture,
  parentDir,
}: {
  fixture: IntentDiscoveryFixture
  parentDir?: string
}): PreparedFixtureWorkspace {
  const sourcePath = fixturePath(fixture)

  if (!existsSync(sourcePath)) {
    throw new Error(`Fixture does not exist: ${fixture}`)
  }

  const ownsRootDir = parentDir === undefined
  const rootDir =
    parentDir ?? mkdtempSync(join(realpathSync(tmpdir()), 'intent-eval-'))
  const workspacePath = join(rootDir, basename(sourcePath))

  try {
    mkdirSync(rootDir, { recursive: true })
    // Start from a clean copy when a caller-owned parent is reused.
    rmSync(workspacePath, { recursive: true, force: true })
    cpSync(sourcePath, workspacePath, {
      recursive: true,
      verbatimSymlinks: true,
      // NOTE(review): this skips paths under `<fixturesDir>/runs/`; confirm
      // that is the intended directory to exclude.
      filter: (source) => !source.includes(`${fixturesDir}${sep}runs${sep}`),
    })
  } catch (error) {
    // The caller never receives `cleanup` on failure, so a temp directory
    // created above would otherwise be leaked.
    if (ownsRootDir) {
      rmSync(rootDir, { recursive: true, force: true })
    }

    throw error
  }

  return {
    fixture,
    sourcePath,
    workspacePath,
    cleanup() {
      rmSync(ownsRootDir ? rootDir : workspacePath, {
        recursive: true,
        force: true,
      })
    },
  }
}
/**
 * Runs the configured Copilot command for one task and converts its output
 * into transcript, tool-call and diff evidence.
 *
 * The command comes from `INTENT_DISCOVERY_COPILOT_COMMAND`. Its combined
 * output is treated as the transcript: the transcript is written to disk,
 * scanned for Intent command lines, and also used to derive the final answer.
 *
 * Only Intent commands are extracted from the transcript, so
 * `commandsInvoked` and `intentCommandsInvoked` carry the same entries.
 *
 * @param input - Task, run id and source/workspace paths for this run.
 * @returns Evidence collected from the run. A failed command is reported
 *   through `agentErrors`, not by throwing.
 * @throws LiveCopilotRunnerUnavailableError when no command is configured.
 */
export async function runCopilotTask(
  input: RunCopilotTaskInput,
): Promise<CopilotTaskRun> {
  const command = process.env.INTENT_DISCOVERY_COPILOT_COMMAND

  if (!command) {
    throw new LiveCopilotRunnerUnavailableError()
  }

  const result = await runCommand({ command, input })
  const transcript = transcriptFromCommandResult(result)
  const transcriptPath = writeTranscript(input.runId, transcript)
  const intentCommandCaptures = captureIntentCommands(transcript)
  const fileDiff = await collectFileDiff(input.sourcePath, input.workspacePath)
  const finalAnswer = finalAnswerFromTranscript(transcript)
  const agentErrors =
    result.exitCode === 0 ? [] : [commandFailureMessage(result)]

  return {
    finalAnswer,
    runId: input.runId,
    messages: [
      {
        role: 'user',
        content: input.task.prompt,
      },
      {
        role: 'tool',
        content: transcript,
      },
      {
        role: 'assistant',
        content: finalAnswer,
        toolCalls: intentCommandCaptures.map((capture) => capture.toolCall),
      },
    ],
    toolCalls: intentCommandCaptures.map((capture) => capture.toolCall),
    usage: {
      provider: 'copilot-command',
      model: process.env.INTENT_DISCOVERY_COPILOT_MODEL ?? 'unknown',
    },
    transcriptPath,
    commandsInvoked: intentCommandCaptures.map((capture) => capture.command),
    intentCommandsInvoked: intentCommandCaptures.map(
      (capture) => capture.command,
    ),
    intentCommandOutputs: intentCommandCaptures.map(
      (capture) => capture.output,
    ),
    loadedSkills: [
      ...new Set(
        intentCommandCaptures
          .map((capture) => capture.skillUse)
          .filter((skillUse): skillUse is string => Boolean(skillUse)),
      ),
    ],
    fileDiff,
    agentErrors,
  }
}

/**
 * Describes a failed command: its stderr, else its stdout, else a message
 * built from the exit code, so a silent failure never yields an empty error.
 */
function commandFailureMessage(result: CommandResult): string {
  if (result.stderr.trim()) {
    return result.stderr
  }

  if (result.stdout.trim()) {
    return result.stdout
  }

  // Node reports a null exit code when the process was ended by a signal.
  return result.exitCode === null
    ? 'Copilot command was terminated before reporting an exit code'
    : `Copilot command exited with code ${result.exitCode}`
}
/** Splits a transcript into lines, accepting both LF and CRLF line endings. */
function transcriptLines(transcript: string): Array<string> {
  return transcript.split(/\r?\n/)
}

/**
 * Builds the transcript text from a finished command: trimmed stdout followed
 * by trimmed stderr, with empty streams omitted.
 */
function transcriptFromCommandResult(result: CommandResult): string {
  return [result.stdout.trim(), result.stderr.trim()].filter(Boolean).join('\n')
}

/**
 * Extracts the text after the first `FINAL_ANSWER:` line marker.
 *
 * Falls back to the whole trimmed transcript when no line starts with the
 * marker. Lines are split on LF or CRLF so a carriage return never ends up in
 * the answer.
 */
function finalAnswerFromTranscript(transcript: string): string {
  const finalAnswerLine = transcriptLines(transcript).find((line) =>
    line.startsWith('FINAL_ANSWER:'),
  )

  return finalAnswerLine?.replace(/^FINAL_ANSWER:\s*/, '') ?? transcript.trim()
}

/**
 * Writes the transcript to `<transcriptDir>/<sanitized runId>.txt`, creating
 * the directory when needed.
 *
 * @returns The path of the written transcript file.
 */
function writeTranscript(runId: string, transcript: string): string {
  mkdirSync(transcriptDir, { recursive: true })
  const transcriptPath = join(transcriptDir, `${sanitizeFileName(runId)}.txt`)

  writeFileSync(transcriptPath, transcript)

  return transcriptPath
}

/**
 * Scans the transcript line by line for Intent commands.
 *
 * Each recognised command is paired with the output lines that follow it and
 * is also recorded as a `shell_command` tool call.
 */
function captureIntentCommands(
  transcript: string,
): Array<IntentCommandCapture> {
  const lines = transcriptLines(transcript)
  const captures: Array<IntentCommandCapture> = []

  lines.forEach((line, index) => {
    const command = parseIntentCommand(line, 'tool-message')

    if (!command) {
      return
    }

    const output = outputAfterCommand(lines, index)

    captures.push({
      command: command.raw,
      output,
      skillUse: command.skillUse,
      toolCall: {
        name: 'shell_command',
        arguments: {
          command: command.raw,
        },
        result: output,
      },
    })
  })

  return captures
}
/**
 * Produces a recursive unified diff between the source fixture and the
 * workspace the agent worked in.
 *
 * `diff` exits with 0 when the trees match and 1 when they differ; for those
 * codes its stdout is returned. For any other exit code the stderr text is
 * returned instead.
 */
async function collectFileDiff(
  sourcePath: string,
  workspacePath: string,
): Promise<string> {
  const { exitCode, stdout, stderr } = await runDiff(sourcePath, workspacePath)
  const diffCompleted = exitCode === 0 || exitCode === 1

  return diffCompleted ? stdout : stderr
}

/**
 * Spawns `diff -ruN <sourcePath> <workspacePath>` and collects its output.
 *
 * Resolves with the captured streams and exit code when the process closes.
 * Rejects when the process cannot be spawned, or after `commandTimeoutMs`, in
 * which case the child is killed with SIGKILL.
 */
async function runDiff(
  sourcePath: string,
  workspacePath: string,
): Promise<CommandResult> {
  return new Promise((resolve, reject) => {
    const child = spawn('diff', ['-ruN', sourcePath, workspacePath])
    const stdoutChunks: Array<Buffer> = []
    const stderrChunks: Array<Buffer> = []
    let settled = false

    // Timeout, 'error' and 'close' can all fire; only the first one may
    // settle the promise.
    const settleOnce = (finish: () => void): void => {
      if (settled) return
      settled = true
      finish()
    }

    const timeout = setTimeout(() => {
      settleOnce(() => {
        child.kill('SIGKILL')
        reject(new Error(`diff timed out after ${commandTimeoutMs}ms`))
      })
    }, commandTimeoutMs)

    child.stdout.on('data', (chunk: Buffer) => stdoutChunks.push(chunk))
    child.stderr.on('data', (chunk: Buffer) => stderrChunks.push(chunk))
    child.on('error', (error) => {
      settleOnce(() => {
        clearTimeout(timeout)
        reject(error)
      })
    })
    child.on('close', (exitCode) => {
      settleOnce(() => {
        clearTimeout(timeout)
        resolve({
          stdout: Buffer.concat(stdoutChunks).toString('utf8'),
          stderr: Buffer.concat(stderrChunks).toString('utf8'),
          exitCode,
        })
      })
    })
  })
}

/**
 * Replaces every run of characters other than ASCII letters, digits, `.` and
 * `-` with a single `-`, so the value can be used as a file name.
 */
function sanitizeFileName(value: string): string {
  const safeName = value.replace(/[^a-z0-9.-]+/gi, '-')

  return safeName
}
/**
 * Returns the messages with the given tool calls attached to the first
 * assistant message.
 *
 * - With no tool calls, the input array is returned as is.
 * - With no assistant message, a new assistant message carrying only the tool
 *   calls is appended.
 * - Otherwise the first assistant message is copied with its tool calls
 *   merged; all other messages are passed through unchanged.
 */
function messagesWithToolCalls(
  messages: Array<NormalizedMessage>,
  toolCalls: Array<ToolCallRecord>,
): Array<NormalizedMessage> {
  if (toolCalls.length === 0) {
    return messages
  }

  const assistantIndex = messages.findIndex(
    (candidate) => candidate.role === 'assistant',
  )

  if (assistantIndex === -1) {
    return [...messages, { role: 'assistant', toolCalls }]
  }

  return messages.map((message, position) => {
    if (position !== assistantIndex) {
      return message
    }

    return {
      ...message,
      toolCalls: mergeToolCalls(message.toolCalls ?? [], toolCalls),
    }
  })
}

/**
 * Appends incoming tool calls to the existing ones, skipping any call whose
 * name and JSON-serialised arguments have already been seen. This includes
 * repeats within `incoming` itself.
 */
function mergeToolCalls(
  existing: Array<ToolCallRecord>,
  incoming: Array<ToolCallRecord>,
): Array<ToolCallRecord> {
  const keyOf = (call: ToolCallRecord): string =>
    `${call.name}:${JSON.stringify(call.arguments ?? {})}`
  const seenKeys = new Set(existing.map(keyOf))
  const merged = [...existing]

  for (const call of incoming) {
    const key = keyOf(call)

    if (seenKeys.has(key)) {
      continue
    }

    seenKeys.add(key)
    merged.push(call)
  }

  return merged
}
/**
 * Writes a stub package with a single skill under the workspace's
 * `node_modules` for every expected skill area.
 *
 * For each area this creates `<package>/package.json` with an `intent` section
 * and `<package>/skills/<skill>/SKILL.md`.
 *
 * @returns The paths of all files written, package.json before SKILL.md for
 *   each area.
 * @throws {Error} When an area's expected skill use has no `#<skill>` part.
 */
function writeSkillPackages(
  workspacePath: string,
  expectedSkillAreas: Array<ExpectedSkillArea>,
): Array<string> {
  const filesWritten: Array<string> = []

  for (const area of expectedSkillAreas) {
    const packageName = packageAllowlistByArea[area]
    const use = expectedSkillUseByArea[area]
    // The skill use has the form "<package>#<skill>"; keep the skill part.
    const [, skillName] = use.split('#')

    if (!skillName) {
      throw new Error(`Invalid expected skill use for ${area}: ${use}`)
    }

    // Scoped names such as "@scope/name" become nested directories.
    const packageRoot = join(
      workspacePath,
      'node_modules',
      ...packageName.split('/'),
    )
    const skillDir = join(packageRoot, 'skills', skillName)
    const packageJsonPath = join(packageRoot, 'package.json')
    const skillPath = join(skillDir, 'SKILL.md')
    const manifest = {
      name: packageName,
      version: '0.0.0-intent-eval',
      intent: {
        version: 1,
        repo: `TanStack/${area}`,
        docs: 'docs/',
      },
    }

    // Creating the skill directory recursively also creates the package root.
    mkdirSync(skillDir, { recursive: true })
    writeFileSync(packageJsonPath, `${JSON.stringify(manifest, null, 2)}\n`)
    writeFileSync(
      skillPath,
      `---\nname: "${skillName}"\ndescription: "Guidance for ${area} eval tasks"\n---\n\nUse this skill for ${area} eval tasks.\n`,
    )

    filesWritten.push(packageJsonPath, skillPath)
  }

  return filesWritten
}
/**
 * Builds a synthetic `ScanResult` describing one locally installed package
 * with one skill for each expected skill area.
 *
 * @throws {Error} When an area's expected skill use has no `#<skill>` part.
 */
function scanResultForAreas(
  expectedSkillAreas: Array<ExpectedSkillArea>,
): ScanResult {
  const packages: ScanResult['packages'] = expectedSkillAreas.map((area) => {
    const packageName = packageAllowlistByArea[area]
    const use = expectedSkillUseByArea[area]
    // The skill use has the form "<package>#<skill>"; keep the skill part.
    const [, skillName] = use.split('#')

    if (!skillName) {
      throw new Error(`Invalid expected skill use for ${area}: ${use}`)
    }

    const packageRoot = `node_modules/${packageName}`

    return {
      intent: {
        docs: 'docs/',
        repo: `TanStack/${area}`,
        version: 1,
      },
      kind: 'npm',
      name: packageName,
      packageRoot,
      skills: [
        {
          description: `Guidance for ${area} eval tasks`,
          name: skillName,
          path: `${packageRoot}/skills/${skillName}/SKILL.md`,
        },
      ],
      source: 'local',
      version: '0.0.0-intent-eval',
    }
  })

  return {
    conflicts: [],
    nodeModules: {
      global: { detected: false, exists: false, path: null, scanned: false },
      local: {
        detected: true,
        exists: true,
        path: 'node_modules',
        scanned: true,
      },
    },
    notices: [],
    packageManager: 'npm',
    packages,
    stats: { packageJsonCacheHits: 0, packageJsonReadCount: 0 },
    warnings: [],
  }
}
attachEvalMetadata, score } from './graders/eval-metadata' +import { classifyFailure } from './graders/failure-classifier' +import { referenceOnly } from './graders/reference-only' +import { strictIntentInvocation } from './graders/strict-invocation' +import { savedTranscriptCases } from './fixtures/saved-transcripts' +import { savedTranscriptHarness } from './harness/saved-transcript-harness' +import type { HarnessContext } from 'vitest-evals' + +describe('Intent discovery saved transcripts', () => { + for (const evalCase of savedTranscriptCases) { + it(evalCase.id, async (context) => { + const result = await runSavedTranscript(evalCase) + const strict = strictIntentInvocation(result) + const loaded = correctSkillLoaded(result, evalCase.expectedSkillAreas) + const reference = referenceOnly(result, evalCase.expectedSkillAreas) + const failureClass = classifyFailure(result, evalCase.expectedSkillAreas) + const autonomous = countsTowardAutonomousScore({ + condition: evalCase.condition, + explicitnessLevel: evalCase.explicitnessLevel, + }) + const scores = [ + score( + 'AutonomousDiscoverySuccess', + autonomous && strict.passed && loaded.passed, + { + rationale: + 'Scores only autonomous runs where Copilot invoked Intent and loaded the expected skill.', + failureClass, + }, + ), + score('StrictIntentInvocation', strict.passed, { + matchedCommand: strict.matchedCommand, + source: strict.source, + }), + score('CorrectSkillLoaded', loaded.passed, { + loadedSkills: loaded.loadedSkills, + expectedSkillAreas: evalCase.expectedSkillAreas, + }), + score('NoReferenceOnlyFalsePositive', !reference, { + referenceOnly: reference, + }), + ] + + attachEvalMetadata({ + harnessName: savedTranscriptHarness.name, + run: result, + scores, + task: context.task, + }) + + expect(result.errors).toHaveLength(0) + expect(failedSpans(result)).toHaveLength(0) + expect(result.output.finalAnswer.length).toBeGreaterThan(0) + expect(toolCalls(result).length).toBe(evalCase.toolCalls.length) + 
/**
 * Runs the saved-transcript harness for one case, using a minimal in-memory
 * context whose `setArtifact` stores values on a plain `artifacts` object.
 */
async function runSavedTranscript(
  evalCase: (typeof savedTranscriptCases)[number],
) {
  const artifacts: HarnessContext['artifacts'] = {}
  const setArtifact: HarnessContext['setArtifact'] = (name, value) => {
    artifacts[name] = value
  }
  const context: HarnessContext = { artifacts, setArtifact }

  return savedTranscriptHarness.run(evalCase, context)
}
liveRunCount = liveRunCountFromEnv() + +if (!routerTask) { + throw new Error('Missing router-current-intent-loads-router task') +} + +describe('Intent discovery live Copilot harness', () => { + it('returns an explicit unsupported result until live capture is wired', async () => { + const result = await withoutCopilotCommand(() => runLiveHarness(routerTask)) + + expect(result.output).toEqual({ + finalAnswer: '', + runId: `live:${routerTask.id}`, + }) + expect(result.artifacts?.runKind).toBe('live-copilot') + expect(result.artifacts?.runnerStatus).toBe('unsupported') + expect(result.artifacts?.workspacePath).toEqual(expect.any(String)) + expect(toolCalls(result)).toHaveLength(0) + expect(result.errors).toEqual([ + { + message: + 'Live Copilot runner is not wired yet. Use saved transcripts until the runner can launch Copilot and capture transcript, command, and diff evidence.', + type: 'LiveCopilotRunnerUnavailableError', + }, + ]) + expect(failedSpans(result)).toHaveLength(1) + }) + + it('runs an opt-in command backend and captures command, skill, transcript, and diff evidence', async () => { + const tempDir = mkdtempSync(join(tmpdir(), 'intent-eval-command-')) + const fakeRunnerPath = join(tempDir, 'fake-runner.mjs') + const previousCommand = process.env.INTENT_DISCOVERY_COPILOT_COMMAND + + writeFileSync( + fakeRunnerPath, + [ + "import { writeFileSync } from 'node:fs'", + "writeFileSync('agent-output.txt', process.env.INTENT_DISCOVERY_TASK_ID ?? 
'')", + "console.log('$ intent list')", + "console.log('@tanstack/router#routing - Router route guidance')", + "console.log('$ intent load @tanstack/router#routing')", + "console.log('Loaded @tanstack/router#routing')", + "console.log('FINAL_ANSWER: Loaded router guidance and updated the fixture.')", + ].join('\n'), + ) + process.env.INTENT_DISCOVERY_COPILOT_COMMAND = `node ${fakeRunnerPath}` + + try { + const result = await runLiveHarness(routerTask) + + expect(result.errors).toEqual([]) + expect(result.output.finalAnswer).toBe( + 'Loaded router guidance and updated the fixture.', + ) + expect(result.artifacts?.runnerStatus).toBe('completed') + expect(result.artifacts?.intentCommandsInvoked).toEqual([ + 'intent list', + 'intent load @tanstack/router#routing', + ]) + expect(result.artifacts?.loadedSkills).toEqual([ + '@tanstack/router#routing', + ]) + expect(result.artifacts?.fileDiff).toEqual( + expect.stringContaining('agent-output.txt'), + ) + expect(result.artifacts?.transcriptPath).toEqual(expect.any(String)) + expect(existsSync(String(result.artifacts?.transcriptPath))).toBe(true) + expect(toolCalls(result)).toHaveLength(2) + expect(failedSpans(result)).toHaveLength(0) + } finally { + if (previousCommand === undefined) { + delete process.env.INTENT_DISCOVERY_COPILOT_COMMAND + } else { + process.env.INTENT_DISCOVERY_COPILOT_COMMAND = previousCommand + } + rmSync(tempDir, { recursive: true, force: true }) + } + }) + + for (const liveTask of liveTasks) { + for (let runIndex = 1; runIndex <= liveRunCount; runIndex += 1) { + it.skipIf(process.env.INTENT_DISCOVERY_RUN_LIVE !== '1')( + `live/${liveTask.condition}/${liveTask.fixture}/run-${runIndex}`, + async (context) => { + const task = liveRunTask(liveTask, runIndex) + const result = await runLiveHarness(task) + + attachLiveEvalMetadata({ + contextTask: context.task, + result, + task, + }) + + expect(result.artifacts?.runnerStatus).toBe('completed') + expect(result.output.runId).toBe(`live:${task.id}`) + 
/**
 * Reads how many times each live condition should run from
 * `INTENT_DISCOVERY_RUN_COUNT`.
 *
 * @returns The configured count when it is an integer of at least 1,
 *   otherwise 1.
 */
function liveRunCountFromEnv(): number {
  const requested = Number(process.env.INTENT_DISCOVERY_RUN_COUNT ?? '1')
  const isUsable = Number.isInteger(requested) && requested >= 1

  return isUsable ? requested : 1
}

/**
 * Returns a copy of the task whose id is suffixed with `-run-<runIndex>`.
 */
function liveRunTask(
  task: IntentDiscoveryTask,
  runIndex: number,
): IntentDiscoveryTask {
  const id = `${task.id}-run-${runIndex}`

  return { ...task, id }
}
/**
 * Runs `run` with `INTENT_DISCOVERY_COPILOT_COMMAND` removed from the
 * environment and restores the previous state afterwards, whether `run`
 * resolves or rejects.
 *
 * When the variable was unset before, it is unset again afterwards, so a
 * value assigned while `run` executes does not leak into later tests.
 *
 * @param run - Async callback to execute without the command configured.
 * @returns Whatever `run` resolves to.
 */
async function withoutCopilotCommand<T>(run: () => Promise<T>): Promise<T> {
  const previousCommand = process.env.INTENT_DISCOVERY_COPILOT_COMMAND

  delete process.env.INTENT_DISCOVERY_COPILOT_COMMAND

  try {
    return await run()
  } finally {
    if (previousCommand === undefined) {
      delete process.env.INTENT_DISCOVERY_COPILOT_COMMAND
    } else {
      process.env.INTENT_DISCOVERY_COPILOT_COMMAND = previousCommand
    }
  }
}
test: { + include: ['evals/intent-discovery/**/*.eval.ts'], + testTimeout: 120_000, + hookTimeout: 120_000, + reporters: ['default'], + env: { + VITEST_EVALS_REPLAY_DIR: + process.env.VITEST_EVALS_REPLAY_DIR ?? + 'evals/intent-discovery/.vitest-evals/recordings', + }, + }, +}) diff --git a/knip.json b/knip.json index 1622a2c..fb1b4d4 100644 --- a/knip.json +++ b/knip.json @@ -2,7 +2,13 @@ "$schema": "https://unpkg.com/knip@5/schema.json", "workspaces": { ".": { - "entry": ["scripts/*.ts"] + "entry": [ + "scripts/*.ts", + "evals/intent-discovery/*.eval.ts", + "evals/intent-discovery/bin/*.mjs" + ], + "ignoreBinaries": ["copilot", "diff"], + "ignoreFiles": ["evals/intent-discovery/fixtures/**/src/**/*"] }, "packages/intent": { "entry": ["src/index.ts", "src/cli.ts", "src/core.ts", "src/setup.ts"], diff --git a/package.json b/package.json index 360be17..bcc3247 100644 --- a/package.json +++ b/package.json @@ -25,9 +25,16 @@ "format": "prettier --experimental-cli --ignore-unknown '**/*' --write", "lint:fix": "nx affected --target=lint:fix --exclude=examples/**", "lint:fix:all": "nx run-many --targets=lint --fix", + "generate-docs": "node scripts/generate-docs.ts", + "eval:intent-discovery": "vitest run --config evals/intent-discovery/vitest.evals.config.ts", + "eval:intent-discovery:json": "vitest run --config evals/intent-discovery/vitest.evals.config.ts --reporter=default --reporter=json --outputFile.json=evals/intent-discovery/runs/latest/vitest-results.json", + "eval:intent-discovery:live": "INTENT_DISCOVERY_RUN_LIVE=1 INTENT_DISCOVERY_COPILOT_COMMAND=\"node $PWD/evals/intent-discovery/bin/copilot-cli-adapter.mjs\" vitest run --config evals/intent-discovery/vitest.evals.config.ts", + "eval:intent-discovery:live:json": "INTENT_DISCOVERY_RUN_LIVE=1 INTENT_DISCOVERY_COPILOT_COMMAND=\"node $PWD/evals/intent-discovery/bin/copilot-cli-adapter.mjs\" vitest run --config evals/intent-discovery/vitest.evals.config.ts --reporter=default --reporter=json 
--outputFile.json=evals/intent-discovery/runs/latest/vitest-results.json", + "eval:intent-discovery:judge": "node evals/intent-discovery/bin/llm-judge.mjs evals/intent-discovery/runs/latest/vitest-results.json", + "eval:intent-discovery:report": "vitest-evals serve evals/intent-discovery/runs/latest/vitest-results.json", + "eval:intent-discovery:summary": "node evals/intent-discovery/bin/summarize-results.mjs evals/intent-discovery/runs/latest/vitest-results.json", "test": "pnpm run test:ci", "test:ci": "tsc --noEmit && nx run-many --targets=test:eslint,test:sherif,test:knip,test:docs,test:lib,test:integration,test:types,build", - "generate-docs": "node scripts/generate-docs.ts", "test:docs": "node scripts/verify-links.ts", "test:eslint": "nx affected --target=test:eslint --exclude=examples/**", "test:knip": "knip", @@ -60,6 +67,7 @@ "sherif": "^1.11.1", "tinyglobby": "^0.2.17", "typescript": "6.0.3", - "vitest": "4.1.8" + "vitest": "4.1.8", + "vitest-evals": "^0.13.1" } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8d95985..4d62027 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -59,6 +59,9 @@ importers: vitest: specifier: 4.1.8 version: 4.1.8(@types/node@25.0.9)(happy-dom@20.3.1)(vite@7.3.1(@types/node@25.0.9)(jiti@2.7.0)(yaml@2.9.0)) + vitest-evals: + specifier: ^0.13.1 + version: 0.13.1(tinyrainbow@3.1.0)(vitest@4.1.8(@types/node@25.0.9)(happy-dom@20.3.1)(vite@7.3.1(@types/node@25.0.9)(jiti@2.7.0)(yaml@2.9.0)))(zod@4.3.5) benchmarks/intent: devDependencies: @@ -1469,6 +1472,12 @@ packages: resolution: {integrity: sha512-LTV6/Kcr8pS//iDjDitfYi1bp0AlKBUuvNoHAbI8tMnj0PLOUtRWJxId5FwuE+z3oNBeDLeOWPoXVtqKjl277Q==} engines: {node: '>=18'} + '@vitest-evals/core@0.13.1': + resolution: {integrity: sha512-YX5bRG+J0GCzwJiNoq7UHJVRrtqx07lF3cYUrHnvfRLrn/R5nfBkFkm9eluAYlMFbWehFw+fFIW7bPuyL+3pMg==} + + '@vitest-evals/report-ui@0.13.1': + resolution: {integrity: sha512-uA0OSe8UFhSP8i92hUNSFbdJ7Lwi0b06DVfvPb9lnEADgZrExv8IiHy9mkRuU+aMwo7zQI75ZZz1qx07XzPczA==} + 
'@vitest/expect@4.1.8': resolution: {integrity: sha512-h3nDO677RDLEGlBxyQ5CW8RlMThSKSRLUePLOx09gNIWRL40edgA1GCZSZgf1W55MFAG6/Sw14KeaAnqv0NKdQ==} @@ -3507,18 +3516,10 @@ packages: tinybench@2.9.0: resolution: {integrity: sha512-0+DUvqWMValLmha6lr4kD8iAMK1HzV0/aKnCtWb9v9641TnP/MFb7Pc2bxoxQjTXAErryXVgUOfv2YqNllqGeg==} - tinyexec@1.0.2: - resolution: {integrity: sha512-W/KYk+NFhkmsYpuHq5JykngiOCnxeVL8v8dFnqxSD8qEEdRfXk1SDM6JzNqcERbcGYj9tMrDQBYV9cjgnunFIg==} - engines: {node: '>=18'} - tinyexec@1.2.4: resolution: {integrity: sha512-SHf/r48b7vOrjve9PxJo3MN5v5yuyjHvdUcrQffT3WXMUfnGmHDVbC4k3sHJaJTgZCwpUplIaAo5ANtMyp3YHg==} engines: {node: '>=18'} - tinyglobby@0.2.15: - resolution: {integrity: sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ==} - engines: {node: '>=12.0.0'} - tinyglobby@0.2.17: resolution: {integrity: sha512-wXR/dYpcqKmfWpEdZjiKJOwCNFndD0DMnrW/cYjVGttEkBfVgcLFHoNrlj47mjOVic9yyNu65alsgF4NQyTa2g==} engines: {node: '>=12.0.0'} @@ -3788,6 +3789,20 @@ packages: yaml: optional: true + vitest-evals@0.13.1: + resolution: {integrity: sha512-UCA3drMFVxtYB3F/0AjQEBSp7EPc2Du2Au85kLHtQg4V6p2mpifP4m5VEfwgxVXq8UfrnsMk8SJvOB/5EiDC0g==} + hasBin: true + peerDependencies: + ai: '>=4 <7' + tinyrainbow: '>=2 <4' + vitest: '>=4 <5' + zod: '>=3 <5' + peerDependenciesMeta: + ai: + optional: true + zod: + optional: true + vitest@4.1.8: resolution: {integrity: sha512-flY6ScbCIt9HThs+C5HS7jvGOB560DJtk/Z15IQROTA6zEy49Nh8T/dofWTQL+n3vswqn87sbJNiuqw1SDp5Ig==} engines: {node: ^20.0.0 || ^22.0.0 || >=24.0.0} @@ -5217,6 +5232,14 @@ snapshots: lodash: 4.18.1 minimatch: 7.4.9 + '@vitest-evals/core@0.13.1': + dependencies: + zod: 4.3.5 + + '@vitest-evals/report-ui@0.13.1': + dependencies: + '@vitest-evals/core': 0.13.1 + '@vitest/expect@4.1.8': dependencies: '@standard-schema/spec': 1.1.0 @@ -6027,10 +6050,6 @@ snapshots: dependencies: walk-up-path: 4.0.0 - fdir@6.5.0(picomatch@4.0.3): - optionalDependencies: - picomatch: 4.0.3 - 
fdir@6.5.0(picomatch@4.0.4): optionalDependencies: picomatch: 4.0.4 @@ -7463,15 +7482,8 @@ snapshots: tinybench@2.9.0: {} - tinyexec@1.0.2: {} - tinyexec@1.2.4: {} - tinyglobby@0.2.15: - dependencies: - fdir: 6.5.0(picomatch@4.0.3) - picomatch: 4.0.3 - tinyglobby@0.2.17: dependencies: fdir: 6.5.0(picomatch@4.0.4) @@ -7511,7 +7523,7 @@ snapshots: ts-declaration-location@1.0.7(typescript@6.0.3): dependencies: - picomatch: 4.0.3 + picomatch: 4.0.4 typescript: 6.0.3 tsconfig-paths@4.2.0: @@ -7749,6 +7761,15 @@ snapshots: jiti: 2.7.0 yaml: 2.9.0 + vitest-evals@0.13.1(tinyrainbow@3.1.0)(vitest@4.1.8(@types/node@25.0.9)(happy-dom@20.3.1)(vite@7.3.1(@types/node@25.0.9)(jiti@2.7.0)(yaml@2.9.0)))(zod@4.3.5): + dependencies: + '@vitest-evals/core': 0.13.1 + '@vitest-evals/report-ui': 0.13.1 + tinyrainbow: 3.1.0 + vitest: 4.1.8(@types/node@25.0.9)(happy-dom@20.3.1)(vite@7.3.1(@types/node@25.0.9)(jiti@2.7.0)(yaml@2.9.0)) + optionalDependencies: + zod: 4.3.5 + vitest@4.1.8(@types/node@25.0.9)(happy-dom@20.3.1)(vite@7.3.1(@types/node@25.0.9)(jiti@2.7.0)(yaml@2.9.0)): dependencies: '@vitest/expect': 4.1.8 @@ -7763,11 +7784,11 @@ snapshots: magic-string: 0.30.21 obug: 2.1.1 pathe: 2.0.3 - picomatch: 4.0.3 + picomatch: 4.0.4 std-env: 4.1.0 tinybench: 2.9.0 - tinyexec: 1.0.2 - tinyglobby: 0.2.15 + tinyexec: 1.2.4 + tinyglobby: 0.2.17 tinyrainbow: 3.1.0 vite: 7.3.1(@types/node@25.0.9)(jiti@2.7.0)(yaml@2.9.0) why-is-node-running: 2.3.0