From dd90404a982529dd24b70f7aed63ae0b4e30709a Mon Sep 17 00:00:00 2001 From: ladybluenotes Date: Sat, 20 Jun 2026 08:31:43 -0700 Subject: [PATCH 01/13] Add intent discovery evaluation suite and related configurations - Introduced new README.md for intent discovery eval with usage instructions. - Added conditions and tasks for intent discovery evaluation. - Implemented grading functions for skill loading and failure classification. - Created harness for running saved transcripts in evaluations. - Configured Vitest for running intent discovery tests. - Updated package.json and pnpm-lock.yaml to include vitest-evals dependency. - Added .gitignore entries for evaluation runs and vitest artifacts. --- .gitignore | 7 + evals/intent-discovery/README.md | 15 ++ evals/intent-discovery/corpus/conditions.ts | 42 +++++ evals/intent-discovery/corpus/tasks.ts | 99 +++++++++++ .../fixtures/saved-transcripts.ts | 163 ++++++++++++++++++ .../graders/correct-skill-loaded.ts | 54 ++++++ .../graders/failure-classifier.ts | 38 ++++ .../graders/reference-only.ts | 19 ++ evals/intent-discovery/graders/skill-areas.ts | 38 ++++ .../graders/strict-invocation.ts | 75 ++++++++ .../harness/saved-transcript-harness.ts | 102 +++++++++++ .../intent-discovery/intent-discovery.eval.ts | 132 ++++++++++++++ evals/intent-discovery/runs/.gitkeep | 0 evals/intent-discovery/runs/latest/.gitkeep | 0 evals/intent-discovery/vitest.evals.config.ts | 15 ++ package.json | 8 +- pnpm-lock.yaml | 67 ++++--- 17 files changed, 849 insertions(+), 25 deletions(-) create mode 100644 evals/intent-discovery/README.md create mode 100644 evals/intent-discovery/corpus/conditions.ts create mode 100644 evals/intent-discovery/corpus/tasks.ts create mode 100644 evals/intent-discovery/fixtures/saved-transcripts.ts create mode 100644 evals/intent-discovery/graders/correct-skill-loaded.ts create mode 100644 evals/intent-discovery/graders/failure-classifier.ts create mode 100644 evals/intent-discovery/graders/reference-only.ts 
create mode 100644 evals/intent-discovery/graders/skill-areas.ts create mode 100644 evals/intent-discovery/graders/strict-invocation.ts create mode 100644 evals/intent-discovery/harness/saved-transcript-harness.ts create mode 100644 evals/intent-discovery/intent-discovery.eval.ts create mode 100644 evals/intent-discovery/runs/.gitkeep create mode 100644 evals/intent-discovery/runs/latest/.gitkeep create mode 100644 evals/intent-discovery/vitest.evals.config.ts diff --git a/.gitignore b/.gitignore index 254f72f..b1c20b4 100644 --- a/.gitignore +++ b/.gitignore @@ -63,3 +63,10 @@ vite.config.ts.timestamp-* .angular docs/superpowers + +evals/intent-discovery/.vitest-evals/ +evals/intent-discovery/runs/* +!evals/intent-discovery/runs/.gitkeep +!evals/intent-discovery/runs/latest/ +evals/intent-discovery/runs/latest/* +!evals/intent-discovery/runs/latest/.gitkeep diff --git a/evals/intent-discovery/README.md b/evals/intent-discovery/README.md new file mode 100644 index 0000000..3b43d0a --- /dev/null +++ b/evals/intent-discovery/README.md @@ -0,0 +1,15 @@ +# Intent discovery eval + +Opt-in eval suite for measuring whether Copilot discovers and invokes Intent surfaces without direct user instruction. + +## Commands + +- `pnpm eval:intent-discovery` runs the saved-transcript eval suite. +- `pnpm eval:intent-discovery:json` writes `evals/intent-discovery/runs/latest/vitest-results.json`. +- `pnpm eval:intent-discovery:report` serves the saved JSON report. + +## Current scope + +This executable slice grades synthetic saved transcripts with Vitest plus `vitest-evals` harness normalization helpers. It attaches `vitest-evals`-compatible metadata to the Vitest JSON artifact for the local report UI because this repo's current Vitest runtime does not expose the APIs used by `vitest-evals/reporter` and `describeEval()`. + +Harness integrity failures fail the eval. 
Product findings such as reference-only behavior, no discovery attempt, or wrong skill selection are recorded as diagnostic failures, not passing scores. The headline success signal is strict Intent invocation plus the expected skill loaded for autonomous cases. diff --git a/evals/intent-discovery/corpus/conditions.ts b/evals/intent-discovery/corpus/conditions.ts new file mode 100644 index 0000000..cafb9d2 --- /dev/null +++ b/evals/intent-discovery/corpus/conditions.ts @@ -0,0 +1,42 @@ +export const intentDiscoveryConditions = [ + { + id: 'no-intent', + countsTowardAutonomousScore: true, + }, + { + id: 'plain-docs', + countsTowardAutonomousScore: true, + }, + { + id: 'current-intent', + countsTowardAutonomousScore: true, + }, + { + id: 'explicit-intent-control', + countsTowardAutonomousScore: false, + }, +] as const + +export type IntentDiscoveryCondition = + (typeof intentDiscoveryConditions)[number]['id'] + +export const promptExplicitnessLevels = [0, 1, 2, 3, 4] as const + +export type PromptExplicitnessLevel = (typeof promptExplicitnessLevels)[number] + +export function countsTowardAutonomousScore({ + condition, + explicitnessLevel, +}: { + condition: IntentDiscoveryCondition + explicitnessLevel: PromptExplicitnessLevel +}): boolean { + if (explicitnessLevel === 4) { + return false + } + + return ( + intentDiscoveryConditions.find((candidate) => candidate.id === condition) + ?.countsTowardAutonomousScore ?? 
false + ) +} diff --git a/evals/intent-discovery/corpus/tasks.ts b/evals/intent-discovery/corpus/tasks.ts new file mode 100644 index 0000000..d01826c --- /dev/null +++ b/evals/intent-discovery/corpus/tasks.ts @@ -0,0 +1,99 @@ +import type { + IntentDiscoveryCondition, + PromptExplicitnessLevel, +} from './conditions' + +export const expectedSkillAreas = [ + 'router', + 'query', + 'table', + 'form', + 'start', +] as const + +export type ExpectedSkillArea = (typeof expectedSkillAreas)[number] + +export type IntentDiscoveryFixture = + | 'router-basic' + | 'query-basic' + | 'table-basic' + | 'form-basic' + | 'start-basic' + | 'mixed-app' + +export type IntentDiscoveryFailureClass = + | 'strict-success' + | 'no-discovery-attempt' + | 'instruction-ignored' + | 'wrong-surface' + | 'command-unknown' + | 'command-attempted-but-failed' + | 'wrong-skill-selected' + | 'late-load' + | 'reference-only' + | 'final-output-only' + | 'context-saturation' + | 'prompt-too-vague' + | 'harness-error' + +export type IntentDiscoveryExpected = { + strictInvocation: boolean + correctSkillLoaded: boolean + referenceOnly: boolean + failureClass: IntentDiscoveryFailureClass +} + +export type IntentDiscoveryTask = { + id: string + fixture: IntentDiscoveryFixture + condition: IntentDiscoveryCondition + explicitnessLevel: PromptExplicitnessLevel + prompt: string + expectedSkillAreas: ExpectedSkillArea[] + expected: IntentDiscoveryExpected +} + +export const tasks: IntentDiscoveryTask[] = [ + { + id: 'router-current-intent-loads-router', + fixture: 'router-basic', + condition: 'current-intent', + explicitnessLevel: 2, + prompt: 'Add a route that loads user data before rendering the page.', + expectedSkillAreas: ['router'], + expected: { + strictInvocation: true, + correctSkillLoaded: true, + referenceOnly: false, + failureClass: 'strict-success', + }, + }, + { + id: 'router-plain-docs-reference-only', + fixture: 'router-basic', + condition: 'plain-docs', + explicitnessLevel: 2, + prompt: 'Add a 
route that loads user data before rendering the page.', + expectedSkillAreas: ['router'], + expected: { + strictInvocation: false, + correctSkillLoaded: false, + referenceOnly: true, + failureClass: 'reference-only', + }, + }, + { + id: 'query-current-intent-loads-wrong-skill', + fixture: 'query-basic', + condition: 'current-intent', + explicitnessLevel: 2, + prompt: 'Add a mutation that invalidates the user list query after save.', + expectedSkillAreas: ['query'], + expected: { + strictInvocation: true, + correctSkillLoaded: false, + referenceOnly: false, + failureClass: 'wrong-skill-selected', + }, + }, +] diff --git a/evals/intent-discovery/fixtures/saved-transcripts.ts b/evals/intent-discovery/fixtures/saved-transcripts.ts new file mode 100644 index 0000000..9ff337e --- /dev/null +++ b/evals/intent-discovery/fixtures/saved-transcripts.ts @@ -0,0 +1,163 @@ +import type { NormalizedMessage, SimpleToolCallRecord } from 'vitest-evals' +import type { IntentDiscoveryTask } from '../corpus/tasks' +import { tasks } from '../corpus/tasks' + +export type SavedTranscriptCase = IntentDiscoveryTask & { + finalAnswer: string + messages: NormalizedMessage[] + toolCalls: SimpleToolCallRecord[] + commandsInvoked: string[] + intentCommandsInvoked: string[] + intentCommandOutputs: string[] + loadedSkills: string[] + agentErrors: string[] +} + +const taskById = new Map(tasks.map((task) => [task.id, task])) + +export const savedTranscriptCases: SavedTranscriptCase[] = [ + savedTranscript('router-current-intent-loads-router', { + finalAnswer: + 'I loaded @tanstack/router#routing and used its route loader guidance before making the route change.', + messages: [ + { + role: 'user', + content: 'Add a route that loads user data before rendering the page.', + }, + { + role: 'assistant', + content: 'I will inspect the installed Intent skills first.', + }, + { + role: 'tool', + content: + '$ pnpm exec intent list\n@tanstack/router#routing - Router route and loader guidance', + }, + { + 
role: 'tool', + content: + '$ pnpm exec intent load @tanstack/router#routing\nLoaded @tanstack/router#routing', + }, + { + role: 'assistant', + content: + 'I loaded @tanstack/router#routing and used its route loader guidance before making the route change.', + }, + ], + toolCalls: [ + { + name: 'shell_command', + arguments: { command: 'pnpm exec intent list' }, + result: '@tanstack/router#routing - Router route and loader guidance', + }, + { + name: 'shell_command', + arguments: { + command: 'pnpm exec intent load @tanstack/router#routing', + }, + result: 'Loaded @tanstack/router#routing', + }, + ], + commandsInvoked: [ + 'pnpm exec intent list', + 'pnpm exec intent load @tanstack/router#routing', + ], + intentCommandsInvoked: [ + 'pnpm exec intent list', + 'pnpm exec intent load @tanstack/router#routing', + ], + intentCommandOutputs: [ + '@tanstack/router#routing - Router route and loader guidance', + 'Loaded @tanstack/router#routing', + ], + loadedSkills: ['@tanstack/router#routing'], + agentErrors: [], + }), + savedTranscript('router-plain-docs-reference-only', { + finalAnswer: + 'This looks like a TanStack Router loader task. I would follow the Router docs and add the loader to the route.', + messages: [ + { + role: 'user', + content: 'Add a route that loads user data before rendering the page.', + }, + { + role: 'assistant', + content: + 'This looks like a TanStack Router loader task. 
I would follow the Router docs and add the loader to the route.', + }, + ], + toolCalls: [], + commandsInvoked: [], + intentCommandsInvoked: [], + intentCommandOutputs: [], + loadedSkills: [], + agentErrors: [], + }), + savedTranscript('query-current-intent-loads-wrong-skill', { + finalAnswer: + 'I loaded @tanstack/router#routing, but the task needed TanStack Query mutation guidance.', + messages: [ + { + role: 'user', + content: + 'Add a mutation that invalidates the user list query after save.', + }, + { + role: 'tool', + content: + '$ intent list\n@tanstack/router#routing - Router route and loader guidance\n@tanstack/query#mutations - Query mutation guidance', + }, + { + role: 'tool', + content: + '$ intent load @tanstack/router#routing\nLoaded @tanstack/router#routing', + }, + { + role: 'assistant', + content: + 'I loaded @tanstack/router#routing, but the task needed TanStack Query mutation guidance.', + }, + ], + toolCalls: [ + { + name: 'shell_command', + arguments: { command: 'intent list' }, + result: + '@tanstack/router#routing - Router route and loader guidance\n@tanstack/query#mutations - Query mutation guidance', + }, + { + name: 'shell_command', + arguments: { command: 'intent load @tanstack/router#routing' }, + result: 'Loaded @tanstack/router#routing', + }, + ], + commandsInvoked: ['intent list', 'intent load @tanstack/router#routing'], + intentCommandsInvoked: [ + 'intent list', + 'intent load @tanstack/router#routing', + ], + intentCommandOutputs: [ + '@tanstack/router#routing - Router route and loader guidance\n@tanstack/query#mutations - Query mutation guidance', + 'Loaded @tanstack/router#routing', + ], + loadedSkills: ['@tanstack/router#routing'], + agentErrors: [], + }), +] + +function savedTranscript( + taskId: string, + transcript: Omit, +): SavedTranscriptCase { + const task = taskById.get(taskId) + + if (!task) { + throw new Error(`Unknown saved transcript task: ${taskId}`) + } + + return { + ...task, + ...transcript, + } +} diff --git 
a/evals/intent-discovery/graders/correct-skill-loaded.ts b/evals/intent-discovery/graders/correct-skill-loaded.ts new file mode 100644 index 0000000..dde849e --- /dev/null +++ b/evals/intent-discovery/graders/correct-skill-loaded.ts @@ -0,0 +1,54 @@ +import type { HarnessRun } from 'vitest-evals' +import { toolCalls } from 'vitest-evals' +import type { ExpectedSkillArea } from '../corpus/tasks' +import { listIncludesExpectedSkillArea } from './skill-areas' + +export type CorrectSkillLoadedResult = { + passed: boolean + loadedSkills: string[] +} + +export function correctSkillLoaded( + run: HarnessRun, + expectedSkillAreas: ExpectedSkillArea[], +): CorrectSkillLoadedResult { + const loadedSkills = loadedSkillsFromRun(run) + + return { + passed: listIncludesExpectedSkillArea(loadedSkills, expectedSkillAreas), + loadedSkills, + } +} + +function loadedSkillsFromRun(run: HarnessRun): string[] { + const artifactSkills = stringArrayArtifact(run.artifacts?.loadedSkills) + const commandSkills = toolCalls(run) + .map((call) => commandString(call.arguments?.command)) + .filter((command): command is string => Boolean(command)) + .map((command) => skillFromLoadCommand(command)) + .filter((skill): skill is string => Boolean(skill)) + + return [...new Set([...artifactSkills, ...commandSkills])] +} + +function stringArrayArtifact(value: unknown): string[] { + if (!Array.isArray(value)) { + return [] + } + + return value.filter( + (candidate): candidate is string => typeof candidate === 'string', + ) +} + +function commandString(value: unknown): string | undefined { + return typeof value === 'string' ? 
value : undefined +} + +function skillFromLoadCommand(command: string): string | undefined { + const match = command.match( + /(?:^|\s)(?:(?:pnpm\s+exec\s+intent)|(?:npx\s+@tanstack\/intent)|(?:intent))\s+load\s+(\S+)/i, + ) + + return match?.[1] +} diff --git a/evals/intent-discovery/graders/failure-classifier.ts b/evals/intent-discovery/graders/failure-classifier.ts new file mode 100644 index 0000000..ce71635 --- /dev/null +++ b/evals/intent-discovery/graders/failure-classifier.ts @@ -0,0 +1,38 @@ +import type { HarnessRun } from 'vitest-evals' +import type { + ExpectedSkillArea, + IntentDiscoveryFailureClass, +} from '../corpus/tasks' +import { correctSkillLoaded } from './correct-skill-loaded' +import { referenceOnly } from './reference-only' +import { strictIntentInvocation } from './strict-invocation' + +export function classifyFailure( + run: HarnessRun, + expectedSkillAreas: ExpectedSkillArea[], +): IntentDiscoveryFailureClass { + if (run.errors.length > 0) { + return 'harness-error' + } + + const strict = strictIntentInvocation(run) + const skillLoaded = correctSkillLoaded(run, expectedSkillAreas) + + if (strict.passed && skillLoaded.passed) { + return 'strict-success' + } + + if (strict.passed && skillLoaded.loadedSkills.length > 0) { + return 'wrong-skill-selected' + } + + if (strict.passed) { + return 'command-attempted-but-failed' + } + + if (referenceOnly(run, expectedSkillAreas)) { + return 'reference-only' + } + + return 'no-discovery-attempt' +} diff --git a/evals/intent-discovery/graders/reference-only.ts b/evals/intent-discovery/graders/reference-only.ts new file mode 100644 index 0000000..70285ca --- /dev/null +++ b/evals/intent-discovery/graders/reference-only.ts @@ -0,0 +1,19 @@ +import type { HarnessRun } from 'vitest-evals' +import type { ExpectedSkillArea } from '../corpus/tasks' +import { jsonToSearchableText, textMatchesSkillArea } from './skill-areas' +import { strictIntentInvocation } from './strict-invocation' + +export function 
referenceOnly( + run: HarnessRun, + expectedSkillAreas: ExpectedSkillArea[], +): boolean { + if (strictIntentInvocation(run).passed) { + return false + } + + const transcriptText = run.session.messages + .map((message) => jsonToSearchableText(message.content)) + .join('\n') + + return textMatchesSkillArea(transcriptText, expectedSkillAreas) +} diff --git a/evals/intent-discovery/graders/skill-areas.ts b/evals/intent-discovery/graders/skill-areas.ts new file mode 100644 index 0000000..c1acf9b --- /dev/null +++ b/evals/intent-discovery/graders/skill-areas.ts @@ -0,0 +1,38 @@ +import type { JsonValue } from 'vitest-evals' +import type { ExpectedSkillArea } from '../corpus/tasks' + +const skillAreaPatterns: Record = { + router: [/router/i, /routing/i, /@tanstack\/router/i], + query: [/query/i, /mutation/i, /@tanstack\/query/i], + table: [/table/i, /column/i, /sorting/i, /@tanstack\/table/i], + form: [/form/i, /validation/i, /submit/i, /@tanstack\/form/i], + start: [/start/i, /full-stack/i, /@tanstack\/start/i], +} + +export function jsonToSearchableText(value: JsonValue | undefined): string { + if (value === undefined || value === null) { + return '' + } + + if (typeof value === 'string') { + return value + } + + return JSON.stringify(value) +} + +export function textMatchesSkillArea( + text: string, + expectedSkillAreas: ExpectedSkillArea[], +): boolean { + return expectedSkillAreas.some((area) => + skillAreaPatterns[area].some((pattern) => pattern.test(text)), + ) +} + +export function listIncludesExpectedSkillArea( + values: string[], + expectedSkillAreas: ExpectedSkillArea[], +): boolean { + return values.some((value) => textMatchesSkillArea(value, expectedSkillAreas)) +} diff --git a/evals/intent-discovery/graders/strict-invocation.ts b/evals/intent-discovery/graders/strict-invocation.ts new file mode 100644 index 0000000..7fe5911 --- /dev/null +++ b/evals/intent-discovery/graders/strict-invocation.ts @@ -0,0 +1,75 @@ +import type { HarnessRun, ToolCallRecord } 
from 'vitest-evals' +import { toolCalls } from 'vitest-evals' +import { jsonToSearchableText } from './skill-areas' + +export type StrictInvocationResult = { + passed: boolean + matchedCommand?: string + source?: 'tool-call' | 'tool-message' +} + +const intentToolNames = new Set(['intent_list', 'intent_load']) + +const acceptedIntentCommandPattern = + /(?:^|\s)(?:(?:pnpm\s+exec\s+intent)|(?:npx\s+@tanstack\/intent)|(?:intent))\s+(?:list|load\s+\S+)/i + +export function strictIntentInvocation( + run: HarnessRun, +): StrictInvocationResult { + for (const call of toolCalls(run)) { + if (intentToolNames.has(call.name)) { + return { + passed: true, + matchedCommand: call.name, + source: 'tool-call', + } + } + + const command = commandFromToolCall(call) + + if (command && acceptedIntentCommandPattern.test(command)) { + return { + passed: true, + matchedCommand: command, + source: 'tool-call', + } + } + } + + for (const message of run.session.messages) { + if (message.role !== 'tool') { + continue + } + + const content = jsonToSearchableText(message.content) + const match = content.match(acceptedIntentCommandPattern) + + if (match?.[0]) { + return { + passed: true, + matchedCommand: match[0].trim(), + source: 'tool-message', + } + } + } + + return { passed: false } +} + +function commandFromToolCall(call: ToolCallRecord): string | undefined { + return ( + stringRecordValue(call.arguments, 'command') ?? + stringRecordValue(call.arguments, 'cmd') ?? + stringRecordValue(call.arguments, 'input') ?? + stringRecordValue(call.metadata, 'command') + ) +} + +function stringRecordValue( + value: Record | undefined, + key: string, +): string | undefined { + const candidate = value?.[key] + + return typeof candidate === 'string' ? 
candidate : undefined +} diff --git a/evals/intent-discovery/harness/saved-transcript-harness.ts b/evals/intent-discovery/harness/saved-transcript-harness.ts new file mode 100644 index 0000000..6e560be --- /dev/null +++ b/evals/intent-discovery/harness/saved-transcript-harness.ts @@ -0,0 +1,102 @@ +import { createHarness } from 'vitest-evals' +import type { NormalizedMessage, SimpleToolCallRecord } from 'vitest-evals' +import type { SavedTranscriptCase } from '../fixtures/saved-transcripts' + +export type IntentDiscoveryOutput = { + finalAnswer: string + runId: string +} + +export const savedTranscriptHarness = createHarness< + SavedTranscriptCase, + IntentDiscoveryOutput +>({ + name: 'intent-discovery-saved-transcript', + run: ({ input, setArtifact }) => { + const runId = `saved:${input.id}` + + setArtifact('runId', runId) + setArtifact('taskId', input.id) + setArtifact('condition', input.condition) + setArtifact('fixture', input.fixture) + setArtifact('prompt', input.prompt) + setArtifact('expectedSkillAreas', input.expectedSkillAreas) + setArtifact( + 'transcriptPath', + 'evals/intent-discovery/fixtures/saved-transcripts.ts', + ) + setArtifact('commandsInvoked', input.commandsInvoked) + setArtifact('intentCommandsInvoked', input.intentCommandsInvoked) + setArtifact('intentCommandOutputs', input.intentCommandOutputs) + setArtifact('loadedSkills', input.loadedSkills) + setArtifact('agentErrors', input.agentErrors) + + return { + output: { + finalAnswer: input.finalAnswer, + runId, + }, + messages: messagesWithToolCalls(input.messages, input.toolCalls), + toolCalls: input.toolCalls, + usage: { + provider: 'saved-transcript', + model: 'synthetic', + }, + artifacts: { + runKind: 'saved-transcript', + }, + traces: [ + { + id: runId, + name: 'saved transcript grading', + spans: [ + { + id: `${runId}:load`, + name: 'load saved transcript', + kind: 'custom', + status: 'ok', + attributes: { + taskId: input.id, + fixture: input.fixture, + condition: input.condition, + }, + 
}, + ], + }, + ], + errors: input.agentErrors, + } + }, +}) + +function messagesWithToolCalls( + messages: NormalizedMessage[], + toolCalls: SimpleToolCallRecord[], +): NormalizedMessage[] { + if (toolCalls.length === 0) { + return messages + } + + const firstAssistantIndex = messages.findIndex( + (message) => message.role === 'assistant', + ) + + if (firstAssistantIndex === -1) { + return [ + ...messages, + { + role: 'assistant', + toolCalls, + }, + ] + } + + return messages.map((message, index) => + index === firstAssistantIndex + ? { + ...message, + toolCalls: [...(message.toolCalls ?? []), ...toolCalls], + } + : message, + ) +} diff --git a/evals/intent-discovery/intent-discovery.eval.ts b/evals/intent-discovery/intent-discovery.eval.ts new file mode 100644 index 0000000..741d742 --- /dev/null +++ b/evals/intent-discovery/intent-discovery.eval.ts @@ -0,0 +1,132 @@ +import type { HarnessContext, HarnessRun, JudgeResult } from 'vitest-evals' +import { describe, expect, it } from 'vitest' +import { failedSpans, toolCalls } from 'vitest-evals' +import { countsTowardAutonomousScore } from './corpus/conditions' +import { correctSkillLoaded } from './graders/correct-skill-loaded' +import { classifyFailure } from './graders/failure-classifier' +import { referenceOnly } from './graders/reference-only' +import { strictIntentInvocation } from './graders/strict-invocation' +import { savedTranscriptCases } from './fixtures/saved-transcripts' +import { + savedTranscriptHarness, + type IntentDiscoveryOutput, +} from './harness/saved-transcript-harness' + +describe('Intent discovery saved transcripts', () => { + for (const evalCase of savedTranscriptCases) { + it(evalCase.id, async (context) => { + const result = await runSavedTranscript(evalCase) + const strict = strictIntentInvocation(result) + const loaded = correctSkillLoaded(result, evalCase.expectedSkillAreas) + const reference = referenceOnly(result, evalCase.expectedSkillAreas) + const failureClass = 
classifyFailure(result, evalCase.expectedSkillAreas) + const autonomous = countsTowardAutonomousScore({ + condition: evalCase.condition, + explicitnessLevel: evalCase.explicitnessLevel, + }) + const scores = [ + score( + 'AutonomousDiscoverySuccess', + autonomous && strict.passed && loaded.passed, + { + rationale: + 'Scores only autonomous runs where Copilot invoked Intent and loaded the expected skill.', + failureClass, + }, + ), + score('StrictIntentInvocation', strict.passed, { + matchedCommand: strict.matchedCommand, + source: strict.source, + }), + score('CorrectSkillLoaded', loaded.passed, { + loadedSkills: loaded.loadedSkills, + expectedSkillAreas: evalCase.expectedSkillAreas, + }), + score('NoReferenceOnlyFalsePositive', !reference, { + referenceOnly: reference, + }), + ] + + attachEvalMetadata({ + harnessName: savedTranscriptHarness.name, + run: result, + scores, + task: context.task, + }) + + expect(result.errors).toHaveLength(0) + expect(failedSpans(result)).toHaveLength(0) + expect(result.output.finalAnswer.length).toBeGreaterThan(0) + expect(toolCalls(result).length).toBe(evalCase.toolCalls.length) + expect(strict.passed).toBe(evalCase.expected.strictInvocation) + expect(loaded.passed).toBe(evalCase.expected.correctSkillLoaded) + expect(reference).toBe(evalCase.expected.referenceOnly) + expect(failureClass).toBe(evalCase.expected.failureClass) + expect(autonomous).toBe(evalCase.explicitnessLevel !== 4) + }) + } +}) + +type NamedJudgeResult = JudgeResult & { name: string } + +function score( + name: string, + passed: boolean, + metadata?: NamedJudgeResult['metadata'], +): NamedJudgeResult { + return { + name, + score: passed ? 1 : 0, + metadata, + } +} + +function attachEvalMetadata({ + harnessName, + run, + scores, + task, +}: { + harnessName: string + run: HarnessRun + scores: NamedJudgeResult[] + task: RuntimeTask +}): void { + const avgScore = + scores.reduce((total, item) => total + (item.score ?? 
0), 0) / scores.length + + task.meta.harness = { + name: harnessName, + run, + } + task.meta.eval = { + scores, + avgScore, + output: run.output, + toolCalls: toolCalls(run), + thresholdFailed: false, + } +} + +type RuntimeTask = { + meta: { + harness?: unknown + eval?: unknown + } +} + +async function runSavedTranscript( + evalCase: (typeof savedTranscriptCases)[number], +) { + const artifacts: HarnessContext['artifacts'] = {} + const context: HarnessContext = { + artifacts, + setArtifact(name, value) { + artifacts[name] = value + }, + } + + return savedTranscriptHarness.run(evalCase, context) as Promise< + HarnessRun + > +} diff --git a/evals/intent-discovery/runs/.gitkeep b/evals/intent-discovery/runs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/evals/intent-discovery/runs/latest/.gitkeep b/evals/intent-discovery/runs/latest/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/evals/intent-discovery/vitest.evals.config.ts b/evals/intent-discovery/vitest.evals.config.ts new file mode 100644 index 0000000..28a386a --- /dev/null +++ b/evals/intent-discovery/vitest.evals.config.ts @@ -0,0 +1,15 @@ +import { defineConfig } from 'vitest/config' + +export default defineConfig({ + test: { + include: ['evals/intent-discovery/**/*.eval.ts'], + testTimeout: 120_000, + hookTimeout: 120_000, + reporters: ['default'], + env: { + VITEST_EVALS_REPLAY_DIR: + process.env.VITEST_EVALS_REPLAY_DIR ?? 
+ 'evals/intent-discovery/.vitest-evals/recordings', + }, + }, +}) diff --git a/package.json b/package.json index 360be17..a38903c 100644 --- a/package.json +++ b/package.json @@ -25,9 +25,12 @@ "format": "prettier --experimental-cli --ignore-unknown '**/*' --write", "lint:fix": "nx affected --target=lint:fix --exclude=examples/**", "lint:fix:all": "nx run-many --targets=lint --fix", + "generate-docs": "node scripts/generate-docs.ts", + "eval:intent-discovery": "vitest run --config evals/intent-discovery/vitest.evals.config.ts", + "eval:intent-discovery:json": "vitest run --config evals/intent-discovery/vitest.evals.config.ts --reporter=default --reporter=json --outputFile.json=evals/intent-discovery/runs/latest/vitest-results.json", + "eval:intent-discovery:report": "vitest-evals serve evals/intent-discovery/runs/latest/vitest-results.json", "test": "pnpm run test:ci", "test:ci": "tsc --noEmit && nx run-many --targets=test:eslint,test:sherif,test:knip,test:docs,test:lib,test:integration,test:types,build", - "generate-docs": "node scripts/generate-docs.ts", "test:docs": "node scripts/verify-links.ts", "test:eslint": "nx affected --target=test:eslint --exclude=examples/**", "test:knip": "knip", @@ -60,6 +63,7 @@ "sherif": "^1.11.1", "tinyglobby": "^0.2.17", "typescript": "6.0.3", - "vitest": "4.1.8" + "vitest": "4.1.8", + "vitest-evals": "^0.13.1" } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8d95985..4d62027 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -59,6 +59,9 @@ importers: vitest: specifier: 4.1.8 version: 4.1.8(@types/node@25.0.9)(happy-dom@20.3.1)(vite@7.3.1(@types/node@25.0.9)(jiti@2.7.0)(yaml@2.9.0)) + vitest-evals: + specifier: ^0.13.1 + version: 0.13.1(tinyrainbow@3.1.0)(vitest@4.1.8(@types/node@25.0.9)(happy-dom@20.3.1)(vite@7.3.1(@types/node@25.0.9)(jiti@2.7.0)(yaml@2.9.0)))(zod@4.3.5) benchmarks/intent: devDependencies: @@ -1469,6 +1472,12 @@ packages: resolution: {integrity: 
sha512-LTV6/Kcr8pS//iDjDitfYi1bp0AlKBUuvNoHAbI8tMnj0PLOUtRWJxId5FwuE+z3oNBeDLeOWPoXVtqKjl277Q==} engines: {node: '>=18'} + '@vitest-evals/core@0.13.1': + resolution: {integrity: sha512-YX5bRG+J0GCzwJiNoq7UHJVRrtqx07lF3cYUrHnvfRLrn/R5nfBkFkm9eluAYlMFbWehFw+fFIW7bPuyL+3pMg==} + + '@vitest-evals/report-ui@0.13.1': + resolution: {integrity: sha512-uA0OSe8UFhSP8i92hUNSFbdJ7Lwi0b06DVfvPb9lnEADgZrExv8IiHy9mkRuU+aMwo7zQI75ZZz1qx07XzPczA==} + '@vitest/expect@4.1.8': resolution: {integrity: sha512-h3nDO677RDLEGlBxyQ5CW8RlMThSKSRLUePLOx09gNIWRL40edgA1GCZSZgf1W55MFAG6/Sw14KeaAnqv0NKdQ==} @@ -3507,18 +3516,10 @@ packages: tinybench@2.9.0: resolution: {integrity: sha512-0+DUvqWMValLmha6lr4kD8iAMK1HzV0/aKnCtWb9v9641TnP/MFb7Pc2bxoxQjTXAErryXVgUOfv2YqNllqGeg==} - tinyexec@1.0.2: - resolution: {integrity: sha512-W/KYk+NFhkmsYpuHq5JykngiOCnxeVL8v8dFnqxSD8qEEdRfXk1SDM6JzNqcERbcGYj9tMrDQBYV9cjgnunFIg==} - engines: {node: '>=18'} - tinyexec@1.2.4: resolution: {integrity: sha512-SHf/r48b7vOrjve9PxJo3MN5v5yuyjHvdUcrQffT3WXMUfnGmHDVbC4k3sHJaJTgZCwpUplIaAo5ANtMyp3YHg==} engines: {node: '>=18'} - tinyglobby@0.2.15: - resolution: {integrity: sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ==} - engines: {node: '>=12.0.0'} - tinyglobby@0.2.17: resolution: {integrity: sha512-wXR/dYpcqKmfWpEdZjiKJOwCNFndD0DMnrW/cYjVGttEkBfVgcLFHoNrlj47mjOVic9yyNu65alsgF4NQyTa2g==} engines: {node: '>=12.0.0'} @@ -3788,6 +3789,20 @@ packages: yaml: optional: true + vitest-evals@0.13.1: + resolution: {integrity: sha512-UCA3drMFVxtYB3F/0AjQEBSp7EPc2Du2Au85kLHtQg4V6p2mpifP4m5VEfwgxVXq8UfrnsMk8SJvOB/5EiDC0g==} + hasBin: true + peerDependencies: + ai: '>=4 <7' + tinyrainbow: '>=2 <4' + vitest: '>=4 <5' + zod: '>=3 <5' + peerDependenciesMeta: + ai: + optional: true + zod: + optional: true + vitest@4.1.8: resolution: {integrity: sha512-flY6ScbCIt9HThs+C5HS7jvGOB560DJtk/Z15IQROTA6zEy49Nh8T/dofWTQL+n3vswqn87sbJNiuqw1SDp5Ig==} engines: {node: ^20.0.0 || ^22.0.0 || >=24.0.0} @@ 
-5217,6 +5232,14 @@ snapshots: lodash: 4.18.1 minimatch: 7.4.9 + '@vitest-evals/core@0.13.1': + dependencies: + zod: 4.3.5 + + '@vitest-evals/report-ui@0.13.1': + dependencies: + '@vitest-evals/core': 0.13.1 + '@vitest/expect@4.1.8': dependencies: '@standard-schema/spec': 1.1.0 @@ -6027,10 +6050,6 @@ snapshots: dependencies: walk-up-path: 4.0.0 - fdir@6.5.0(picomatch@4.0.3): - optionalDependencies: - picomatch: 4.0.3 - fdir@6.5.0(picomatch@4.0.4): optionalDependencies: picomatch: 4.0.4 @@ -7463,15 +7482,8 @@ snapshots: tinybench@2.9.0: {} - tinyexec@1.0.2: {} - tinyexec@1.2.4: {} - tinyglobby@0.2.15: - dependencies: - fdir: 6.5.0(picomatch@4.0.3) - picomatch: 4.0.3 - tinyglobby@0.2.17: dependencies: fdir: 6.5.0(picomatch@4.0.4) @@ -7511,7 +7523,7 @@ snapshots: ts-declaration-location@1.0.7(typescript@6.0.3): dependencies: - picomatch: 4.0.3 + picomatch: 4.0.4 typescript: 6.0.3 tsconfig-paths@4.2.0: @@ -7749,6 +7761,15 @@ snapshots: jiti: 2.7.0 yaml: 2.9.0 + vitest-evals@0.13.1(tinyrainbow@3.1.0)(vitest@4.1.8(@types/node@25.0.9)(happy-dom@20.3.1)(vite@7.3.1(@types/node@25.0.9)(jiti@2.7.0)(yaml@2.9.0)))(zod@4.3.5): + dependencies: + '@vitest-evals/core': 0.13.1 + '@vitest-evals/report-ui': 0.13.1 + tinyrainbow: 3.1.0 + vitest: 4.1.8(@types/node@25.0.9)(happy-dom@20.3.1)(vite@7.3.1(@types/node@25.0.9)(jiti@2.7.0)(yaml@2.9.0)) + optionalDependencies: + zod: 4.3.5 + vitest@4.1.8(@types/node@25.0.9)(happy-dom@20.3.1)(vite@7.3.1(@types/node@25.0.9)(jiti@2.7.0)(yaml@2.9.0)): dependencies: '@vitest/expect': 4.1.8 @@ -7763,11 +7784,11 @@ snapshots: magic-string: 0.30.21 obug: 2.1.1 pathe: 2.0.3 - picomatch: 4.0.3 + picomatch: 4.0.4 std-env: 4.1.0 tinybench: 2.9.0 - tinyexec: 1.0.2 - tinyglobby: 0.2.15 + tinyexec: 1.2.4 + tinyglobby: 0.2.17 tinyrainbow: 3.1.0 vite: 7.3.1(@types/node@25.0.9)(jiti@2.7.0)(yaml@2.9.0) why-is-node-running: 2.3.0 From 4b548f5f4f4c9efa775cb863a0bcbf684f48aa64 Mon Sep 17 00:00:00 2001 From: ladybluenotes Date: Sat, 20 Jun 2026 08:48:58 -0700 Subject: 
[PATCH 02/13] Add intent discovery evaluation framework and related fixtures --- eslint.config.js | 9 +++ evals/intent-discovery/README.md | 2 + evals/intent-discovery/corpus/fixtures.ts | 29 ++++++++ evals/intent-discovery/corpus/tasks.ts | 25 +++---- evals/intent-discovery/fixture-corpus.eval.ts | 35 ++++++++++ .../fixtures/router-basic/package.json | 10 +++ .../router-basic/src/routes/users.$userId.tsx | 27 ++++++++ .../fixtures/saved-transcripts.ts | 33 +++++---- .../fixtures/start-basic/package.json | 11 +++ .../fixtures/start-basic/src/routes/users.tsx | 33 +++++++++ .../fixtures/table-v9-basic/package.json | 10 +++ .../table-v9-basic/src/user-table.tsx | 67 +++++++++++++++++++ .../graders/correct-skill-loaded.ts | 8 +-- .../graders/failure-classifier.ts | 2 +- .../graders/reference-only.ts | 2 +- evals/intent-discovery/graders/skill-areas.ts | 14 ++-- .../harness/saved-transcript-harness.ts | 8 +-- .../intent-discovery/intent-discovery.eval.ts | 2 +- evals/intent-discovery/tsconfig.json | 8 +++ 19 files changed, 282 insertions(+), 53 deletions(-) create mode 100644 evals/intent-discovery/corpus/fixtures.ts create mode 100644 evals/intent-discovery/fixture-corpus.eval.ts create mode 100644 evals/intent-discovery/fixtures/router-basic/package.json create mode 100644 evals/intent-discovery/fixtures/router-basic/src/routes/users.$userId.tsx create mode 100644 evals/intent-discovery/fixtures/start-basic/package.json create mode 100644 evals/intent-discovery/fixtures/start-basic/src/routes/users.tsx create mode 100644 evals/intent-discovery/fixtures/table-v9-basic/package.json create mode 100644 evals/intent-discovery/fixtures/table-v9-basic/src/user-table.tsx create mode 100644 evals/intent-discovery/tsconfig.json diff --git a/eslint.config.js b/eslint.config.js index bc64866..735a47d 100644 --- a/eslint.config.js +++ b/eslint.config.js @@ -73,6 +73,15 @@ const config = [ ], }, }, + { + name: 'intent/evals', + files: ['evals/intent-discovery/**/*.ts'], + 
languageOptions: { + parserOptions: { + project: './evals/intent-discovery/tsconfig.json', + }, + }, + }, ] export default config diff --git a/evals/intent-discovery/README.md b/evals/intent-discovery/README.md index 3b43d0a..94ca19b 100644 --- a/evals/intent-discovery/README.md +++ b/evals/intent-discovery/README.md @@ -12,4 +12,6 @@ Opt-in eval suite for measuring whether Copilot discovers and invokes Intent sur This executable slice grades synthetic saved transcripts with Vitest plus `vitest-evals` harness normalization helpers. It attaches `vitest-evals`-compatible metadata to the Vitest JSON artifact for the local report UI because this repo's current Vitest runtime does not expose the APIs used by `vitest-evals/reporter` and `describeEval()`. +The controlled fixture corpus is limited to current skill-backed surfaces. For this slice, that means TanStack Router, TanStack Start, and TanStack Table v9. + Harness integrity failures fail the eval. Product findings such as reference-only behavior, no discovery attempt, or wrong skill selection are recorded as diagnostic failures, not passing scores. The headline success signal is strict Intent invocation plus the expected skill loaded for autonomous cases. 
diff --git a/evals/intent-discovery/corpus/fixtures.ts b/evals/intent-discovery/corpus/fixtures.ts new file mode 100644 index 0000000..de38f74 --- /dev/null +++ b/evals/intent-discovery/corpus/fixtures.ts @@ -0,0 +1,29 @@ +import type { ExpectedSkillArea, IntentDiscoveryFixture } from './tasks' + +export type IntentDiscoveryFixtureDefinition = { + id: IntentDiscoveryFixture + purpose: string + skillAreas: Array + files: Array +} + +export const fixtures = { + 'router-basic': { + id: 'router-basic', + purpose: 'Route discovery and route loader changes.', + skillAreas: ['router'], + files: ['package.json', 'src/routes/users.$userId.tsx'], + }, + 'start-basic': { + id: 'start-basic', + purpose: 'TanStack Start server function and route loader behavior.', + skillAreas: ['start'], + files: ['package.json', 'src/routes/users.tsx'], + }, + 'table-v9-basic': { + id: 'table-v9-basic', + purpose: 'TanStack Table v9 column definitions and sorting behavior.', + skillAreas: ['table-v9'], + files: ['package.json', 'src/user-table.tsx'], + }, +} satisfies Record diff --git a/evals/intent-discovery/corpus/tasks.ts b/evals/intent-discovery/corpus/tasks.ts index d01826c..29bfa63 100644 --- a/evals/intent-discovery/corpus/tasks.ts +++ b/evals/intent-discovery/corpus/tasks.ts @@ -3,23 +3,14 @@ import type { PromptExplicitnessLevel, } from './conditions' -export const expectedSkillAreas = [ - 'router', - 'query', - 'table', - 'form', - 'start', -] as const +export const expectedSkillAreas = ['router', 'start', 'table-v9'] as const export type ExpectedSkillArea = (typeof expectedSkillAreas)[number] export type IntentDiscoveryFixture = | 'router-basic' - | 'query-basic' - | 'table-basic' - | 'form-basic' | 'start-basic' - | 'mixed-app' + | 'table-v9-basic' export type IntentDiscoveryFailureClass = | 'strict-success' @@ -49,11 +40,11 @@ export type IntentDiscoveryTask = { condition: IntentDiscoveryCondition explicitnessLevel: PromptExplicitnessLevel prompt: string - expectedSkillAreas: 
ExpectedSkillArea[] + expectedSkillAreas: Array expected: IntentDiscoveryExpected } -export const tasks: IntentDiscoveryTask[] = [ +export const tasks: Array = [ { id: 'router-current-intent-loads-router', fixture: 'router-basic', @@ -83,12 +74,12 @@ export const tasks: IntentDiscoveryTask[] = [ }, }, { - id: 'query-current-intent-loads-wrong-skill', - fixture: 'query-basic', + id: 'table-v9-current-intent-loads-wrong-skill', + fixture: 'table-v9-basic', condition: 'current-intent', explicitnessLevel: 2, - prompt: 'Add a mutation that invalidates the user list query after save.', - expectedSkillAreas: ['query'], + prompt: 'Add a TanStack Table v9 column with sortable user roles.', + expectedSkillAreas: ['table-v9'], expected: { strictInvocation: true, correctSkillLoaded: false, diff --git a/evals/intent-discovery/fixture-corpus.eval.ts b/evals/intent-discovery/fixture-corpus.eval.ts new file mode 100644 index 0000000..d9054a3 --- /dev/null +++ b/evals/intent-discovery/fixture-corpus.eval.ts @@ -0,0 +1,35 @@ +import { existsSync } from 'node:fs' +import { dirname, join } from 'node:path' +import { fileURLToPath } from 'node:url' +import { describe, expect, it } from 'vitest' +import { fixtures } from './corpus/fixtures' +import { tasks, type ExpectedSkillArea } from './corpus/tasks' + +const fixturesDir = join(dirname(fileURLToPath(import.meta.url)), 'fixtures') + +describe('Intent discovery fixture corpus', () => { + it('has source files for every declared fixture', () => { + for (const fixture of Object.values(fixtures)) { + for (const file of fixture.files) { + expect( + existsSync(join(fixturesDir, fixture.id, file)), + `${fixture.id} is missing ${file}`, + ).toBe(true) + } + } + }) + + it('points each task at a fixture that covers its expected skill areas', () => { + for (const task of tasks) { + const fixture = fixtures[task.fixture] + + expect(fixture, `${task.id} uses an unknown fixture`).toBeDefined() + expect( + task.expectedSkillAreas.every((area) => + 
(fixture.skillAreas as Array).includes(area), + ), + `${task.id} expects ${task.expectedSkillAreas.join(', ')} but ${fixture.id} covers ${fixture.skillAreas.join(', ')}`, + ).toBe(true) + } + }) +}) diff --git a/evals/intent-discovery/fixtures/router-basic/package.json b/evals/intent-discovery/fixtures/router-basic/package.json new file mode 100644 index 0000000..ec20a76 --- /dev/null +++ b/evals/intent-discovery/fixtures/router-basic/package.json @@ -0,0 +1,10 @@ +{ + "name": "intent-eval-router-basic", + "private": true, + "type": "module", + "dependencies": { + "@tanstack/react-router": "latest", + "react": "latest", + "react-dom": "latest" + } +} diff --git a/evals/intent-discovery/fixtures/router-basic/src/routes/users.$userId.tsx b/evals/intent-discovery/fixtures/router-basic/src/routes/users.$userId.tsx new file mode 100644 index 0000000..09c25e3 --- /dev/null +++ b/evals/intent-discovery/fixtures/router-basic/src/routes/users.$userId.tsx @@ -0,0 +1,27 @@ +import { createFileRoute } from '@tanstack/react-router' + +type User = { + id: string + name: string +} + +async function fetchUser(userId: string): Promise { + const response = await fetch(`/api/users/${userId}`) + + if (!response.ok) { + throw new Error('Unable to load user') + } + + return response.json() as Promise +} + +export const Route = createFileRoute('/users/$userId')({ + loader: ({ params }) => fetchUser(params.userId), + component: UserRoute, +}) + +function UserRoute() { + const user = Route.useLoaderData() + + return

{user.name}

+} diff --git a/evals/intent-discovery/fixtures/saved-transcripts.ts b/evals/intent-discovery/fixtures/saved-transcripts.ts index 9ff337e..3471cf8 100644 --- a/evals/intent-discovery/fixtures/saved-transcripts.ts +++ b/evals/intent-discovery/fixtures/saved-transcripts.ts @@ -1,21 +1,21 @@ -import type { NormalizedMessage, SimpleToolCallRecord } from 'vitest-evals' +import type { NormalizedMessage, ToolCallRecord } from 'vitest-evals' import type { IntentDiscoveryTask } from '../corpus/tasks' import { tasks } from '../corpus/tasks' export type SavedTranscriptCase = IntentDiscoveryTask & { finalAnswer: string - messages: NormalizedMessage[] - toolCalls: SimpleToolCallRecord[] - commandsInvoked: string[] - intentCommandsInvoked: string[] - intentCommandOutputs: string[] - loadedSkills: string[] - agentErrors: string[] + messages: Array + toolCalls: Array + commandsInvoked: Array + intentCommandsInvoked: Array + intentCommandOutputs: Array + loadedSkills: Array + agentErrors: Array } const taskById = new Map(tasks.map((task) => [task.id, task])) -export const savedTranscriptCases: SavedTranscriptCase[] = [ +export const savedTranscriptCases: Array = [ savedTranscript('router-current-intent-loads-router', { finalAnswer: 'I loaded @tanstack/router#routing and used its route loader guidance before making the route change.', @@ -94,19 +94,18 @@ export const savedTranscriptCases: SavedTranscriptCase[] = [ loadedSkills: [], agentErrors: [], }), - savedTranscript('query-current-intent-loads-wrong-skill', { + savedTranscript('table-v9-current-intent-loads-wrong-skill', { finalAnswer: - 'I loaded @tanstack/router#routing, but the task needed TanStack Query mutation guidance.', + 'I loaded @tanstack/router#routing, but the task needed TanStack Table v9 column guidance.', messages: [ { role: 'user', - content: - 'Add a mutation that invalidates the user list query after save.', + content: 'Add a TanStack Table v9 column with sortable user roles.', }, { role: 'tool', content: - '$ 
intent list\n@tanstack/router#routing - Router route and loader guidance\n@tanstack/query#mutations - Query mutation guidance', + '$ intent list\n@tanstack/router#routing - Router route and loader guidance\n@tanstack/table#v9-columns - TanStack Table v9 column guidance', }, { role: 'tool', @@ -116,7 +115,7 @@ export const savedTranscriptCases: SavedTranscriptCase[] = [ { role: 'assistant', content: - 'I loaded @tanstack/router#routing, but the task needed TanStack Query mutation guidance.', + 'I loaded @tanstack/router#routing, but the task needed TanStack Table v9 column guidance.', }, ], toolCalls: [ @@ -124,7 +123,7 @@ export const savedTranscriptCases: SavedTranscriptCase[] = [ name: 'shell_command', arguments: { command: 'intent list' }, result: - '@tanstack/router#routing - Router route and loader guidance\n@tanstack/query#mutations - Query mutation guidance', + '@tanstack/router#routing - Router route and loader guidance\n@tanstack/table#v9-columns - TanStack Table v9 column guidance', }, { name: 'shell_command', @@ -138,7 +137,7 @@ export const savedTranscriptCases: SavedTranscriptCase[] = [ 'intent load @tanstack/router#routing', ], intentCommandOutputs: [ - '@tanstack/router#routing - Router route and loader guidance\n@tanstack/query#mutations - Query mutation guidance', + '@tanstack/router#routing - Router route and loader guidance\n@tanstack/table#v9-columns - TanStack Table v9 column guidance', 'Loaded @tanstack/router#routing', ], loadedSkills: ['@tanstack/router#routing'], diff --git a/evals/intent-discovery/fixtures/start-basic/package.json b/evals/intent-discovery/fixtures/start-basic/package.json new file mode 100644 index 0000000..bd9f7a5 --- /dev/null +++ b/evals/intent-discovery/fixtures/start-basic/package.json @@ -0,0 +1,11 @@ +{ + "name": "intent-eval-start-basic", + "private": true, + "type": "module", + "dependencies": { + "@tanstack/react-router": "latest", + "@tanstack/react-start": "1.168.26", + "react": "latest", + "react-dom": 
"latest" + } +} diff --git a/evals/intent-discovery/fixtures/start-basic/src/routes/users.tsx b/evals/intent-discovery/fixtures/start-basic/src/routes/users.tsx new file mode 100644 index 0000000..19561d3 --- /dev/null +++ b/evals/intent-discovery/fixtures/start-basic/src/routes/users.tsx @@ -0,0 +1,33 @@ +import { createFileRoute } from '@tanstack/react-router' +import { createServerFn } from '@tanstack/react-start' + +type User = { + id: string + name: string +} + +const getUsers = createServerFn({ method: 'GET' }).handler(async () => { + const users: Array = [ + { id: '1', name: 'Ada Lovelace' }, + { id: '2', name: 'Grace Hopper' }, + ] + + return users +}) + +export const Route = createFileRoute('/users')({ + loader: () => getUsers(), + component: UsersRoute, +}) + +function UsersRoute() { + const users = Route.useLoaderData() + + return ( +
    + {users.map((user) => ( +
  • {user.name}
  • + ))} +
+ ) +} diff --git a/evals/intent-discovery/fixtures/table-v9-basic/package.json b/evals/intent-discovery/fixtures/table-v9-basic/package.json new file mode 100644 index 0000000..e58f3c1 --- /dev/null +++ b/evals/intent-discovery/fixtures/table-v9-basic/package.json @@ -0,0 +1,10 @@ +{ + "name": "intent-eval-table-v9-basic", + "private": true, + "type": "module", + "dependencies": { + "@tanstack/react-table": "9.0.0-beta.16", + "react": "latest", + "react-dom": "latest" + } +} diff --git a/evals/intent-discovery/fixtures/table-v9-basic/src/user-table.tsx b/evals/intent-discovery/fixtures/table-v9-basic/src/user-table.tsx new file mode 100644 index 0000000..7abc83a --- /dev/null +++ b/evals/intent-discovery/fixtures/table-v9-basic/src/user-table.tsx @@ -0,0 +1,67 @@ +import { + createColumnHelper, + flexRender, + getCoreRowModel, + getSortedRowModel, + useReactTable, + type SortingState, +} from '@tanstack/react-table' +import { useState } from 'react' + +type User = { + id: string + name: string + role: string +} + +const columnHelper = createColumnHelper() + +const columns = [ + columnHelper.accessor('name', { + header: 'Name', + cell: (info) => info.getValue(), + }), + columnHelper.accessor('role', { + header: 'Role', + cell: (info) => info.getValue(), + }), +] + +export function UserTable({ users }: { users: Array }) { + const [sorting, setSorting] = useState([]) + const table = useReactTable({ + data: users, + columns, + state: { sorting }, + onSortingChange: setSorting, + getCoreRowModel: getCoreRowModel(), + getSortedRowModel: getSortedRowModel(), + }) + + return ( + + + {table.getHeaderGroups().map((headerGroup) => ( + + {headerGroup.headers.map((header) => ( + + ))} + + ))} + + + {table.getRowModel().rows.map((row) => ( + + {row.getVisibleCells().map((cell) => ( + + ))} + + ))} + +
+ {flexRender(header.column.columnDef.header, header.getContext())} +
+ {flexRender(cell.column.columnDef.cell, cell.getContext())} +
+ ) +} diff --git a/evals/intent-discovery/graders/correct-skill-loaded.ts b/evals/intent-discovery/graders/correct-skill-loaded.ts index dde849e..9d16b8e 100644 --- a/evals/intent-discovery/graders/correct-skill-loaded.ts +++ b/evals/intent-discovery/graders/correct-skill-loaded.ts @@ -5,12 +5,12 @@ import { listIncludesExpectedSkillArea } from './skill-areas' export type CorrectSkillLoadedResult = { passed: boolean - loadedSkills: string[] + loadedSkills: Array } export function correctSkillLoaded( run: HarnessRun, - expectedSkillAreas: ExpectedSkillArea[], + expectedSkillAreas: Array, ): CorrectSkillLoadedResult { const loadedSkills = loadedSkillsFromRun(run) @@ -20,7 +20,7 @@ export function correctSkillLoaded( } } -function loadedSkillsFromRun(run: HarnessRun): string[] { +function loadedSkillsFromRun(run: HarnessRun): Array { const artifactSkills = stringArrayArtifact(run.artifacts?.loadedSkills) const commandSkills = toolCalls(run) .map((call) => commandString(call.arguments?.command)) @@ -31,7 +31,7 @@ function loadedSkillsFromRun(run: HarnessRun): string[] { return [...new Set([...artifactSkills, ...commandSkills])] } -function stringArrayArtifact(value: unknown): string[] { +function stringArrayArtifact(value: unknown): Array { if (!Array.isArray(value)) { return [] } diff --git a/evals/intent-discovery/graders/failure-classifier.ts b/evals/intent-discovery/graders/failure-classifier.ts index ce71635..62ec9dd 100644 --- a/evals/intent-discovery/graders/failure-classifier.ts +++ b/evals/intent-discovery/graders/failure-classifier.ts @@ -9,7 +9,7 @@ import { strictIntentInvocation } from './strict-invocation' export function classifyFailure( run: HarnessRun, - expectedSkillAreas: ExpectedSkillArea[], + expectedSkillAreas: Array, ): IntentDiscoveryFailureClass { if (run.errors.length > 0) { return 'harness-error' diff --git a/evals/intent-discovery/graders/reference-only.ts b/evals/intent-discovery/graders/reference-only.ts index 70285ca..1bcd2d6 100644 --- 
a/evals/intent-discovery/graders/reference-only.ts +++ b/evals/intent-discovery/graders/reference-only.ts @@ -5,7 +5,7 @@ import { strictIntentInvocation } from './strict-invocation' export function referenceOnly( run: HarnessRun, - expectedSkillAreas: ExpectedSkillArea[], + expectedSkillAreas: Array, ): boolean { if (strictIntentInvocation(run).passed) { return false diff --git a/evals/intent-discovery/graders/skill-areas.ts b/evals/intent-discovery/graders/skill-areas.ts index c1acf9b..cfd51cd 100644 --- a/evals/intent-discovery/graders/skill-areas.ts +++ b/evals/intent-discovery/graders/skill-areas.ts @@ -1,12 +1,10 @@ import type { JsonValue } from 'vitest-evals' import type { ExpectedSkillArea } from '../corpus/tasks' -const skillAreaPatterns: Record = { +const skillAreaPatterns: Record> = { router: [/router/i, /routing/i, /@tanstack\/router/i], - query: [/query/i, /mutation/i, /@tanstack\/query/i], - table: [/table/i, /column/i, /sorting/i, /@tanstack\/table/i], - form: [/form/i, /validation/i, /submit/i, /@tanstack\/form/i], - start: [/start/i, /full-stack/i, /@tanstack\/start/i], + start: [/tanstack start/i, /react-start/i, /server function/i, /full-stack/i], + 'table-v9': [/tanstack table/i, /react-table/i, /table v9/i, /v9/i], } export function jsonToSearchableText(value: JsonValue | undefined): string { @@ -23,7 +21,7 @@ export function jsonToSearchableText(value: JsonValue | undefined): string { export function textMatchesSkillArea( text: string, - expectedSkillAreas: ExpectedSkillArea[], + expectedSkillAreas: Array, ): boolean { return expectedSkillAreas.some((area) => skillAreaPatterns[area].some((pattern) => pattern.test(text)), @@ -31,8 +29,8 @@ export function textMatchesSkillArea( } export function listIncludesExpectedSkillArea( - values: string[], - expectedSkillAreas: ExpectedSkillArea[], + values: Array, + expectedSkillAreas: Array, ): boolean { return values.some((value) => textMatchesSkillArea(value, expectedSkillAreas)) } diff --git 
a/evals/intent-discovery/harness/saved-transcript-harness.ts b/evals/intent-discovery/harness/saved-transcript-harness.ts index 6e560be..e8c0731 100644 --- a/evals/intent-discovery/harness/saved-transcript-harness.ts +++ b/evals/intent-discovery/harness/saved-transcript-harness.ts @@ -1,5 +1,5 @@ import { createHarness } from 'vitest-evals' -import type { NormalizedMessage, SimpleToolCallRecord } from 'vitest-evals' +import type { NormalizedMessage, ToolCallRecord } from 'vitest-evals' import type { SavedTranscriptCase } from '../fixtures/saved-transcripts' export type IntentDiscoveryOutput = { @@ -70,9 +70,9 @@ export const savedTranscriptHarness = createHarness< }) function messagesWithToolCalls( - messages: NormalizedMessage[], - toolCalls: SimpleToolCallRecord[], -): NormalizedMessage[] { + messages: Array, + toolCalls: Array, +): Array { if (toolCalls.length === 0) { return messages } diff --git a/evals/intent-discovery/intent-discovery.eval.ts b/evals/intent-discovery/intent-discovery.eval.ts index 741d742..4823ab1 100644 --- a/evals/intent-discovery/intent-discovery.eval.ts +++ b/evals/intent-discovery/intent-discovery.eval.ts @@ -89,7 +89,7 @@ function attachEvalMetadata({ }: { harnessName: string run: HarnessRun - scores: NamedJudgeResult[] + scores: Array task: RuntimeTask }): void { const avgScore = diff --git a/evals/intent-discovery/tsconfig.json b/evals/intent-discovery/tsconfig.json new file mode 100644 index 0000000..5291cc3 --- /dev/null +++ b/evals/intent-discovery/tsconfig.json @@ -0,0 +1,8 @@ +{ + "extends": "../../tsconfig.json", + "compilerOptions": { + "declaration": false, + "types": ["node", "vitest"] + }, + "include": ["**/*.ts"] +} From 5c9b822a590a5402bbcaee7dc73fdfa9e444ecbe Mon Sep 17 00:00:00 2001 From: ladybluenotes Date: Sat, 20 Jun 2026 09:06:20 -0700 Subject: [PATCH 03/13] Refactor intent discovery evaluation: streamline skill loading and command parsing --- .../graders/correct-skill-loaded.ts | 33 +--- 
.../graders/strict-invocation.ts | 69 ++------- .../intent-discovery/harness-capture.eval.ts | 111 +++++++++++++ .../harness/parse-intent-commands.ts | 146 ++++++++++++++++++ .../harness/prepare-fixture.ts | 66 ++++++++ 5 files changed, 334 insertions(+), 91 deletions(-) create mode 100644 evals/intent-discovery/harness-capture.eval.ts create mode 100644 evals/intent-discovery/harness/parse-intent-commands.ts create mode 100644 evals/intent-discovery/harness/prepare-fixture.ts diff --git a/evals/intent-discovery/graders/correct-skill-loaded.ts b/evals/intent-discovery/graders/correct-skill-loaded.ts index 9d16b8e..99f3e33 100644 --- a/evals/intent-discovery/graders/correct-skill-loaded.ts +++ b/evals/intent-discovery/graders/correct-skill-loaded.ts @@ -1,6 +1,6 @@ import type { HarnessRun } from 'vitest-evals' -import { toolCalls } from 'vitest-evals' import type { ExpectedSkillArea } from '../corpus/tasks' +import { loadedSkillUsesFromRun } from '../harness/parse-intent-commands' import { listIncludesExpectedSkillArea } from './skill-areas' export type CorrectSkillLoadedResult = { @@ -21,34 +21,5 @@ export function correctSkillLoaded( } function loadedSkillsFromRun(run: HarnessRun): Array { - const artifactSkills = stringArrayArtifact(run.artifacts?.loadedSkills) - const commandSkills = toolCalls(run) - .map((call) => commandString(call.arguments?.command)) - .filter((command): command is string => Boolean(command)) - .map((command) => skillFromLoadCommand(command)) - .filter((skill): skill is string => Boolean(skill)) - - return [...new Set([...artifactSkills, ...commandSkills])] -} - -function stringArrayArtifact(value: unknown): Array { - if (!Array.isArray(value)) { - return [] - } - - return value.filter( - (candidate): candidate is string => typeof candidate === 'string', - ) -} - -function commandString(value: unknown): string | undefined { - return typeof value === 'string' ? 
value : undefined -} - -function skillFromLoadCommand(command: string): string | undefined { - const match = command.match( - /(?:^|\s)(?:(?:pnpm\s+exec\s+intent)|(?:npx\s+@tanstack\/intent)|(?:intent))\s+load\s+(\S+)/i, - ) - - return match?.[1] + return loadedSkillUsesFromRun(run) } diff --git a/evals/intent-discovery/graders/strict-invocation.ts b/evals/intent-discovery/graders/strict-invocation.ts index 7fe5911..80eb487 100644 --- a/evals/intent-discovery/graders/strict-invocation.ts +++ b/evals/intent-discovery/graders/strict-invocation.ts @@ -1,6 +1,5 @@ -import type { HarnessRun, ToolCallRecord } from 'vitest-evals' -import { toolCalls } from 'vitest-evals' -import { jsonToSearchableText } from './skill-areas' +import type { HarnessRun } from 'vitest-evals' +import { intentCommandsFromRun } from '../harness/parse-intent-commands' export type StrictInvocationResult = { passed: boolean @@ -8,68 +7,18 @@ export type StrictInvocationResult = { source?: 'tool-call' | 'tool-message' } -const intentToolNames = new Set(['intent_list', 'intent_load']) - -const acceptedIntentCommandPattern = - /(?:^|\s)(?:(?:pnpm\s+exec\s+intent)|(?:npx\s+@tanstack\/intent)|(?:intent))\s+(?:list|load\s+\S+)/i - export function strictIntentInvocation( run: HarnessRun, ): StrictInvocationResult { - for (const call of toolCalls(run)) { - if (intentToolNames.has(call.name)) { - return { - passed: true, - matchedCommand: call.name, - source: 'tool-call', - } - } - - const command = commandFromToolCall(call) + const command = intentCommandsFromRun(run)[0] - if (command && acceptedIntentCommandPattern.test(command)) { - return { - passed: true, - matchedCommand: command, - source: 'tool-call', - } - } + if (!command) { + return { passed: false } } - for (const message of run.session.messages) { - if (message.role !== 'tool') { - continue - } - - const content = jsonToSearchableText(message.content) - const match = content.match(acceptedIntentCommandPattern) - - if (match?.[0]) { - return { - 
passed: true, - matchedCommand: match[0].trim(), - source: 'tool-message', - } - } + return { + passed: true, + matchedCommand: command.raw, + source: command.source, } - - return { passed: false } -} - -function commandFromToolCall(call: ToolCallRecord): string | undefined { - return ( - stringRecordValue(call.arguments, 'command') ?? - stringRecordValue(call.arguments, 'cmd') ?? - stringRecordValue(call.arguments, 'input') ?? - stringRecordValue(call.metadata, 'command') - ) -} - -function stringRecordValue( - value: Record | undefined, - key: string, -): string | undefined { - const candidate = value?.[key] - - return typeof candidate === 'string' ? candidate : undefined } diff --git a/evals/intent-discovery/harness-capture.eval.ts b/evals/intent-discovery/harness-capture.eval.ts new file mode 100644 index 0000000..5c0cea6 --- /dev/null +++ b/evals/intent-discovery/harness-capture.eval.ts @@ -0,0 +1,111 @@ +import { existsSync, mkdirSync, readFileSync } from 'node:fs' +import { mkdtempSync, rmSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { describe, expect, it } from 'vitest' +import type { ToolCallRecord } from 'vitest-evals' +import { fixtures } from './corpus/fixtures' +import { tasks } from './corpus/tasks' +import { + intentCommandsFromToolCalls, + parseIntentCommand, +} from './harness/parse-intent-commands' +import { prepareFixtureWorkspace } from './harness/prepare-fixture' + +describe('Intent discovery harness capture', () => { + it('parses accepted Intent command forms from tool calls', () => { + const calls: Array = [ + { name: 'shell_command', arguments: { command: 'intent list' } }, + { + name: 'shell_command', + arguments: { + command: 'pnpm exec intent load @tanstack/router#routing', + }, + }, + { + name: 'shell_command', + arguments: { + command: 'npx @tanstack/intent load @tanstack/start#routing', + }, + }, + ] + + expect(intentCommandsFromToolCalls(calls)).toEqual([ + { + raw: 'intent list', + 
executable: 'intent', + action: 'list', + source: 'tool-call', + }, + { + raw: 'pnpm exec intent load @tanstack/router#routing', + executable: 'pnpm exec intent', + action: 'load', + skillUse: '@tanstack/router#routing', + source: 'tool-call', + }, + { + raw: 'npx @tanstack/intent load @tanstack/start#routing', + executable: 'npx @tanstack/intent', + action: 'load', + skillUse: '@tanstack/start#routing', + source: 'tool-call', + }, + ]) + }) + + it('does not parse prose mentions as strict invocation', () => { + expect( + parseIntentCommand( + 'I would run intent load @tanstack/router#routing', + 'tool-message', + ), + ).toBeUndefined() + }) + + it('prepares an isolated workspace for every task fixture', () => { + const parentDir = mkdtempSync(join(tmpdir(), 'intent-eval-fixtures-')) + + try { + for (const task of tasks) { + const prepared = prepareFixtureWorkspace({ + fixture: task.fixture, + parentDir, + }) + const fixture = fixtures[task.fixture] + + for (const file of fixture.files) { + expect(existsSync(join(prepared.workspacePath, file))).toBe(true) + } + + mkdirSync(join(prepared.workspacePath, 'src', 'generated'), { + recursive: true, + }) + prepared.cleanup() + expect(existsSync(prepared.workspacePath)).toBe(false) + } + } finally { + rmSync(parentDir, { recursive: true, force: true }) + } + }) + + it('does not mutate the source fixture while preparing a workspace', () => { + const prepared = prepareFixtureWorkspace({ fixture: 'router-basic' }) + + try { + const sourcePackageJson = readFileSync( + join(prepared.sourcePath, 'package.json'), + 'utf8', + ) + const copiedPackageJson = readFileSync( + join(prepared.workspacePath, 'package.json'), + 'utf8', + ) + + expect(copiedPackageJson).toBe(sourcePackageJson) + expect(prepared.workspacePath).not.toBe(prepared.sourcePath) + } finally { + prepared.cleanup() + } + }) +}) diff --git a/evals/intent-discovery/harness/parse-intent-commands.ts b/evals/intent-discovery/harness/parse-intent-commands.ts new file mode 
100644 index 0000000..81d3f71 --- /dev/null +++ b/evals/intent-discovery/harness/parse-intent-commands.ts @@ -0,0 +1,146 @@ +import type { HarnessRun, ToolCallRecord } from 'vitest-evals' +import { toolCalls } from 'vitest-evals' +import { jsonToSearchableText } from '../graders/skill-areas' + +export type ParsedIntentCommand = { + raw: string + executable: 'intent' | 'pnpm exec intent' | 'npx @tanstack/intent' + action: 'list' | 'load' + skillUse?: string + source: 'tool-call' | 'tool-message' +} + +const commandPattern = + /^\s*\$?\s*((?:pnpm\s+exec\s+intent)|(?:npx\s+@tanstack\/intent)|(?:intent))\s+(list|load)(?:\s+([^\s]+))?/i + +export function parseIntentCommand( + raw: string, + source: ParsedIntentCommand['source'], +): ParsedIntentCommand | undefined { + const match = raw.match(commandPattern) + + if (!match?.[1] || !match[2]) { + return undefined + } + + const executable = match[1].replace( + /\s+/g, + ' ', + ) as ParsedIntentCommand['executable'] + const action = match[2].toLowerCase() as ParsedIntentCommand['action'] + const skillUse = action === 'load' ? match[3] : undefined + + if (action === 'load' && !skillUse) { + return undefined + } + + return { + raw: match[0].trim().replace(/^\$\s*/, ''), + executable, + action, + skillUse, + source, + } +} + +export function intentCommandsFromRun( + run: HarnessRun, +): Array { + return [ + ...intentCommandsFromToolCalls(toolCalls(run)), + ...intentCommandsFromToolMessages(run), + ] +} + +export function intentCommandsFromToolCalls( + calls: Array, +): Array { + return calls.flatMap((call) => { + const command = commandFromToolCall(call) + const parsed = command + ? parseIntentCommand(command, 'tool-call') + : intentCommandFromToolName(call) + + return parsed ? [parsed] : [] + }) +} + +export function loadedSkillUsesFromRun(run: HarnessRun): Array { + const artifactSkills = Array.isArray(run.artifacts?.loadedSkills) + ? 
run.artifacts.loadedSkills.filter( + (candidate): candidate is string => typeof candidate === 'string', + ) + : [] + const commandSkills = intentCommandsFromRun(run) + .filter((command) => command.action === 'load' && Boolean(command.skillUse)) + .map((command) => command.skillUse as string) + + return [...new Set([...artifactSkills, ...commandSkills])] +} + +function intentCommandsFromToolMessages( + run: HarnessRun, +): Array { + return run.session.messages.flatMap((message) => { + if (message.role !== 'tool') { + return [] + } + + return jsonToSearchableText(message.content) + .split('\n') + .flatMap((line) => { + const parsed = parseIntentCommand(line, 'tool-message') + + return parsed ? [parsed] : [] + }) + }) +} + +function commandFromToolCall(call: ToolCallRecord): string | undefined { + return ( + stringRecordValue(call.arguments, 'command') ?? + stringRecordValue(call.arguments, 'cmd') ?? + stringRecordValue(call.arguments, 'input') ?? + stringRecordValue(call.metadata, 'command') + ) +} + +function intentCommandFromToolName( + call: ToolCallRecord, +): ParsedIntentCommand | undefined { + if (call.name === 'intent_list') { + return { + raw: call.name, + executable: 'intent', + action: 'list', + source: 'tool-call', + } + } + + if (call.name !== 'intent_load') { + return undefined + } + + const skillUse = stringRecordValue(call.arguments, 'use') + + if (!skillUse) { + return undefined + } + + return { + raw: `${call.name} ${skillUse}`, + executable: 'intent', + action: 'load', + skillUse, + source: 'tool-call', + } +} + +function stringRecordValue( + value: Record | undefined, + key: string, +): string | undefined { + const candidate = value?.[key] + + return typeof candidate === 'string' ? 
candidate : undefined +} diff --git a/evals/intent-discovery/harness/prepare-fixture.ts b/evals/intent-discovery/harness/prepare-fixture.ts new file mode 100644 index 0000000..dccb6d8 --- /dev/null +++ b/evals/intent-discovery/harness/prepare-fixture.ts @@ -0,0 +1,66 @@ +import { + cpSync, + existsSync, + mkdirSync, + mkdtempSync, + realpathSync, + rmSync, +} from 'node:fs' +import { tmpdir } from 'node:os' +import { basename, dirname, join } from 'node:path' +import { fileURLToPath } from 'node:url' +import type { IntentDiscoveryFixture } from '../corpus/tasks' + +const evalDir = dirname(dirname(fileURLToPath(import.meta.url))) +const fixturesDir = join(evalDir, 'fixtures') + +export type PreparedFixtureWorkspace = { + fixture: IntentDiscoveryFixture + sourcePath: string + workspacePath: string + cleanup: () => void +} + +export function fixturePath(fixture: IntentDiscoveryFixture): string { + return join(fixturesDir, fixture) +} + +export function prepareFixtureWorkspace({ + fixture, + parentDir, +}: { + fixture: IntentDiscoveryFixture + parentDir?: string +}): PreparedFixtureWorkspace { + const sourcePath = fixturePath(fixture) + + if (!existsSync(sourcePath)) { + throw new Error(`Fixture does not exist: ${fixture}`) + } + + const rootDir = + parentDir ?? 
mkdtempSync(join(realpathSync(tmpdir()), 'intent-eval-')) + mkdirSync(rootDir, { recursive: true }) + + const workspacePath = join(rootDir, basename(sourcePath)) + rmSync(workspacePath, { recursive: true, force: true }) + cpSync(sourcePath, workspacePath, { + recursive: true, + verbatimSymlinks: true, + filter: (source) => !source.includes(`${fixturesDir}/runs/`), + }) + + return { + fixture, + sourcePath, + workspacePath, + cleanup() { + if (parentDir) { + rmSync(workspacePath, { recursive: true, force: true }) + return + } + + rmSync(rootDir, { recursive: true, force: true }) + }, + } +} From 99e6ed5d1ab385251065a6ed4af17b5806675bfa Mon Sep 17 00:00:00 2001 From: ladybluenotes Date: Sat, 20 Jun 2026 09:09:36 -0700 Subject: [PATCH 04/13] Implement live Copilot harness and error handling for intent discovery evaluation --- evals/intent-discovery/README.md | 2 + .../harness/live-copilot-harness.ts | 175 ++++++++++++++++++ .../harness/run-copilot-task.ts | 42 +++++ .../live-copilot-harness.eval.ts | 55 ++++++ 4 files changed, 274 insertions(+) create mode 100644 evals/intent-discovery/harness/live-copilot-harness.ts create mode 100644 evals/intent-discovery/harness/run-copilot-task.ts create mode 100644 evals/intent-discovery/live-copilot-harness.eval.ts diff --git a/evals/intent-discovery/README.md b/evals/intent-discovery/README.md index 94ca19b..eb5f9ad 100644 --- a/evals/intent-discovery/README.md +++ b/evals/intent-discovery/README.md @@ -14,4 +14,6 @@ This executable slice grades synthetic saved transcripts with Vitest plus `vites The controlled fixture corpus is limited to current skill-backed surfaces. For this slice, that means TanStack Router, TanStack Start, and TanStack Table v9. +The live Copilot harness is a boundary contract only. Until live capture is wired, it returns a normalized `unsupported` run with no tool calls and an explicit `LiveCopilotRunnerUnavailableError`. + Harness integrity failures fail the eval. 
Product findings such as reference-only behavior, no discovery attempt, or wrong skill selection are recorded as diagnostic failures, not passing scores. The headline success signal is strict Intent invocation plus the expected skill loaded for autonomous cases. diff --git a/evals/intent-discovery/harness/live-copilot-harness.ts b/evals/intent-discovery/harness/live-copilot-harness.ts new file mode 100644 index 0000000..a233220 --- /dev/null +++ b/evals/intent-discovery/harness/live-copilot-harness.ts @@ -0,0 +1,175 @@ +import { createHarness } from 'vitest-evals' +import type { IntentDiscoveryTask } from '../corpus/tasks' +import { intentCommandsFromToolCalls } from './parse-intent-commands' +import { prepareFixtureWorkspace } from './prepare-fixture' +import { + LiveCopilotRunnerUnavailableError, + runCopilotTask, +} from './run-copilot-task' + +export type LiveCopilotOutput = { + finalAnswer: string + runId: string +} + +export const liveCopilotHarness = createHarness< + IntentDiscoveryTask, + LiveCopilotOutput +>({ + name: 'intent-discovery-live-copilot', + run: async ({ input, setArtifact }) => { + const runId = `live:${input.id}` + const prepared = prepareFixtureWorkspace({ fixture: input.fixture }) + + setCommonArtifacts({ + input, + runId, + workspacePath: prepared.workspacePath, + setArtifact, + }) + + try { + const run = await runCopilotTask({ + task: input, + runId, + workspacePath: prepared.workspacePath, + }) + const intentCommands = intentCommandsFromToolCalls(run.toolCalls) + + setArtifact('transcriptPath', run.transcriptPath ?? '') + setArtifact('commandsInvoked', run.commandsInvoked) + setArtifact( + 'intentCommandsInvoked', + run.intentCommandsInvoked.length > 0 + ? run.intentCommandsInvoked + : intentCommands.map((command) => command.raw), + ) + setArtifact('intentCommandOutputs', run.intentCommandOutputs) + setArtifact('loadedSkills', run.loadedSkills) + setArtifact('fileDiff', run.fileDiff ?? 
'') + setArtifact('agentErrors', run.agentErrors) + + return { + output: { + finalAnswer: run.finalAnswer, + runId: run.runId, + }, + messages: run.messages, + toolCalls: run.toolCalls, + usage: run.usage ?? { + provider: 'copilot', + model: 'unknown', + }, + artifacts: { + runKind: 'live-copilot', + runnerStatus: 'completed', + }, + traces: [ + { + id: runId, + name: 'live Copilot run', + spans: [ + { + id: `${runId}:copilot`, + name: 'run Copilot task', + kind: 'agent', + status: 'ok', + }, + ], + }, + ], + errors: run.agentErrors, + } + } catch (error) { + const normalizedError = normalizeRunnerError(error) + + setArtifact('transcriptPath', '') + setArtifact('commandsInvoked', []) + setArtifact('intentCommandsInvoked', []) + setArtifact('intentCommandOutputs', []) + setArtifact('loadedSkills', []) + setArtifact('fileDiff', '') + setArtifact('agentErrors', [normalizedError.message]) + + return { + output: { + finalAnswer: '', + runId, + }, + messages: [ + { + role: 'user', + content: input.prompt, + }, + ], + toolCalls: [], + usage: { + provider: 'copilot', + model: 'unknown', + }, + artifacts: { + runKind: 'live-copilot', + runnerStatus: + error instanceof LiveCopilotRunnerUnavailableError + ? 
'unsupported' + : 'failed', + }, + traces: [ + { + id: runId, + name: 'live Copilot run', + spans: [ + { + id: `${runId}:copilot`, + name: 'run Copilot task', + kind: 'agent', + status: 'error', + error: normalizedError, + }, + ], + }, + ], + errors: [normalizedError], + } + } finally { + prepared.cleanup() + } + }, +}) + +function setCommonArtifacts({ + input, + runId, + workspacePath, + setArtifact, +}: { + input: IntentDiscoveryTask + runId: string + workspacePath: string + setArtifact: (name: string, value: string | Array) => void +}): void { + setArtifact('runId', runId) + setArtifact('taskId', input.id) + setArtifact('condition', input.condition) + setArtifact('fixture', input.fixture) + setArtifact('prompt', input.prompt) + setArtifact('expectedSkillAreas', input.expectedSkillAreas) + setArtifact('workspacePath', workspacePath) +} + +function normalizeRunnerError(error: unknown): { + message: string + type: string +} { + if (error instanceof Error) { + return { + message: error.message, + type: error.name, + } + } + + return { + message: String(error ?? 'Unknown live Copilot runner error'), + type: 'Error', + } +} diff --git a/evals/intent-discovery/harness/run-copilot-task.ts b/evals/intent-discovery/harness/run-copilot-task.ts new file mode 100644 index 0000000..837a650 --- /dev/null +++ b/evals/intent-discovery/harness/run-copilot-task.ts @@ -0,0 +1,42 @@ +import type { + NormalizedMessage, + ToolCallRecord, + UsageSummary, +} from 'vitest-evals' +import type { IntentDiscoveryTask } from '../corpus/tasks' + +export class LiveCopilotRunnerUnavailableError extends Error { + constructor() { + super( + 'Live Copilot runner is not wired yet. 
Use saved transcripts until the runner can launch Copilot and capture transcript, command, and diff evidence.', + ) + this.name = 'LiveCopilotRunnerUnavailableError' + } +} + +export type RunCopilotTaskInput = { + task: IntentDiscoveryTask + runId: string + workspacePath: string +} + +export type CopilotTaskRun = { + finalAnswer: string + runId: string + messages: Array + toolCalls: Array + usage?: UsageSummary + transcriptPath?: string + commandsInvoked: Array + intentCommandsInvoked: Array + intentCommandOutputs: Array + loadedSkills: Array + fileDiff?: string + agentErrors: Array +} + +export async function runCopilotTask( + _input: RunCopilotTaskInput, +): Promise { + throw new LiveCopilotRunnerUnavailableError() +} diff --git a/evals/intent-discovery/live-copilot-harness.eval.ts b/evals/intent-discovery/live-copilot-harness.eval.ts new file mode 100644 index 0000000..c97eed7 --- /dev/null +++ b/evals/intent-discovery/live-copilot-harness.eval.ts @@ -0,0 +1,55 @@ +import type { HarnessContext, HarnessRun } from 'vitest-evals' +import { describe, expect, it } from 'vitest' +import { failedSpans, toolCalls } from 'vitest-evals' +import { tasks, type IntentDiscoveryTask } from './corpus/tasks' +import { + liveCopilotHarness, + type LiveCopilotOutput, +} from './harness/live-copilot-harness' + +const routerTask = tasks.find( + (task) => task.id === 'router-current-intent-loads-router', +) + +if (!routerTask) { + throw new Error('Missing router-current-intent-loads-router task') +} + +describe('Intent discovery live Copilot harness', () => { + it('returns an explicit unsupported result until live capture is wired', async () => { + const result = await runLiveHarness(routerTask) + + expect(result.output).toEqual({ + finalAnswer: '', + runId: `live:${routerTask.id}`, + }) + expect(result.artifacts?.runKind).toBe('live-copilot') + expect(result.artifacts?.runnerStatus).toBe('unsupported') + expect(result.artifacts?.workspacePath).toEqual(expect.any(String)) + 
expect(toolCalls(result)).toHaveLength(0) + expect(result.errors).toEqual([ + { + message: + 'Live Copilot runner is not wired yet. Use saved transcripts until the runner can launch Copilot and capture transcript, command, and diff evidence.', + type: 'LiveCopilotRunnerUnavailableError', + }, + ]) + expect(failedSpans(result)).toHaveLength(1) + }) +}) + +async function runLiveHarness( + task: IntentDiscoveryTask, +): Promise> { + const artifacts: HarnessContext['artifacts'] = {} + const context: HarnessContext = { + artifacts, + setArtifact(name, value) { + artifacts[name] = value + }, + } + + return liveCopilotHarness.run(task, context) as Promise< + HarnessRun + > +} From 6884f1daa17ca072a16c7706d31ccb5fd401d2ae Mon Sep 17 00:00:00 2001 From: ladybluenotes Date: Sat, 20 Jun 2026 09:17:26 -0700 Subject: [PATCH 05/13] Enhance live Copilot harness: support opt-in command backend and capture command, skill, transcript, and diff evidence --- evals/intent-discovery/README.md | 2 +- .../harness/live-copilot-harness.ts | 1 + .../harness/run-copilot-task.ts | 228 +++++++++++++++++- .../live-copilot-harness.eval.ts | 55 +++++ 4 files changed, 283 insertions(+), 3 deletions(-) diff --git a/evals/intent-discovery/README.md b/evals/intent-discovery/README.md index eb5f9ad..72cbe6b 100644 --- a/evals/intent-discovery/README.md +++ b/evals/intent-discovery/README.md @@ -14,6 +14,6 @@ This executable slice grades synthetic saved transcripts with Vitest plus `vites The controlled fixture corpus is limited to current skill-backed surfaces. For this slice, that means TanStack Router, TanStack Start, and TanStack Table v9. -The live Copilot harness is a boundary contract only. Until live capture is wired, it returns a normalized `unsupported` run with no tool calls and an explicit `LiveCopilotRunnerUnavailableError`. +The live Copilot harness can run an opt-in command backend through `INTENT_DISCOVERY_COPILOT_COMMAND`. 
When that environment variable is unset, it returns a normalized `unsupported` run with no tool calls and an explicit `LiveCopilotRunnerUnavailableError`. The command runs inside a prepared fixture workspace with task metadata in `INTENT_DISCOVERY_TASK_ID`, `INTENT_DISCOVERY_FIXTURE`, `INTENT_DISCOVERY_PROMPT`, `INTENT_DISCOVERY_RUN_ID`, and `INTENT_DISCOVERY_WORKSPACE`. Harness integrity failures fail the eval. Product findings such as reference-only behavior, no discovery attempt, or wrong skill selection are recorded as diagnostic failures, not passing scores. The headline success signal is strict Intent invocation plus the expected skill loaded for autonomous cases. diff --git a/evals/intent-discovery/harness/live-copilot-harness.ts b/evals/intent-discovery/harness/live-copilot-harness.ts index a233220..02f1e77 100644 --- a/evals/intent-discovery/harness/live-copilot-harness.ts +++ b/evals/intent-discovery/harness/live-copilot-harness.ts @@ -32,6 +32,7 @@ export const liveCopilotHarness = createHarness< const run = await runCopilotTask({ task: input, runId, + sourcePath: prepared.sourcePath, workspacePath: prepared.workspacePath, }) const intentCommands = intentCommandsFromToolCalls(run.toolCalls) diff --git a/evals/intent-discovery/harness/run-copilot-task.ts b/evals/intent-discovery/harness/run-copilot-task.ts index 837a650..5dcc21b 100644 --- a/evals/intent-discovery/harness/run-copilot-task.ts +++ b/evals/intent-discovery/harness/run-copilot-task.ts @@ -4,6 +4,14 @@ import type { UsageSummary, } from 'vitest-evals' import type { IntentDiscoveryTask } from '../corpus/tasks' +import { mkdirSync, writeFileSync } from 'node:fs' +import { dirname, join } from 'node:path' +import { fileURLToPath } from 'node:url' +import { spawn } from 'node:child_process' +import { parseIntentCommand } from './parse-intent-commands' + +const evalDir = dirname(dirname(fileURLToPath(import.meta.url))) +const transcriptDir = join(evalDir, 'runs', 'latest', 'transcripts') export 
class LiveCopilotRunnerUnavailableError extends Error { constructor() { @@ -17,6 +25,7 @@ export class LiveCopilotRunnerUnavailableError extends Error { export type RunCopilotTaskInput = { task: IntentDiscoveryTask runId: string + sourcePath: string workspacePath: string } @@ -36,7 +45,222 @@ export type CopilotTaskRun = { } export async function runCopilotTask( - _input: RunCopilotTaskInput, + input: RunCopilotTaskInput, ): Promise { - throw new LiveCopilotRunnerUnavailableError() + const command = process.env.INTENT_DISCOVERY_COPILOT_COMMAND + + if (!command) { + throw new LiveCopilotRunnerUnavailableError() + } + + const result = await runCommand({ command, input }) + const transcript = transcriptFromCommandResult(result) + const transcriptPath = writeTranscript(input.runId, transcript) + const intentCommandCaptures = captureIntentCommands(transcript) + const fileDiff = await collectFileDiff(input.sourcePath, input.workspacePath) + const agentErrors = + result.exitCode === 0 ? [] : [result.stderr || result.stdout] + + return { + finalAnswer: finalAnswerFromTranscript(transcript), + runId: input.runId, + messages: [ + { + role: 'user', + content: input.task.prompt, + }, + { + role: 'tool', + content: transcript, + }, + { + role: 'assistant', + content: finalAnswerFromTranscript(transcript), + toolCalls: intentCommandCaptures.map((capture) => capture.toolCall), + }, + ], + toolCalls: intentCommandCaptures.map((capture) => capture.toolCall), + usage: { + provider: 'copilot-command', + model: process.env.INTENT_DISCOVERY_COPILOT_MODEL ?? 
'unknown', + }, + transcriptPath, + commandsInvoked: intentCommandCaptures.map((capture) => capture.command), + intentCommandsInvoked: intentCommandCaptures.map( + (capture) => capture.command, + ), + intentCommandOutputs: intentCommandCaptures.map( + (capture) => capture.output, + ), + loadedSkills: [ + ...new Set( + intentCommandCaptures + .map((capture) => capture.skillUse) + .filter((skillUse): skillUse is string => Boolean(skillUse)), + ), + ], + fileDiff, + agentErrors, + } +} + +type CommandResult = { + stdout: string + stderr: string + exitCode: number | null +} + +type IntentCommandCapture = { + command: string + output: string + skillUse?: string + toolCall: ToolCallRecord +} + +async function runCommand({ + command, + input, +}: { + command: string + input: RunCopilotTaskInput +}): Promise { + return new Promise((resolve, reject) => { + const child = spawn(command, { + cwd: input.workspacePath, + shell: true, + env: { + ...process.env, + INTENT_DISCOVERY_TASK_ID: input.task.id, + INTENT_DISCOVERY_FIXTURE: input.task.fixture, + INTENT_DISCOVERY_PROMPT: input.task.prompt, + INTENT_DISCOVERY_RUN_ID: input.runId, + INTENT_DISCOVERY_WORKSPACE: input.workspacePath, + }, + }) + const stdoutChunks: Array = [] + const stderrChunks: Array = [] + + child.stdout.on('data', (chunk: Buffer) => stdoutChunks.push(chunk)) + child.stderr.on('data', (chunk: Buffer) => stderrChunks.push(chunk)) + child.on('error', reject) + child.on('close', (exitCode) => { + resolve({ + stdout: Buffer.concat(stdoutChunks).toString('utf8'), + stderr: Buffer.concat(stderrChunks).toString('utf8'), + exitCode, + }) + }) + }) +} + +function transcriptFromCommandResult(result: CommandResult): string { + return [result.stdout.trim(), result.stderr.trim()].filter(Boolean).join('\n') +} + +function finalAnswerFromTranscript(transcript: string): string { + const finalAnswerLine = transcript + .split('\n') + .find((line) => line.startsWith('FINAL_ANSWER:')) + + return 
finalAnswerLine?.replace(/^FINAL_ANSWER:\s*/, '') ?? transcript.trim() +} + +function writeTranscript(runId: string, transcript: string): string { + mkdirSync(transcriptDir, { recursive: true }) + const transcriptPath = join(transcriptDir, `${sanitizeFileName(runId)}.txt`) + + writeFileSync(transcriptPath, transcript) + + return transcriptPath +} + +function captureIntentCommands( + transcript: string, +): Array { + const lines = transcript.split('\n') + const captures: Array = [] + + for (let index = 0; index < lines.length; index += 1) { + const line = lines[index] + const command = parseIntentCommand(line ?? '', 'tool-message') + + if (!command) { + continue + } + + const output = outputAfterCommand(lines, index) + + captures.push({ + command: command.raw, + output, + skillUse: command.skillUse, + toolCall: { + name: 'shell_command', + arguments: { + command: command.raw, + }, + result: output, + }, + }) + } + + return captures +} + +function outputAfterCommand( + lines: Array, + commandIndex: number, +): string { + const output: Array = [] + + for (let index = commandIndex + 1; index < lines.length; index += 1) { + const line = lines[index] ?? 
'' + + if (parseIntentCommand(line, 'tool-message')) { + break + } + + output.push(line) + } + + return output.join('\n').trim() +} + +async function collectFileDiff( + sourcePath: string, + workspacePath: string, +): Promise { + const result = await runDiff(sourcePath, workspacePath) + + if (result.exitCode !== 0 && result.exitCode !== 1) { + return result.stderr + } + + return result.stdout +} + +async function runDiff( + sourcePath: string, + workspacePath: string, +): Promise { + return new Promise((resolve, reject) => { + const child = spawn('diff', ['-ruN', sourcePath, workspacePath]) + const stdoutChunks: Array = [] + const stderrChunks: Array = [] + + child.stdout.on('data', (chunk: Buffer) => stdoutChunks.push(chunk)) + child.stderr.on('data', (chunk: Buffer) => stderrChunks.push(chunk)) + child.on('error', reject) + child.on('close', (exitCode) => { + resolve({ + stdout: Buffer.concat(stdoutChunks).toString('utf8'), + stderr: Buffer.concat(stderrChunks).toString('utf8'), + exitCode, + }) + }) + }) +} + +function sanitizeFileName(value: string): string { + return value.replace(/[^a-z0-9.-]+/gi, '-') } diff --git a/evals/intent-discovery/live-copilot-harness.eval.ts b/evals/intent-discovery/live-copilot-harness.eval.ts index c97eed7..eaefec1 100644 --- a/evals/intent-discovery/live-copilot-harness.eval.ts +++ b/evals/intent-discovery/live-copilot-harness.eval.ts @@ -1,3 +1,7 @@ +import { existsSync, writeFileSync } from 'node:fs' +import { mkdtempSync, rmSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' import type { HarnessContext, HarnessRun } from 'vitest-evals' import { describe, expect, it } from 'vitest' import { failedSpans, toolCalls } from 'vitest-evals' @@ -36,6 +40,57 @@ describe('Intent discovery live Copilot harness', () => { ]) expect(failedSpans(result)).toHaveLength(1) }) + + it('runs an opt-in command backend and captures command, skill, transcript, and diff evidence', async () => { + const tempDir = 
mkdtempSync(join(tmpdir(), 'intent-eval-command-')) + const fakeRunnerPath = join(tempDir, 'fake-runner.mjs') + const previousCommand = process.env.INTENT_DISCOVERY_COPILOT_COMMAND + + writeFileSync( + fakeRunnerPath, + [ + "import { writeFileSync } from 'node:fs'", + "writeFileSync('agent-output.txt', process.env.INTENT_DISCOVERY_TASK_ID ?? '')", + "console.log('$ intent list')", + "console.log('@tanstack/router#routing - Router route guidance')", + "console.log('$ intent load @tanstack/router#routing')", + "console.log('Loaded @tanstack/router#routing')", + "console.log('FINAL_ANSWER: Loaded router guidance and updated the fixture.')", + ].join('\n'), + ) + process.env.INTENT_DISCOVERY_COPILOT_COMMAND = `node ${fakeRunnerPath}` + + try { + const result = await runLiveHarness(routerTask) + + expect(result.errors).toEqual([]) + expect(result.output.finalAnswer).toBe( + 'Loaded router guidance and updated the fixture.', + ) + expect(result.artifacts?.runnerStatus).toBe('completed') + expect(result.artifacts?.intentCommandsInvoked).toEqual([ + 'intent list', + 'intent load @tanstack/router#routing', + ]) + expect(result.artifacts?.loadedSkills).toEqual([ + '@tanstack/router#routing', + ]) + expect(result.artifacts?.fileDiff).toEqual( + expect.stringContaining('agent-output.txt'), + ) + expect(result.artifacts?.transcriptPath).toEqual(expect.any(String)) + expect(existsSync(String(result.artifacts?.transcriptPath))).toBe(true) + expect(toolCalls(result)).toHaveLength(2) + expect(failedSpans(result)).toHaveLength(0) + } finally { + if (previousCommand === undefined) { + delete process.env.INTENT_DISCOVERY_COPILOT_COMMAND + } else { + process.env.INTENT_DISCOVERY_COPILOT_COMMAND = previousCommand + } + rmSync(tempDir, { recursive: true, force: true }) + } + }) }) async function runLiveHarness( From 174b452221d8a789c4dba3de3a8fd3f7e23ba80d Mon Sep 17 00:00:00 2001 From: ladybluenotes Date: Sat, 20 Jun 2026 09:32:38 -0700 Subject: [PATCH 06/13] Add live Copilot harness 
and evaluation metadata for intent discovery --- evals/intent-discovery/README.md | 9 ++ .../bin/copilot-cli-adapter.mjs | 87 +++++++++++++++++++ .../intent-discovery/graders/eval-metadata.ts | 50 +++++++++++ .../intent-discovery/intent-discovery.eval.ts | 51 +---------- .../live-copilot-harness.eval.ts | 76 +++++++++++++++- package.json | 1 + 6 files changed, 224 insertions(+), 50 deletions(-) create mode 100644 evals/intent-discovery/bin/copilot-cli-adapter.mjs create mode 100644 evals/intent-discovery/graders/eval-metadata.ts diff --git a/evals/intent-discovery/README.md b/evals/intent-discovery/README.md index 72cbe6b..f9474fb 100644 --- a/evals/intent-discovery/README.md +++ b/evals/intent-discovery/README.md @@ -6,8 +6,15 @@ Opt-in eval suite for measuring whether Copilot discovers and invokes Intent sur - `pnpm eval:intent-discovery` runs the saved-transcript eval suite. - `pnpm eval:intent-discovery:json` writes `evals/intent-discovery/runs/latest/vitest-results.json`. +- `pnpm eval:intent-discovery:live` runs the eval suite with the local Copilot CLI adapter enabled. - `pnpm eval:intent-discovery:report` serves the saved JSON report. +The default JSON/report commands show saved-transcript efficacy cases only. To include the live Copilot case in the report artifact, run JSON generation with the live gate enabled: + +```sh +INTENT_DISCOVERY_RUN_LIVE=1 INTENT_DISCOVERY_COPILOT_COMMAND="node $PWD/evals/intent-discovery/bin/copilot-cli-adapter.mjs" pnpm eval:intent-discovery:json +``` + ## Current scope This executable slice grades synthetic saved transcripts with Vitest plus `vitest-evals` harness normalization helpers. It attaches `vitest-evals`-compatible metadata to the Vitest JSON artifact for the local report UI because this repo's current Vitest runtime does not expose the APIs used by `vitest-evals/reporter` and `describeEval()`. @@ -16,4 +23,6 @@ The controlled fixture corpus is limited to current skill-backed surfaces. 
For t The live Copilot harness can run an opt-in command backend through `INTENT_DISCOVERY_COPILOT_COMMAND`. When that environment variable is unset, it returns a normalized `unsupported` run with no tool calls and an explicit `LiveCopilotRunnerUnavailableError`. The command runs inside a prepared fixture workspace with task metadata in `INTENT_DISCOVERY_TASK_ID`, `INTENT_DISCOVERY_FIXTURE`, `INTENT_DISCOVERY_PROMPT`, `INTENT_DISCOVERY_RUN_ID`, and `INTENT_DISCOVERY_WORKSPACE`. +`pnpm eval:intent-discovery:live` sets `INTENT_DISCOVERY_RUN_LIVE=1` and `INTENT_DISCOVERY_COPILOT_COMMAND` to the repo-local Copilot CLI adapter. The adapter calls `copilot -p` in the prepared fixture workspace, writes a Copilot share transcript to `.intent-eval/<sanitized-run-id>.md` inside that workspace, and prints the transcript for command capture. Live runs attach the same strict efficacy scores as saved transcripts, so a passing harness run can still report `AutonomousDiscoverySuccess: 0` when Copilot did not invoke Intent or loaded the wrong skill. Do not put API keys or tokens in the command or prompt; provide credentials through the normal Copilot CLI login or secret environment configuration. + Harness integrity failures fail the eval. Product findings such as reference-only behavior, no discovery attempt, or wrong skill selection are recorded as diagnostic failures, not passing scores. The headline success signal is strict Intent invocation plus the expected skill loaded for autonomous cases. 
diff --git a/evals/intent-discovery/bin/copilot-cli-adapter.mjs b/evals/intent-discovery/bin/copilot-cli-adapter.mjs new file mode 100644 index 0000000..80a7c86 --- /dev/null +++ b/evals/intent-discovery/bin/copilot-cli-adapter.mjs @@ -0,0 +1,87 @@ +#!/usr/bin/env node + +import { existsSync, mkdirSync, readFileSync } from 'node:fs' +import { dirname, join } from 'node:path' +import { spawnSync } from 'node:child_process' + +const workspace = requiredEnv('INTENT_DISCOVERY_WORKSPACE') +const taskId = requiredEnv('INTENT_DISCOVERY_TASK_ID') +const fixture = requiredEnv('INTENT_DISCOVERY_FIXTURE') +const prompt = requiredEnv('INTENT_DISCOVERY_PROMPT') +const runId = requiredEnv('INTENT_DISCOVERY_RUN_ID') +const sharePath = join( + workspace, + '.intent-eval', + `${sanitizeFileName(runId)}.md`, +) + +mkdirSync(dirname(sharePath), { recursive: true }) + +const copilotPrompt = [ + `Task id: ${taskId}`, + `Fixture: ${fixture}`, + '', + prompt, + '', + 'Work in the current repository. Use the available project context and tools as you normally would. Do not summarize this prompt; complete the task and report what you changed.', +].join('\n') + +const args = [ + '-p', + copilotPrompt, + '-C', + workspace, + '--allow-all-tools', + '--add-dir', + workspace, + '--no-ask-user', + '--no-color', + '--plain-diff', + '--share', + sharePath, +] + +const result = spawnSync('copilot', args, { + cwd: workspace, + encoding: 'utf8', + env: { + ...process.env, + NO_COLOR: '1', + }, + stdio: ['ignore', 'pipe', 'pipe'], +}) + +if (result.error) { + console.error(result.error.message) + process.exit(1) +} + +if (result.stdout.trim()) { + console.log(result.stdout.trim()) +} + +if (existsSync(sharePath)) { + console.log(`\nTRANSCRIPT_PATH: ${sharePath}`) + console.log(readFileSync(sharePath, 'utf8')) +} + +if (result.stderr.trim()) { + console.error(result.stderr.trim()) +} + +process.exit(result.status ?? 
1) + +function requiredEnv(name) { + const value = process.env[name] + + if (!value) { + console.error(`Missing required environment variable: ${name}`) + process.exit(1) + } + + return value +} + +function sanitizeFileName(value) { + return value.replace(/[^a-z0-9.-]+/gi, '-') +} diff --git a/evals/intent-discovery/graders/eval-metadata.ts b/evals/intent-discovery/graders/eval-metadata.ts new file mode 100644 index 0000000..f666baf --- /dev/null +++ b/evals/intent-discovery/graders/eval-metadata.ts @@ -0,0 +1,50 @@ +import type { HarnessRun, JudgeResult, JsonValue } from 'vitest-evals' +import { toolCalls } from 'vitest-evals' + +export type NamedJudgeResult = JudgeResult & { name: string } + +export type RuntimeTask = { + meta: { + harness?: unknown + eval?: unknown + } +} + +export function score( + name: string, + passed: boolean, + metadata?: NamedJudgeResult['metadata'], +): NamedJudgeResult { + return { + name, + score: passed ? 1 : 0, + metadata, + } +} + +export function attachEvalMetadata({ + harnessName, + run, + scores, + task, +}: { + harnessName: string + run: HarnessRun + scores: Array + task: RuntimeTask +}): void { + const avgScore = + scores.reduce((total, item) => total + (item.score ?? 
0), 0) / scores.length + + task.meta.harness = { + name: harnessName, + run, + } + task.meta.eval = { + scores, + avgScore, + output: run.output, + toolCalls: toolCalls(run), + thresholdFailed: false, + } +} diff --git a/evals/intent-discovery/intent-discovery.eval.ts b/evals/intent-discovery/intent-discovery.eval.ts index 4823ab1..ff165ae 100644 --- a/evals/intent-discovery/intent-discovery.eval.ts +++ b/evals/intent-discovery/intent-discovery.eval.ts @@ -1,8 +1,9 @@ -import type { HarnessContext, HarnessRun, JudgeResult } from 'vitest-evals' +import type { HarnessContext, HarnessRun } from 'vitest-evals' import { describe, expect, it } from 'vitest' import { failedSpans, toolCalls } from 'vitest-evals' import { countsTowardAutonomousScore } from './corpus/conditions' import { correctSkillLoaded } from './graders/correct-skill-loaded' +import { attachEvalMetadata, score } from './graders/eval-metadata' import { classifyFailure } from './graders/failure-classifier' import { referenceOnly } from './graders/reference-only' import { strictIntentInvocation } from './graders/strict-invocation' @@ -67,54 +68,6 @@ describe('Intent discovery saved transcripts', () => { } }) -type NamedJudgeResult = JudgeResult & { name: string } - -function score( - name: string, - passed: boolean, - metadata?: NamedJudgeResult['metadata'], -): NamedJudgeResult { - return { - name, - score: passed ? 1 : 0, - metadata, - } -} - -function attachEvalMetadata({ - harnessName, - run, - scores, - task, -}: { - harnessName: string - run: HarnessRun - scores: Array - task: RuntimeTask -}): void { - const avgScore = - scores.reduce((total, item) => total + (item.score ?? 
0), 0) / scores.length - - task.meta.harness = { - name: harnessName, - run, - } - task.meta.eval = { - scores, - avgScore, - output: run.output, - toolCalls: toolCalls(run), - thresholdFailed: false, - } -} - -type RuntimeTask = { - meta: { - harness?: unknown - eval?: unknown - } -} - async function runSavedTranscript( evalCase: (typeof savedTranscriptCases)[number], ) { diff --git a/evals/intent-discovery/live-copilot-harness.eval.ts b/evals/intent-discovery/live-copilot-harness.eval.ts index eaefec1..b80ba42 100644 --- a/evals/intent-discovery/live-copilot-harness.eval.ts +++ b/evals/intent-discovery/live-copilot-harness.eval.ts @@ -5,7 +5,13 @@ import { join } from 'node:path' import type { HarnessContext, HarnessRun } from 'vitest-evals' import { describe, expect, it } from 'vitest' import { failedSpans, toolCalls } from 'vitest-evals' +import { countsTowardAutonomousScore } from './corpus/conditions' import { tasks, type IntentDiscoveryTask } from './corpus/tasks' +import { correctSkillLoaded } from './graders/correct-skill-loaded' +import { attachEvalMetadata, score } from './graders/eval-metadata' +import { classifyFailure } from './graders/failure-classifier' +import { referenceOnly } from './graders/reference-only' +import { strictIntentInvocation } from './graders/strict-invocation' import { liveCopilotHarness, type LiveCopilotOutput, @@ -21,7 +27,7 @@ if (!routerTask) { describe('Intent discovery live Copilot harness', () => { it('returns an explicit unsupported result until live capture is wired', async () => { - const result = await runLiveHarness(routerTask) + const result = await withoutCopilotCommand(() => runLiveHarness(routerTask)) expect(result.output).toEqual({ finalAnswer: '', @@ -91,8 +97,76 @@ describe('Intent discovery live Copilot harness', () => { rmSync(tempDir, { recursive: true, force: true }) } }) + + it.skipIf(process.env.INTENT_DISCOVERY_RUN_LIVE !== '1')( + 'runs the configured live backend', + async (context) => { + const result 
= await runLiveHarness(routerTask) + const strict = strictIntentInvocation(result) + const loaded = correctSkillLoaded(result, routerTask.expectedSkillAreas) + const reference = referenceOnly(result, routerTask.expectedSkillAreas) + const failureClass = classifyFailure( + result, + routerTask.expectedSkillAreas, + ) + const autonomous = countsTowardAutonomousScore({ + condition: routerTask.condition, + explicitnessLevel: routerTask.explicitnessLevel, + }) + + attachEvalMetadata({ + harnessName: liveCopilotHarness.name, + run: result, + scores: [ + score( + 'AutonomousDiscoverySuccess', + autonomous && strict.passed && loaded.passed, + { + rationale: + 'Scores only autonomous live runs where Copilot invoked Intent and loaded the expected skill.', + failureClass, + runnerStatus: String(result.artifacts?.runnerStatus ?? ''), + }, + ), + score('StrictIntentInvocation', strict.passed, { + matchedCommand: strict.matchedCommand, + source: strict.source, + }), + score('CorrectSkillLoaded', loaded.passed, { + loadedSkills: loaded.loadedSkills, + expectedSkillAreas: routerTask.expectedSkillAreas, + }), + score('NoReferenceOnlyFalsePositive', !reference, { + referenceOnly: reference, + }), + ], + task: context.task, + }) + + expect(result.artifacts?.runnerStatus).toBe('completed') + expect(result.output.runId).toBe(`live:${routerTask.id}`) + expect(result.artifacts?.transcriptPath).toEqual(expect.any(String)) + expect(result.artifacts?.commandsInvoked).toEqual(expect.any(Array)) + expect(result.artifacts?.loadedSkills).toEqual(expect.any(Array)) + }, + 300_000, + ) }) +async function withoutCopilotCommand(run: () => Promise): Promise { + const previousCommand = process.env.INTENT_DISCOVERY_COPILOT_COMMAND + + delete process.env.INTENT_DISCOVERY_COPILOT_COMMAND + + try { + return await run() + } finally { + if (previousCommand !== undefined) { + process.env.INTENT_DISCOVERY_COPILOT_COMMAND = previousCommand + } + } +} + async function runLiveHarness( task: IntentDiscoveryTask, 
): Promise> { diff --git a/package.json b/package.json index a38903c..ed75e94 100644 --- a/package.json +++ b/package.json @@ -28,6 +28,7 @@ "generate-docs": "node scripts/generate-docs.ts", "eval:intent-discovery": "vitest run --config evals/intent-discovery/vitest.evals.config.ts", "eval:intent-discovery:json": "vitest run --config evals/intent-discovery/vitest.evals.config.ts --reporter=default --reporter=json --outputFile.json=evals/intent-discovery/runs/latest/vitest-results.json", + "eval:intent-discovery:live": "INTENT_DISCOVERY_RUN_LIVE=1 INTENT_DISCOVERY_COPILOT_COMMAND=\"node $PWD/evals/intent-discovery/bin/copilot-cli-adapter.mjs\" vitest run --config evals/intent-discovery/vitest.evals.config.ts", "eval:intent-discovery:report": "vitest-evals serve evals/intent-discovery/runs/latest/vitest-results.json", "test": "pnpm run test:ci", "test:ci": "tsc --noEmit && nx run-many --targets=test:eslint,test:sherif,test:knip,test:docs,test:lib,test:integration,test:types,build", From 391c3edc4fa4ae2f96a935ff4cc22e47fbc10ff8 Mon Sep 17 00:00:00 2001 From: ladybluenotes Date: Sat, 20 Jun 2026 09:39:50 -0700 Subject: [PATCH 07/13] Add live Copilot condition setup and enhance evaluation framework for intent discovery --- evals/intent-discovery/README.md | 13 +- .../intent-discovery/condition-setup.eval.ts | 93 +++++++++++++ evals/intent-discovery/corpus/conditions.ts | 4 + evals/intent-discovery/corpus/live-tasks.ts | 63 +++++++++ evals/intent-discovery/corpus/skill-uses.ts | 13 ++ .../harness/live-copilot-harness.ts | 10 ++ .../harness/setup-intent-condition.ts | 104 +++++++++++++++ .../live-copilot-harness.eval.ts | 125 ++++++++++-------- package.json | 1 + 9 files changed, 371 insertions(+), 55 deletions(-) create mode 100644 evals/intent-discovery/condition-setup.eval.ts create mode 100644 evals/intent-discovery/corpus/live-tasks.ts create mode 100644 evals/intent-discovery/corpus/skill-uses.ts create mode 100644 
evals/intent-discovery/harness/setup-intent-condition.ts diff --git a/evals/intent-discovery/README.md b/evals/intent-discovery/README.md index f9474fb..6ed8b98 100644 --- a/evals/intent-discovery/README.md +++ b/evals/intent-discovery/README.md @@ -7,12 +7,14 @@ Opt-in eval suite for measuring whether Copilot discovers and invokes Intent sur - `pnpm eval:intent-discovery` runs the saved-transcript eval suite. - `pnpm eval:intent-discovery:json` writes `evals/intent-discovery/runs/latest/vitest-results.json`. - `pnpm eval:intent-discovery:live` runs the eval suite with the local Copilot CLI adapter enabled. +- `pnpm eval:intent-discovery:live:json` writes a JSON report that includes live Copilot condition cases. - `pnpm eval:intent-discovery:report` serves the saved JSON report. -The default JSON/report commands show saved-transcript efficacy cases only. To include the live Copilot case in the report artifact, run JSON generation with the live gate enabled: +The default JSON/report commands show saved-transcript efficacy cases only. To include the live Copilot condition matrix in the report artifact, run: ```sh -INTENT_DISCOVERY_RUN_LIVE=1 INTENT_DISCOVERY_COPILOT_COMMAND="node $PWD/evals/intent-discovery/bin/copilot-cli-adapter.mjs" pnpm eval:intent-discovery:json +pnpm eval:intent-discovery:live:json +pnpm eval:intent-discovery:report ``` ## Current scope @@ -21,6 +23,13 @@ This executable slice grades synthetic saved transcripts with Vitest plus `vites The controlled fixture corpus is limited to current skill-backed surfaces. For this slice, that means TanStack Router, TanStack Start, and TanStack Table v9. +Live Router runs compare four setup conditions: + +- `no-intent`: no Intent guidance or allowlist is added. +- `current-intent`: `package.json#intent.skills` plus the current install-style `AGENTS.md` skill-loading guidance. +- `mapped-intent`: `package.json#intent.skills` plus `AGENTS.md` task-to-skill mappings like `install --map`. 
+- `explicit-intent-control`: current install-style setup plus a prompt that explicitly asks the agent to run Intent. This condition is diagnostic and excluded from autonomous scoring. + The live Copilot harness can run an opt-in command backend through `INTENT_DISCOVERY_COPILOT_COMMAND`. When that environment variable is unset, it returns a normalized `unsupported` run with no tool calls and an explicit `LiveCopilotRunnerUnavailableError`. The command runs inside a prepared fixture workspace with task metadata in `INTENT_DISCOVERY_TASK_ID`, `INTENT_DISCOVERY_FIXTURE`, `INTENT_DISCOVERY_PROMPT`, `INTENT_DISCOVERY_RUN_ID`, and `INTENT_DISCOVERY_WORKSPACE`. `pnpm eval:intent-discovery:live` sets `INTENT_DISCOVERY_RUN_LIVE=1` and `INTENT_DISCOVERY_COPILOT_COMMAND` to the repo-local Copilot CLI adapter. The adapter calls `copilot -p` in the prepared fixture workspace, writes a Copilot share transcript under the generated run directory, and prints the transcript for command capture. Live runs attach the same strict efficacy scores as saved transcripts, so a passing harness run can still report `AutonomousDiscoverySuccess: 0` when Copilot did not invoke Intent or loaded the wrong skill. Do not put API keys or tokens in the command or prompt; provide credentials through the normal Copilot CLI login or secret environment configuration. 
diff --git a/evals/intent-discovery/condition-setup.eval.ts b/evals/intent-discovery/condition-setup.eval.ts new file mode 100644 index 0000000..bbfa1e3 --- /dev/null +++ b/evals/intent-discovery/condition-setup.eval.ts @@ -0,0 +1,93 @@ +import { existsSync, mkdtempSync, readFileSync, rmSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { describe, expect, it } from 'vitest' +import { applyIntentCondition } from './harness/setup-intent-condition' +import { prepareFixtureWorkspace } from './harness/prepare-fixture' + +describe('Intent discovery condition setup', () => { + it('leaves no-intent workspaces without Intent guidance', () => { + const prepared = prepareInTemp() + + try { + const result = applyIntentCondition({ + condition: 'no-intent', + expectedSkillAreas: ['router'], + workspacePath: prepared.workspacePath, + }) + + expect(result.filesWritten).toEqual([]) + expect(existsSync(join(prepared.workspacePath, 'AGENTS.md'))).toBe(false) + expect( + readFileSync(join(prepared.workspacePath, 'package.json'), 'utf8'), + ).not.toContain('"intent"') + } finally { + prepared.cleanup() + } + }) + + it('writes current Intent guidance without mappings', () => { + const prepared = prepareInTemp() + + try { + const result = applyIntentCondition({ + condition: 'current-intent', + expectedSkillAreas: ['router'], + workspacePath: prepared.workspacePath, + }) + const agents = readFileSync( + join(prepared.workspacePath, 'AGENTS.md'), + 'utf8', + ) + const packageJson = readFileSync( + join(prepared.workspacePath, 'package.json'), + 'utf8', + ) + + expect(result.filesWritten).toHaveLength(2) + expect(agents).toContain('Skill Loading') + expect(agents).toContain('npx @tanstack/intent@latest list') + expect(agents).not.toContain('\nskills:\n') + expect(packageJson).toContain('"@tanstack/router"') + } finally { + prepared.cleanup() + } + }) + + it('writes mapped Intent guidance with use values', () => { + const prepared = 
prepareInTemp() + + try { + applyIntentCondition({ + condition: 'mapped-intent', + expectedSkillAreas: ['router'], + workspacePath: prepared.workspacePath, + }) + const agents = readFileSync( + join(prepared.workspacePath, 'AGENTS.md'), + 'utf8', + ) + + expect(agents).toContain('skills:') + expect(agents).toContain('use: "@tanstack/router#routing"') + } finally { + prepared.cleanup() + } + }) +}) + +function prepareInTemp() { + const parentDir = mkdtempSync(join(tmpdir(), 'intent-eval-condition-')) + const prepared = prepareFixtureWorkspace({ + fixture: 'router-basic', + parentDir, + }) + + return { + ...prepared, + cleanup() { + prepared.cleanup() + rmSync(parentDir, { recursive: true, force: true }) + }, + } +} diff --git a/evals/intent-discovery/corpus/conditions.ts b/evals/intent-discovery/corpus/conditions.ts index cafb9d2..ec960c9 100644 --- a/evals/intent-discovery/corpus/conditions.ts +++ b/evals/intent-discovery/corpus/conditions.ts @@ -11,6 +11,10 @@ export const intentDiscoveryConditions = [ id: 'current-intent', countsTowardAutonomousScore: true, }, + { + id: 'mapped-intent', + countsTowardAutonomousScore: true, + }, { id: 'explicit-intent-control', countsTowardAutonomousScore: false, diff --git a/evals/intent-discovery/corpus/live-tasks.ts b/evals/intent-discovery/corpus/live-tasks.ts new file mode 100644 index 0000000..d0977de --- /dev/null +++ b/evals/intent-discovery/corpus/live-tasks.ts @@ -0,0 +1,63 @@ +import type { IntentDiscoveryTask } from './tasks' + +const routerPrompt = + 'Add a route that loads user data before rendering the page.' 
+ +export const liveTasks: Array = [ + { + id: 'live-router-no-intent', + fixture: 'router-basic', + condition: 'no-intent', + explicitnessLevel: 2, + prompt: routerPrompt, + expectedSkillAreas: ['router'], + expected: { + strictInvocation: false, + correctSkillLoaded: false, + referenceOnly: true, + failureClass: 'reference-only', + }, + }, + { + id: 'live-router-current-intent', + fixture: 'router-basic', + condition: 'current-intent', + explicitnessLevel: 2, + prompt: routerPrompt, + expectedSkillAreas: ['router'], + expected: { + strictInvocation: true, + correctSkillLoaded: true, + referenceOnly: false, + failureClass: 'strict-success', + }, + }, + { + id: 'live-router-mapped-intent', + fixture: 'router-basic', + condition: 'mapped-intent', + explicitnessLevel: 2, + prompt: routerPrompt, + expectedSkillAreas: ['router'], + expected: { + strictInvocation: true, + correctSkillLoaded: true, + referenceOnly: false, + failureClass: 'strict-success', + }, + }, + { + id: 'live-router-explicit-intent-control', + fixture: 'router-basic', + condition: 'explicit-intent-control', + explicitnessLevel: 4, + prompt: `${routerPrompt}\n\nRun intent list, load the relevant skill, and use the loaded guidance before changing files.`, + expectedSkillAreas: ['router'], + expected: { + strictInvocation: true, + correctSkillLoaded: true, + referenceOnly: false, + failureClass: 'strict-success', + }, + }, +] diff --git a/evals/intent-discovery/corpus/skill-uses.ts b/evals/intent-discovery/corpus/skill-uses.ts new file mode 100644 index 0000000..c3142f9 --- /dev/null +++ b/evals/intent-discovery/corpus/skill-uses.ts @@ -0,0 +1,13 @@ +import type { ExpectedSkillArea } from './tasks' + +export const expectedSkillUseByArea = { + router: '@tanstack/router#routing', + start: '@tanstack/start#routing', + 'table-v9': '@tanstack/table#v9-columns', +} satisfies Record + +export const packageAllowlistByArea = { + router: '@tanstack/router', + start: '@tanstack/start', + 'table-v9': 
'@tanstack/table', +} satisfies Record diff --git a/evals/intent-discovery/harness/live-copilot-harness.ts b/evals/intent-discovery/harness/live-copilot-harness.ts index 02f1e77..6e80365 100644 --- a/evals/intent-discovery/harness/live-copilot-harness.ts +++ b/evals/intent-discovery/harness/live-copilot-harness.ts @@ -6,6 +6,7 @@ import { LiveCopilotRunnerUnavailableError, runCopilotTask, } from './run-copilot-task' +import { applyIntentCondition } from './setup-intent-condition' export type LiveCopilotOutput = { finalAnswer: string @@ -20,10 +21,16 @@ export const liveCopilotHarness = createHarness< run: async ({ input, setArtifact }) => { const runId = `live:${input.id}` const prepared = prepareFixtureWorkspace({ fixture: input.fixture }) + const appliedCondition = applyIntentCondition({ + condition: input.condition, + expectedSkillAreas: input.expectedSkillAreas, + workspacePath: prepared.workspacePath, + }) setCommonArtifacts({ input, runId, + setupFilesWritten: appliedCondition.filesWritten, workspacePath: prepared.workspacePath, setArtifact, }) @@ -141,11 +148,13 @@ export const liveCopilotHarness = createHarness< function setCommonArtifacts({ input, runId, + setupFilesWritten, workspacePath, setArtifact, }: { input: IntentDiscoveryTask runId: string + setupFilesWritten: Array workspacePath: string setArtifact: (name: string, value: string | Array) => void }): void { @@ -155,6 +164,7 @@ function setCommonArtifacts({ setArtifact('fixture', input.fixture) setArtifact('prompt', input.prompt) setArtifact('expectedSkillAreas', input.expectedSkillAreas) + setArtifact('setupFilesWritten', setupFilesWritten) setArtifact('workspacePath', workspacePath) } diff --git a/evals/intent-discovery/harness/setup-intent-condition.ts b/evals/intent-discovery/harness/setup-intent-condition.ts new file mode 100644 index 0000000..d884f89 --- /dev/null +++ b/evals/intent-discovery/harness/setup-intent-condition.ts @@ -0,0 +1,104 @@ +import { readFileSync, writeFileSync } from 
'node:fs' +import { join } from 'node:path' +import type { IntentDiscoveryCondition } from '../corpus/conditions' +import type { ExpectedSkillArea } from '../corpus/tasks' +import { + expectedSkillUseByArea, + packageAllowlistByArea, +} from '../corpus/skill-uses' + +export type AppliedIntentCondition = { + condition: IntentDiscoveryCondition + filesWritten: Array +} + +export function applyIntentCondition({ + condition, + expectedSkillAreas, + workspacePath, +}: { + condition: IntentDiscoveryCondition + expectedSkillAreas: Array + workspacePath: string +}): AppliedIntentCondition { + if (condition === 'no-intent' || condition === 'plain-docs') { + return { condition, filesWritten: [] } + } + + const filesWritten = [ + writePackageAllowlist(workspacePath, expectedSkillAreas), + writeAgentsFile({ condition, expectedSkillAreas, workspacePath }), + ] + + return { condition, filesWritten } +} + +function writePackageAllowlist( + workspacePath: string, + expectedSkillAreas: Array, +): string { + const packageJsonPath = join(workspacePath, 'package.json') + const packageJson = JSON.parse(readFileSync(packageJsonPath, 'utf8')) as { + intent?: { skills?: Array } + } + + packageJson.intent = { + ...packageJson.intent, + skills: expectedSkillAreas.map((area) => packageAllowlistByArea[area]), + } + writeFileSync(packageJsonPath, `${JSON.stringify(packageJson, null, 2)}\n`) + + return packageJsonPath +} + +function writeAgentsFile({ + condition, + expectedSkillAreas, + workspacePath, +}: { + condition: IntentDiscoveryCondition + expectedSkillAreas: Array + workspacePath: string +}): string { + const agentsPath = join(workspacePath, 'AGENTS.md') + const block = + condition === 'mapped-intent' + ? 
mappedGuidanceBlock(expectedSkillAreas) + : loadingGuidanceBlock() + + writeFileSync(agentsPath, `${block}\n`) + + return agentsPath +} + +function loadingGuidanceBlock(): string { + return ` +## Skill Loading + +Before substantial work: + +- Skill check: run \`npx @tanstack/intent@latest list\`, or use skills already listed in context. +- Skill guidance: if one local skill clearly matches the task, run \`npx @tanstack/intent@latest load #\` and follow the returned \`SKILL.md\`. +- Monorepos: when working across packages, run the skill check from the workspace root and prefer the local skill for the package being changed. +- Multiple matches: prefer the most specific local skill for the package or concern you are changing; load additional skills only when the task spans multiple packages or concerns. +` +} + +function mappedGuidanceBlock( + expectedSkillAreas: Array, +): string { + const mappings = expectedSkillAreas + .map((area) => { + const use = expectedSkillUseByArea[area] + + return ` - when: "working on ${area} tasks" + use: "${use}"` + }) + .join('\n') + + return ` +# Skill mappings - load \`use\` with \`npx @tanstack/intent@latest load \`. 
+skills: +${mappings} +` +} diff --git a/evals/intent-discovery/live-copilot-harness.eval.ts b/evals/intent-discovery/live-copilot-harness.eval.ts index b80ba42..cbd6d66 100644 --- a/evals/intent-discovery/live-copilot-harness.eval.ts +++ b/evals/intent-discovery/live-copilot-harness.eval.ts @@ -6,6 +6,7 @@ import type { HarnessContext, HarnessRun } from 'vitest-evals' import { describe, expect, it } from 'vitest' import { failedSpans, toolCalls } from 'vitest-evals' import { countsTowardAutonomousScore } from './corpus/conditions' +import { liveTasks } from './corpus/live-tasks' import { tasks, type IntentDiscoveryTask } from './corpus/tasks' import { correctSkillLoaded } from './graders/correct-skill-loaded' import { attachEvalMetadata, score } from './graders/eval-metadata' @@ -98,61 +99,79 @@ describe('Intent discovery live Copilot harness', () => { } }) - it.skipIf(process.env.INTENT_DISCOVERY_RUN_LIVE !== '1')( - 'runs the configured live backend', - async (context) => { - const result = await runLiveHarness(routerTask) - const strict = strictIntentInvocation(result) - const loaded = correctSkillLoaded(result, routerTask.expectedSkillAreas) - const reference = referenceOnly(result, routerTask.expectedSkillAreas) - const failureClass = classifyFailure( - result, - routerTask.expectedSkillAreas, - ) - const autonomous = countsTowardAutonomousScore({ - condition: routerTask.condition, - explicitnessLevel: routerTask.explicitnessLevel, - }) - - attachEvalMetadata({ - harnessName: liveCopilotHarness.name, - run: result, - scores: [ - score( - 'AutonomousDiscoverySuccess', - autonomous && strict.passed && loaded.passed, - { - rationale: - 'Scores only autonomous live runs where Copilot invoked Intent and loaded the expected skill.', - failureClass, - runnerStatus: String(result.artifacts?.runnerStatus ?? 
''), - }, - ), - score('StrictIntentInvocation', strict.passed, { - matchedCommand: strict.matchedCommand, - source: strict.source, - }), - score('CorrectSkillLoaded', loaded.passed, { - loadedSkills: loaded.loadedSkills, - expectedSkillAreas: routerTask.expectedSkillAreas, - }), - score('NoReferenceOnlyFalsePositive', !reference, { - referenceOnly: reference, - }), - ], - task: context.task, - }) - - expect(result.artifacts?.runnerStatus).toBe('completed') - expect(result.output.runId).toBe(`live:${routerTask.id}`) - expect(result.artifacts?.transcriptPath).toEqual(expect.any(String)) - expect(result.artifacts?.commandsInvoked).toEqual(expect.any(Array)) - expect(result.artifacts?.loadedSkills).toEqual(expect.any(Array)) - }, - 300_000, - ) + for (const liveTask of liveTasks) { + it.skipIf(process.env.INTENT_DISCOVERY_RUN_LIVE !== '1')( + `live/${liveTask.condition}/${liveTask.fixture}`, + async (context) => { + const result = await runLiveHarness(liveTask) + + attachLiveEvalMetadata({ + contextTask: context.task, + result, + task: liveTask, + }) + + expect(result.artifacts?.runnerStatus).toBe('completed') + expect(result.output.runId).toBe(`live:${liveTask.id}`) + expect(result.artifacts?.transcriptPath).toEqual(expect.any(String)) + expect(result.artifacts?.commandsInvoked).toEqual(expect.any(Array)) + expect(result.artifacts?.loadedSkills).toEqual(expect.any(Array)) + expect(result.artifacts?.setupFilesWritten).toEqual(expect.any(Array)) + }, + 300_000, + ) + } }) +function attachLiveEvalMetadata({ + contextTask, + result, + task, +}: { + contextTask: Parameters[0]['task'] + result: HarnessRun + task: IntentDiscoveryTask +}): void { + const strict = strictIntentInvocation(result) + const loaded = correctSkillLoaded(result, task.expectedSkillAreas) + const reference = referenceOnly(result, task.expectedSkillAreas) + const failureClass = classifyFailure(result, task.expectedSkillAreas) + const autonomous = countsTowardAutonomousScore({ + condition: 
task.condition, + explicitnessLevel: task.explicitnessLevel, + }) + + attachEvalMetadata({ + harnessName: liveCopilotHarness.name, + run: result, + scores: [ + score( + 'AutonomousDiscoverySuccess', + autonomous && strict.passed && loaded.passed, + { + rationale: + 'Scores only autonomous live runs where Copilot invoked Intent and loaded the expected skill.', + condition: task.condition, + failureClass, + runnerStatus: String(result.artifacts?.runnerStatus ?? ''), + }, + ), + score('StrictIntentInvocation', strict.passed, { + matchedCommand: strict.matchedCommand, + source: strict.source, + }), + score('CorrectSkillLoaded', loaded.passed, { + loadedSkills: loaded.loadedSkills, + expectedSkillAreas: task.expectedSkillAreas, + }), + score('NoReferenceOnlyFalsePositive', !reference, { + referenceOnly: reference, + }), + ], + task: contextTask, + }) +} + async function withoutCopilotCommand(run: () => Promise): Promise { const previousCommand = process.env.INTENT_DISCOVERY_COPILOT_COMMAND diff --git a/package.json b/package.json index ed75e94..4789fa0 100644 --- a/package.json +++ b/package.json @@ -29,6 +29,7 @@ "eval:intent-discovery": "vitest run --config evals/intent-discovery/vitest.evals.config.ts", "eval:intent-discovery:json": "vitest run --config evals/intent-discovery/vitest.evals.config.ts --reporter=default --reporter=json --outputFile.json=evals/intent-discovery/runs/latest/vitest-results.json", "eval:intent-discovery:live": "INTENT_DISCOVERY_RUN_LIVE=1 INTENT_DISCOVERY_COPILOT_COMMAND=\"node $PWD/evals/intent-discovery/bin/copilot-cli-adapter.mjs\" vitest run --config evals/intent-discovery/vitest.evals.config.ts", + "eval:intent-discovery:live:json": "INTENT_DISCOVERY_RUN_LIVE=1 INTENT_DISCOVERY_COPILOT_COMMAND=\"node $PWD/evals/intent-discovery/bin/copilot-cli-adapter.mjs\" vitest run --config evals/intent-discovery/vitest.evals.config.ts --reporter=default --reporter=json --outputFile.json=evals/intent-discovery/runs/latest/vitest-results.json", 
"eval:intent-discovery:report": "vitest-evals serve evals/intent-discovery/runs/latest/vitest-results.json", "test": "pnpm run test:ci", "test:ci": "tsc --noEmit && nx run-many --targets=test:eslint,test:sherif,test:knip,test:docs,test:lib,test:integration,test:types,build", From d674d2be4c9eeb13e6712fd7b38bf2f779af4915 Mon Sep 17 00:00:00 2001 From: ladybluenotes Date: Sat, 20 Jun 2026 15:30:47 -0700 Subject: [PATCH 08/13] Enhance intent discovery evaluation: update file writing logic and add skill package handling --- .../intent-discovery/condition-setup.eval.ts | 15 +++++- .../intent-discovery/harness-capture.eval.ts | 14 +++++ .../harness/parse-intent-commands.ts | 12 +++-- .../harness/setup-intent-condition.ts | 51 ++++++++++++++++++- 4 files changed, 86 insertions(+), 6 deletions(-) diff --git a/evals/intent-discovery/condition-setup.eval.ts b/evals/intent-discovery/condition-setup.eval.ts index bbfa1e3..53e5bf1 100644 --- a/evals/intent-discovery/condition-setup.eval.ts +++ b/evals/intent-discovery/condition-setup.eval.ts @@ -44,11 +44,24 @@ describe('Intent discovery condition setup', () => { 'utf8', ) - expect(result.filesWritten).toHaveLength(2) + expect(result.filesWritten).toHaveLength(4) expect(agents).toContain('Skill Loading') expect(agents).toContain('npx @tanstack/intent@latest list') expect(agents).not.toContain('\nskills:\n') expect(packageJson).toContain('"@tanstack/router"') + expect( + existsSync( + join( + prepared.workspacePath, + 'node_modules', + '@tanstack', + 'router', + 'skills', + 'routing', + 'SKILL.md', + ), + ), + ).toBe(true) } finally { prepared.cleanup() } diff --git a/evals/intent-discovery/harness-capture.eval.ts b/evals/intent-discovery/harness-capture.eval.ts index 5c0cea6..2907ff0 100644 --- a/evals/intent-discovery/harness-capture.eval.ts +++ b/evals/intent-discovery/harness-capture.eval.ts @@ -28,6 +28,13 @@ describe('Intent discovery harness capture', () => { command: 'npx @tanstack/intent load @tanstack/start#routing', 
}, }, + { + name: 'shell_command', + arguments: { + command: + 'cd /tmp/eval/router-basic && npx @tanstack/intent@latest load @tanstack/router#routing 2>&1', + }, + }, ] expect(intentCommandsFromToolCalls(calls)).toEqual([ @@ -51,6 +58,13 @@ describe('Intent discovery harness capture', () => { skillUse: '@tanstack/start#routing', source: 'tool-call', }, + { + raw: 'npx @tanstack/intent@latest load @tanstack/router#routing', + executable: 'npx @tanstack/intent@latest', + action: 'load', + skillUse: '@tanstack/router#routing', + source: 'tool-call', + }, ]) }) diff --git a/evals/intent-discovery/harness/parse-intent-commands.ts b/evals/intent-discovery/harness/parse-intent-commands.ts index 81d3f71..4689288 100644 --- a/evals/intent-discovery/harness/parse-intent-commands.ts +++ b/evals/intent-discovery/harness/parse-intent-commands.ts @@ -1,17 +1,21 @@ -import type { HarnessRun, ToolCallRecord } from 'vitest-evals' import { toolCalls } from 'vitest-evals' import { jsonToSearchableText } from '../graders/skill-areas' +import type { HarnessRun, ToolCallRecord } from 'vitest-evals' export type ParsedIntentCommand = { raw: string - executable: 'intent' | 'pnpm exec intent' | 'npx @tanstack/intent' + executable: + | 'intent' + | 'pnpm exec intent' + | 'npx @tanstack/intent' + | 'npx @tanstack/intent@latest' action: 'list' | 'load' skillUse?: string source: 'tool-call' | 'tool-message' } const commandPattern = - /^\s*\$?\s*((?:pnpm\s+exec\s+intent)|(?:npx\s+@tanstack\/intent)|(?:intent))\s+(list|load)(?:\s+([^\s]+))?/i + /^\s*\$?\s*(?:(?:cd\s+.+?\s+&&\s+))?((?:pnpm\s+exec\s+intent)|(?:npx\s+@tanstack\/intent(?:@latest)?)|(?:intent))\s+(list|load)(?:\s+([^\s|;&]+))?/i export function parseIntentCommand( raw: string, @@ -35,7 +39,7 @@ export function parseIntentCommand( } return { - raw: match[0].trim().replace(/^\$\s*/, ''), + raw: `${executable} ${action}${skillUse ? 
` ${skillUse}` : ''}`, executable, action, skillUse, diff --git a/evals/intent-discovery/harness/setup-intent-condition.ts b/evals/intent-discovery/harness/setup-intent-condition.ts index d884f89..56f999a 100644 --- a/evals/intent-discovery/harness/setup-intent-condition.ts +++ b/evals/intent-discovery/harness/setup-intent-condition.ts @@ -1,4 +1,4 @@ -import { readFileSync, writeFileSync } from 'node:fs' +import { mkdirSync, readFileSync, writeFileSync } from 'node:fs' import { join } from 'node:path' import type { IntentDiscoveryCondition } from '../corpus/conditions' import type { ExpectedSkillArea } from '../corpus/tasks' @@ -28,11 +28,60 @@ export function applyIntentCondition({ const filesWritten = [ writePackageAllowlist(workspacePath, expectedSkillAreas), writeAgentsFile({ condition, expectedSkillAreas, workspacePath }), + ...writeSkillPackages(workspacePath, expectedSkillAreas), ] return { condition, filesWritten } } +function writeSkillPackages( + workspacePath: string, + expectedSkillAreas: Array, +): Array { + return expectedSkillAreas.flatMap((area) => { + const packageName = packageAllowlistByArea[area] + const use = expectedSkillUseByArea[area] + const skillName = use.split('#')[1] + + if (!skillName) { + throw new Error(`Invalid expected skill use for ${area}: ${use}`) + } + + const packageRoot = join( + workspacePath, + 'node_modules', + ...packageName.split('/'), + ) + const skillDir = join(packageRoot, 'skills', skillName) + const packageJsonPath = join(packageRoot, 'package.json') + const skillPath = join(skillDir, 'SKILL.md') + + mkdirSync(skillDir, { recursive: true }) + writeFileSync( + packageJsonPath, + `${JSON.stringify( + { + name: packageName, + version: '0.0.0-intent-eval', + intent: { + version: 1, + repo: `TanStack/${area}`, + docs: 'docs/', + }, + }, + null, + 2, + )}\n`, + ) + writeFileSync( + skillPath, + `---\nname: "${skillName}"\ndescription: "Guidance for ${area} eval tasks"\n---\n\nUse this skill for ${area} eval tasks.\n`, + 
) + + return [packageJsonPath, skillPath] + }) +} + function writePackageAllowlist( workspacePath: string, expectedSkillAreas: Array, From 818d2117422ef5ae5a8235c7278cb51161ecf559 Mon Sep 17 00:00:00 2001 From: ladybluenotes Date: Sat, 20 Jun 2026 15:47:23 -0700 Subject: [PATCH 09/13] Enhance intent discovery evaluation: add LLM judge and summary report generation, update command parsing for various package managers --- evals/intent-discovery/README.md | 7 + evals/intent-discovery/bin/llm-judge.mjs | 120 ++++++++++++ .../bin/summarize-results.mjs | 174 ++++++++++++++++++ .../intent-discovery/harness-capture.eval.ts | 71 +++++++ .../harness/parse-intent-commands.ts | 5 +- .../harness/setup-intent-condition.ts | 77 +++++--- .../live-copilot-harness.eval.ts | 64 +++++-- package.json | 2 + 8 files changed, 478 insertions(+), 42 deletions(-) create mode 100644 evals/intent-discovery/bin/llm-judge.mjs create mode 100644 evals/intent-discovery/bin/summarize-results.mjs diff --git a/evals/intent-discovery/README.md b/evals/intent-discovery/README.md index 6ed8b98..77b880d 100644 --- a/evals/intent-discovery/README.md +++ b/evals/intent-discovery/README.md @@ -8,15 +8,22 @@ Opt-in eval suite for measuring whether Copilot discovers and invokes Intent sur - `pnpm eval:intent-discovery:json` writes `evals/intent-discovery/runs/latest/vitest-results.json`. - `pnpm eval:intent-discovery:live` runs the eval suite with the local Copilot CLI adapter enabled. - `pnpm eval:intent-discovery:live:json` writes a JSON report that includes live Copilot condition cases. +- `pnpm eval:intent-discovery:judge` optionally annotates the latest JSON report with an OpenAI-backed output-quality judge when `OPENAI_API_KEY` is set. - `pnpm eval:intent-discovery:report` serves the saved JSON report. +- `pnpm eval:intent-discovery:summary` writes `summary.json` and `summary.md` from the latest JSON report. The default JSON/report commands show saved-transcript efficacy cases only. 
To include the live Copilot condition matrix in the report artifact, run: ```sh pnpm eval:intent-discovery:live:json +pnpm eval:intent-discovery:summary pnpm eval:intent-discovery:report ``` +Set `INTENT_DISCOVERY_RUN_COUNT=3` with the live commands to run each live condition three times and include `pass@k` / `pass^k` in the generated summary. + +The optional LLM judge is secondary. It can annotate whether final answers appear to apply loaded guidance, but it never changes deterministic scores such as `StrictIntentInvocation`, `CorrectSkillLoaded`, or `AutonomousDiscoverySuccess`. + ## Current scope This executable slice grades synthetic saved transcripts with Vitest plus `vitest-evals` harness normalization helpers. It attaches `vitest-evals`-compatible metadata to the Vitest JSON artifact for the local report UI because this repo's current Vitest runtime does not expose the APIs used by `vitest-evals/reporter` and `describeEval()`. diff --git a/evals/intent-discovery/bin/llm-judge.mjs b/evals/intent-discovery/bin/llm-judge.mjs new file mode 100644 index 0000000..7efc974 --- /dev/null +++ b/evals/intent-discovery/bin/llm-judge.mjs @@ -0,0 +1,120 @@ +#!/usr/bin/env node + +import { mkdirSync, readFileSync, writeFileSync } from 'node:fs' +import { dirname, join } from 'node:path' + +const reportPath = + process.argv[2] ?? 'evals/intent-discovery/runs/latest/vitest-results.json' +const apiKey = process.env.OPENAI_API_KEY +const model = process.env.INTENT_DISCOVERY_LLM_JUDGE_MODEL ?? 
'gpt-4o-mini' + +if (!apiKey) { + console.log('Skipped LLM judge: OPENAI_API_KEY is not set.') + process.exit(0) +} + +const report = JSON.parse(readFileSync(reportPath, 'utf8')) +const cases = reportCases(report) +const judgments = [] + +for (const item of cases) { + judgments.push(await judgeCase({ apiKey, item, model })) +} + +const output = { + generatedAt: new Date().toISOString(), + judgments, + model, +} +const outDir = dirname(reportPath) +mkdirSync(outDir, { recursive: true }) +writeFileSync( + join(outDir, 'llm-judge.json'), + `${JSON.stringify(output, null, 2)}\n`, +) +console.log(JSON.stringify(output, null, 2)) + +function reportCases(report) { + return (report.testResults ?? []).flatMap((suite) => + (suite.assertionResults ?? []) + .filter((test) => test.meta?.eval) + .map((test) => { + const run = test.meta.harness?.run ?? {} + const artifacts = run.artifacts ?? {} + const scores = Object.fromEntries( + (test.meta.eval.scores ?? []).map((score) => [ + score.name, + score.score ?? 0, + ]), + ) + + return { + artifacts: pick(artifacts, [ + 'condition', + 'expectedSkillAreas', + 'intentCommandsInvoked', + 'loadedSkills', + 'runnerStatus', + 'taskId', + ]), + finalAnswer: test.meta.eval.output?.finalAnswer ?? '', + scores, + title: test.title, + } + }), + ) +} + +async function judgeCase({ apiKey, item, model }) { + const response = await fetch('https://api.openai.com/v1/chat/completions', { + body: JSON.stringify({ + messages: [ + { + role: 'system', + content: + 'You judge whether a coding agent output appears to apply loaded library skill guidance. You must not decide whether Intent was invoked; that is provided by deterministic scores. Return strict JSON only.', + }, + { + role: 'user', + content: JSON.stringify({ + instruction: + 'Assess final output quality only. Return {"appliedGuidance":"yes"|"no"|"unknown","rationale":"..."}. 
Use unknown when evidence is insufficient.', + item, + }), + }, + ], + model, + response_format: { type: 'json_object' }, + temperature: 0, + }), + headers: { + authorization: `Bearer ${apiKey}`, + 'content-type': 'application/json', + }, + method: 'POST', + }) + + if (!response.ok) { + return { + error: await response.text(), + title: item.title, + } + } + + const body = await response.json() + const content = body.choices?.[0]?.message?.content ?? '{}' + + return { + deterministicScores: item.scores, + judgment: JSON.parse(content), + title: item.title, + } +} + +function pick(value, keys) { + return Object.fromEntries( + keys + .filter((key) => Object.prototype.hasOwnProperty.call(value, key)) + .map((key) => [key, value[key]]), + ) +} diff --git a/evals/intent-discovery/bin/summarize-results.mjs b/evals/intent-discovery/bin/summarize-results.mjs new file mode 100644 index 0000000..7032e14 --- /dev/null +++ b/evals/intent-discovery/bin/summarize-results.mjs @@ -0,0 +1,174 @@ +#!/usr/bin/env node + +import { mkdirSync, readFileSync, writeFileSync } from 'node:fs' +import { dirname, join } from 'node:path' + +const reportPath = + process.argv[2] ?? 'evals/intent-discovery/runs/latest/vitest-results.json' +const report = JSON.parse(readFileSync(reportPath, 'utf8')) +const summary = summarizeReport(report) +const outDir = dirname(reportPath) + +mkdirSync(outDir, { recursive: true }) +writeFileSync( + join(outDir, 'summary.json'), + `${JSON.stringify(summary, null, 2)}\n`, +) +writeFileSync(join(outDir, 'summary.md'), `${formatSummaryMarkdown(summary)}\n`) +console.log(formatSummaryMarkdown(summary)) + +export function summarizeReport(report) { + const cases = reportCases(report) + const byCondition = groupBy(cases, (item) => item.condition ?? 
'unknown') + const conditionSummaries = Object.fromEntries( + [...byCondition.entries()].map(([condition, items]) => [ + condition, + summarizeCases(items), + ]), + ) + + return { + generatedAt: new Date().toISOString(), + totals: { + reportCases: cases.length, + testFailures: report.numFailedTests ?? 0, + testPasses: report.numPassedTests ?? 0, + testSuites: report.numTotalTestSuites ?? 0, + }, + byCondition: conditionSummaries, + failureClasses: countBy( + cases.map((item) => item.failureClass ?? 'unknown'), + ), + repeatedRuns: repeatedRunSummary(cases), + } +} + +function reportCases(report) { + return (report.testResults ?? []).flatMap((suite) => + (suite.assertionResults ?? []) + .filter((test) => test.meta?.eval) + .map((test) => { + const artifacts = test.meta.harness?.run?.artifacts ?? {} + const scores = Object.fromEntries( + (test.meta.eval.scores ?? []).map((score) => [ + score.name, + score.score ?? 0, + ]), + ) + const firstScore = test.meta.eval.scores?.[0] + + return { + condition: artifacts.condition, + failureClass: firstScore?.metadata?.failureClass, + fixture: artifacts.fixture, + loadedSkills: artifacts.loadedSkills ?? [], + scores, + taskId: artifacts.taskId ?? 
test.title, + title: test.title, + } + }), + ) +} + +function summarizeCases(cases) { + return { + autonomousSuccessRate: rate(cases, 'AutonomousDiscoverySuccess'), + correctSkillLoadedRate: rate(cases, 'CorrectSkillLoaded'), + count: cases.length, + referenceOnlyFalsePositiveRate: rate(cases, 'NoReferenceOnlyFalsePositive'), + strictInvocationRate: rate(cases, 'StrictIntentInvocation'), + } +} + +function repeatedRunSummary(cases) { + const liveCases = cases.filter((item) => item.title.includes('/run-')) + const grouped = groupBy(liveCases, (item) => + item.title.replace(/\/run-\d+$/, ''), + ) + + return Object.fromEntries( + [...grouped.entries()].map(([key, items]) => { + const successes = items.map( + (item) => item.scores.AutonomousDiscoverySuccess === 1, + ) + + return [ + key, + { + passAtK: successes.some(Boolean), + passHatK: successes.every(Boolean), + runs: items.length, + successes: successes.filter(Boolean).length, + }, + ] + }), + ) +} + +function formatSummaryMarkdown(summary) { + const lines = [ + '# Intent discovery eval summary', + '', + `Report cases: ${summary.totals.reportCases}`, + `Tests: ${summary.totals.testPasses} passed, ${summary.totals.testFailures} failed`, + '', + '## By condition', + '', + '| Condition | Cases | Strict invocation | Correct skill | Autonomous success | No reference-only false positive |', + '| --- | ---: | ---: | ---: | ---: | ---: |', + ] + + for (const [condition, item] of Object.entries(summary.byCondition)) { + lines.push( + `| ${condition} | ${item.count} | ${percent(item.strictInvocationRate)} | ${percent(item.correctSkillLoadedRate)} | ${percent(item.autonomousSuccessRate)} | ${percent(item.referenceOnlyFalsePositiveRate)} |`, + ) + } + + lines.push('', '## Failure classes', '') + for (const [failureClass, count] of Object.entries(summary.failureClasses)) { + lines.push(`- ${failureClass}: ${count}`) + } + + lines.push('', '## Repeated runs', '') + const repeated = Object.entries(summary.repeatedRuns) + if 
(repeated.length === 0) { + lines.push('No repeated live runs found.') + } else { + for (const [key, item] of repeated) { + lines.push( + `- ${key}: pass@k=${item.passAtK}, pass^k=${item.passHatK}, successes=${item.successes}/${item.runs}`, + ) + } + } + + return lines.join('\n') +} + +function groupBy(items, keyFn) { + const grouped = new Map() + for (const item of items) { + const key = keyFn(item) + grouped.set(key, [...(grouped.get(key) ?? []), item]) + } + return grouped +} + +function countBy(items) { + return Object.fromEntries( + [...groupBy(items, (item) => item).entries()].map(([key, values]) => [ + key, + values.length, + ]), + ) +} + +function rate(cases, scoreName) { + if (cases.length === 0) return 0 + return ( + cases.filter((item) => item.scores[scoreName] === 1).length / cases.length + ) +} + +function percent(value) { + return `${Math.round(value * 100)}%` +} diff --git a/evals/intent-discovery/harness-capture.eval.ts b/evals/intent-discovery/harness-capture.eval.ts index 2907ff0..d664227 100644 --- a/evals/intent-discovery/harness-capture.eval.ts +++ b/evals/intent-discovery/harness-capture.eval.ts @@ -35,6 +35,38 @@ describe('Intent discovery harness capture', () => { 'cd /tmp/eval/router-basic && npx @tanstack/intent@latest load @tanstack/router#routing 2>&1', }, }, + { + name: 'shell_command', + arguments: { + command: + 'pnpm dlx @tanstack/intent@latest load @tanstack/router#routing', + }, + }, + { + name: 'shell_command', + arguments: { command: 'pnpm dlx @tanstack/intent@latest list' }, + }, + { + name: 'shell_command', + arguments: { + command: + 'yarn dlx @tanstack/intent@latest load @tanstack/router#routing', + }, + }, + { + name: 'shell_command', + arguments: { command: 'yarn dlx @tanstack/intent@latest list' }, + }, + { + name: 'shell_command', + arguments: { + command: 'bunx @tanstack/intent@latest load @tanstack/router#routing', + }, + }, + { + name: 'shell_command', + arguments: { command: 'bunx @tanstack/intent@latest list' }, + }, 
] expect(intentCommandsFromToolCalls(calls)).toEqual([ @@ -65,6 +97,45 @@ describe('Intent discovery harness capture', () => { skillUse: '@tanstack/router#routing', source: 'tool-call', }, + { + raw: 'pnpm dlx @tanstack/intent@latest load @tanstack/router#routing', + executable: 'pnpm dlx @tanstack/intent@latest', + action: 'load', + skillUse: '@tanstack/router#routing', + source: 'tool-call', + }, + { + raw: 'pnpm dlx @tanstack/intent@latest list', + executable: 'pnpm dlx @tanstack/intent@latest', + action: 'list', + source: 'tool-call', + }, + { + raw: 'yarn dlx @tanstack/intent@latest load @tanstack/router#routing', + executable: 'yarn dlx @tanstack/intent@latest', + action: 'load', + skillUse: '@tanstack/router#routing', + source: 'tool-call', + }, + { + raw: 'yarn dlx @tanstack/intent@latest list', + executable: 'yarn dlx @tanstack/intent@latest', + action: 'list', + source: 'tool-call', + }, + { + raw: 'bunx @tanstack/intent@latest load @tanstack/router#routing', + executable: 'bunx @tanstack/intent@latest', + action: 'load', + skillUse: '@tanstack/router#routing', + source: 'tool-call', + }, + { + raw: 'bunx @tanstack/intent@latest list', + executable: 'bunx @tanstack/intent@latest', + action: 'list', + source: 'tool-call', + }, ]) }) diff --git a/evals/intent-discovery/harness/parse-intent-commands.ts b/evals/intent-discovery/harness/parse-intent-commands.ts index 4689288..0d2896a 100644 --- a/evals/intent-discovery/harness/parse-intent-commands.ts +++ b/evals/intent-discovery/harness/parse-intent-commands.ts @@ -5,17 +5,20 @@ import type { HarnessRun, ToolCallRecord } from 'vitest-evals' export type ParsedIntentCommand = { raw: string executable: + | 'bunx @tanstack/intent@latest' | 'intent' | 'pnpm exec intent' + | 'pnpm dlx @tanstack/intent@latest' | 'npx @tanstack/intent' | 'npx @tanstack/intent@latest' + | 'yarn dlx @tanstack/intent@latest' action: 'list' | 'load' skillUse?: string source: 'tool-call' | 'tool-message' } const commandPattern = - 
/^\s*\$?\s*(?:(?:cd\s+.+?\s+&&\s+))?((?:pnpm\s+exec\s+intent)|(?:npx\s+@tanstack\/intent(?:@latest)?)|(?:intent))\s+(list|load)(?:\s+([^\s|;&]+))?/i + /^\s*\$?\s*(?:(?:cd\s+.+?\s+&&\s+))?((?:bunx\s+@tanstack\/intent@latest)|(?:pnpm\s+exec\s+intent)|(?:pnpm\s+dlx\s+@tanstack\/intent@latest)|(?:npx\s+@tanstack\/intent(?:@latest)?)|(?:yarn\s+dlx\s+@tanstack\/intent@latest)|(?:intent))\s+(list|load)(?:\s+([^\s|;&]+))?/i export function parseIntentCommand( raw: string, diff --git a/evals/intent-discovery/harness/setup-intent-condition.ts b/evals/intent-discovery/harness/setup-intent-condition.ts index 56f999a..cf91d6e 100644 --- a/evals/intent-discovery/harness/setup-intent-condition.ts +++ b/evals/intent-discovery/harness/setup-intent-condition.ts @@ -1,7 +1,12 @@ import { mkdirSync, readFileSync, writeFileSync } from 'node:fs' import { join } from 'node:path' +import { + buildIntentSkillGuidanceBlock, + buildIntentSkillsBlock, +} from '../../../packages/intent/src/commands/install-writer.js' import type { IntentDiscoveryCondition } from '../corpus/conditions' import type { ExpectedSkillArea } from '../corpus/tasks' +import type { ScanResult } from '../../../packages/intent/src/types.js' import { expectedSkillUseByArea, packageAllowlistByArea, @@ -121,33 +126,63 @@ function writeAgentsFile({ } function loadingGuidanceBlock(): string { - return ` -## Skill Loading - -Before substantial work: - -- Skill check: run \`npx @tanstack/intent@latest list\`, or use skills already listed in context. -- Skill guidance: if one local skill clearly matches the task, run \`npx @tanstack/intent@latest load #\` and follow the returned \`SKILL.md\`. -- Monorepos: when working across packages, run the skill check from the workspace root and prefer the local skill for the package being changed. -- Multiple matches: prefer the most specific local skill for the package or concern you are changing; load additional skills only when the task spans multiple packages or concerns. 
-` + return buildIntentSkillGuidanceBlock('npm').block.trimEnd() } function mappedGuidanceBlock( expectedSkillAreas: Array, ): string { - const mappings = expectedSkillAreas - .map((area) => { + return buildIntentSkillsBlock( + scanResultForAreas(expectedSkillAreas), + ).block.trimEnd() +} + +function scanResultForAreas( + expectedSkillAreas: Array, +): ScanResult { + return { + conflicts: [], + nodeModules: { + global: { detected: false, exists: false, path: null, scanned: false }, + local: { + detected: true, + exists: true, + path: 'node_modules', + scanned: true, + }, + }, + notices: [], + packageManager: 'npm', + packages: expectedSkillAreas.map((area) => { + const packageName = packageAllowlistByArea[area] const use = expectedSkillUseByArea[area] + const skillName = use.split('#')[1] - return ` - when: "working on ${area} tasks" - use: "${use}"` - }) - .join('\n') + if (!skillName) { + throw new Error(`Invalid expected skill use for ${area}: ${use}`) + } - return ` -# Skill mappings - load \`use\` with \`npx @tanstack/intent@latest load \`. 
-skills: -${mappings} -` + return { + intent: { + docs: 'docs/', + repo: `TanStack/${area}`, + version: 1, + }, + kind: 'npm', + name: packageName, + packageRoot: `node_modules/${packageName}`, + skills: [ + { + description: `Guidance for ${area} eval tasks`, + name: skillName, + path: `node_modules/${packageName}/skills/${skillName}/SKILL.md`, + }, + ], + source: 'local', + version: '0.0.0-intent-eval', + } + }), + stats: { packageJsonCacheHits: 0, packageJsonReadCount: 0 }, + warnings: [], + } } diff --git a/evals/intent-discovery/live-copilot-harness.eval.ts b/evals/intent-discovery/live-copilot-harness.eval.ts index cbd6d66..df5879b 100644 --- a/evals/intent-discovery/live-copilot-harness.eval.ts +++ b/evals/intent-discovery/live-copilot-harness.eval.ts @@ -21,6 +21,7 @@ import { const routerTask = tasks.find( (task) => task.id === 'router-current-intent-loads-router', ) +const liveRunCount = liveRunCountFromEnv() if (!routerTask) { throw new Error('Missing router-current-intent-loads-router task') @@ -100,29 +101,52 @@ describe('Intent discovery live Copilot harness', () => { }) for (const liveTask of liveTasks) { - it.skipIf(process.env.INTENT_DISCOVERY_RUN_LIVE !== '1')( - `live/${liveTask.condition}/${liveTask.fixture}`, - async (context) => { - const result = await runLiveHarness(liveTask) - - attachLiveEvalMetadata({ - contextTask: context.task, - result, - task: liveTask, - }) - - expect(result.artifacts?.runnerStatus).toBe('completed') - expect(result.output.runId).toBe(`live:${liveTask.id}`) - expect(result.artifacts?.transcriptPath).toEqual(expect.any(String)) - expect(result.artifacts?.commandsInvoked).toEqual(expect.any(Array)) - expect(result.artifacts?.loadedSkills).toEqual(expect.any(Array)) - expect(result.artifacts?.setupFilesWritten).toEqual(expect.any(Array)) - }, - 300_000, - ) + for (let runIndex = 1; runIndex <= liveRunCount; runIndex += 1) { + it.skipIf(process.env.INTENT_DISCOVERY_RUN_LIVE !== '1')( + 
`live/${liveTask.condition}/${liveTask.fixture}/run-${runIndex}`, + async (context) => { + const task = liveRunTask(liveTask, runIndex) + const result = await runLiveHarness(task) + + attachLiveEvalMetadata({ + contextTask: context.task, + result, + task, + }) + + expect(result.artifacts?.runnerStatus).toBe('completed') + expect(result.output.runId).toBe(`live:${task.id}`) + expect(result.artifacts?.transcriptPath).toEqual(expect.any(String)) + expect(result.artifacts?.commandsInvoked).toEqual(expect.any(Array)) + expect(result.artifacts?.loadedSkills).toEqual(expect.any(Array)) + expect(result.artifacts?.setupFilesWritten).toEqual(expect.any(Array)) + }, + 300_000, + ) + } } }) +function liveRunCountFromEnv(): number { + const value = Number(process.env.INTENT_DISCOVERY_RUN_COUNT ?? '1') + + if (!Number.isInteger(value) || value < 1) { + return 1 + } + + return value +} + +function liveRunTask( + task: IntentDiscoveryTask, + runIndex: number, +): IntentDiscoveryTask { + return { + ...task, + id: `${task.id}-run-${runIndex}`, + } +} + function attachLiveEvalMetadata({ contextTask, result, diff --git a/package.json b/package.json index 4789fa0..bcc3247 100644 --- a/package.json +++ b/package.json @@ -30,7 +30,9 @@ "eval:intent-discovery:json": "vitest run --config evals/intent-discovery/vitest.evals.config.ts --reporter=default --reporter=json --outputFile.json=evals/intent-discovery/runs/latest/vitest-results.json", "eval:intent-discovery:live": "INTENT_DISCOVERY_RUN_LIVE=1 INTENT_DISCOVERY_COPILOT_COMMAND=\"node $PWD/evals/intent-discovery/bin/copilot-cli-adapter.mjs\" vitest run --config evals/intent-discovery/vitest.evals.config.ts", "eval:intent-discovery:live:json": "INTENT_DISCOVERY_RUN_LIVE=1 INTENT_DISCOVERY_COPILOT_COMMAND=\"node $PWD/evals/intent-discovery/bin/copilot-cli-adapter.mjs\" vitest run --config evals/intent-discovery/vitest.evals.config.ts --reporter=default --reporter=json 
--outputFile.json=evals/intent-discovery/runs/latest/vitest-results.json", + "eval:intent-discovery:judge": "node evals/intent-discovery/bin/llm-judge.mjs evals/intent-discovery/runs/latest/vitest-results.json", "eval:intent-discovery:report": "vitest-evals serve evals/intent-discovery/runs/latest/vitest-results.json", + "eval:intent-discovery:summary": "node evals/intent-discovery/bin/summarize-results.mjs evals/intent-discovery/runs/latest/vitest-results.json", "test": "pnpm run test:ci", "test:ci": "tsc --noEmit && nx run-many --targets=test:eslint,test:sherif,test:knip,test:docs,test:lib,test:integration,test:types,build", "test:docs": "node scripts/verify-links.ts", From af482ee2c4f440d6fdf3a10c773721ac5dcedbc3 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Sat, 20 Jun 2026 23:07:44 +0000 Subject: [PATCH 10/13] ci: apply automated fixes --- .../fixtures/table-v9-basic/src/user-table.tsx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/evals/intent-discovery/fixtures/table-v9-basic/src/user-table.tsx b/evals/intent-discovery/fixtures/table-v9-basic/src/user-table.tsx index 7abc83a..5c07dce 100644 --- a/evals/intent-discovery/fixtures/table-v9-basic/src/user-table.tsx +++ b/evals/intent-discovery/fixtures/table-v9-basic/src/user-table.tsx @@ -45,7 +45,10 @@ export function UserTable({ users }: { users: Array }) { {headerGroup.headers.map((header) => ( - {flexRender(header.column.columnDef.header, header.getContext())} + {flexRender( + header.column.columnDef.header, + header.getContext(), + )} ))} From c1b9ca1d85fc6a3b33eb5a22c40056447467d2f9 Mon Sep 17 00:00:00 2001 From: ladybluenotes Date: Sat, 20 Jun 2026 16:14:31 -0700 Subject: [PATCH 11/13] Enhance intent discovery evaluation: add request timeout for LLM judge, improve error handling, and update fixture dependencies --- eslint.config.js | 4 ++ evals/intent-discovery/bin/llm-judge.mjs | 33 ++++++++++++- 
evals/intent-discovery/fixture-corpus.eval.ts | 3 +- .../fixtures/router-basic/package.json | 6 +-- .../fixtures/saved-transcripts.ts | 2 +- .../fixtures/start-basic/package.json | 6 +-- .../fixtures/table-v9-basic/package.json | 4 +- .../graders/correct-skill-loaded.ts | 4 +- .../intent-discovery/graders/eval-metadata.ts | 7 ++- .../graders/failure-classifier.ts | 8 +-- .../graders/reference-only.ts | 4 +- evals/intent-discovery/graders/skill-areas.ts | 7 ++- .../graders/strict-invocation.ts | 2 +- .../intent-discovery/harness-capture.eval.ts | 35 +++++++++++-- .../harness/live-copilot-harness.ts | 34 +++++++------ .../harness/parse-intent-commands.ts | 5 +- .../harness/prepare-fixture.ts | 4 +- .../harness/run-copilot-task.ts | 49 ++++++++++++++++--- .../harness/saved-transcript-harness.ts | 25 +++++++++- .../harness/setup-intent-condition.ts | 6 +-- .../intent-discovery/intent-discovery.eval.ts | 7 +-- .../live-copilot-harness.eval.ts | 17 +++---- 22 files changed, 200 insertions(+), 72 deletions(-) diff --git a/eslint.config.js b/eslint.config.js index 735a47d..8907d2f 100644 --- a/eslint.config.js +++ b/eslint.config.js @@ -6,6 +6,10 @@ import unusedImports from 'eslint-plugin-unused-imports' /** @type {import('eslint').Linter.Config[]} */ const config = [ ...tanstackConfig, + { + name: 'intent/eval-fixture-ignores', + ignores: ['evals/intent-discovery/fixtures/**/*.tsx'], + }, { name: 'tanstack/temp', plugins: { diff --git a/evals/intent-discovery/bin/llm-judge.mjs b/evals/intent-discovery/bin/llm-judge.mjs index 7efc974..c04538e 100644 --- a/evals/intent-discovery/bin/llm-judge.mjs +++ b/evals/intent-discovery/bin/llm-judge.mjs @@ -7,6 +7,9 @@ const reportPath = process.argv[2] ?? 'evals/intent-discovery/runs/latest/vitest-results.json' const apiKey = process.env.OPENAI_API_KEY const model = process.env.INTENT_DISCOVERY_LLM_JUDGE_MODEL ?? 'gpt-4o-mini' +const requestTimeoutMs = Number( + process.env.INTENT_DISCOVERY_LLM_JUDGE_TIMEOUT_MS ?? 
'30000', +) if (!apiKey) { console.log('Skipped LLM judge: OPENAI_API_KEY is not set.') @@ -66,7 +69,12 @@ function reportCases(report) { } async function judgeCase({ apiKey, item, model }) { - const response = await fetch('https://api.openai.com/v1/chat/completions', { + const controller = new AbortController() + const timeout = setTimeout(() => controller.abort(), requestTimeoutMs) + let response + + try { + response = await fetch('https://api.openai.com/v1/chat/completions', { body: JSON.stringify({ messages: [ { @@ -92,7 +100,17 @@ async function judgeCase({ apiKey, item, model }) { 'content-type': 'application/json', }, method: 'POST', + signal: controller.signal, }) + } catch (error) { + return { + deterministicScores: item.scores, + error: `LLM judge request failed: ${String(error)}`, + title: item.title, + } + } finally { + clearTimeout(timeout) + } if (!response.ok) { return { @@ -103,10 +121,21 @@ async function judgeCase({ apiKey, item, model }) { const body = await response.json() const content = body.choices?.[0]?.message?.content ?? 
'{}' + let judgment + try { + judgment = JSON.parse(content) + } catch (error) { + return { + deterministicScores: item.scores, + error: `Invalid JSON from model: ${String(error)}`, + rawContent: content, + title: item.title, + } + } return { deterministicScores: item.scores, - judgment: JSON.parse(content), + judgment, title: item.title, } } diff --git a/evals/intent-discovery/fixture-corpus.eval.ts b/evals/intent-discovery/fixture-corpus.eval.ts index d9054a3..cf27a17 100644 --- a/evals/intent-discovery/fixture-corpus.eval.ts +++ b/evals/intent-discovery/fixture-corpus.eval.ts @@ -3,7 +3,8 @@ import { dirname, join } from 'node:path' import { fileURLToPath } from 'node:url' import { describe, expect, it } from 'vitest' import { fixtures } from './corpus/fixtures' -import { tasks, type ExpectedSkillArea } from './corpus/tasks' +import { tasks } from './corpus/tasks' +import type {ExpectedSkillArea} from './corpus/tasks'; const fixturesDir = join(dirname(fileURLToPath(import.meta.url)), 'fixtures') diff --git a/evals/intent-discovery/fixtures/router-basic/package.json b/evals/intent-discovery/fixtures/router-basic/package.json index ec20a76..268e5c7 100644 --- a/evals/intent-discovery/fixtures/router-basic/package.json +++ b/evals/intent-discovery/fixtures/router-basic/package.json @@ -3,8 +3,8 @@ "private": true, "type": "module", "dependencies": { - "@tanstack/react-router": "latest", - "react": "latest", - "react-dom": "latest" + "@tanstack/react-router": "1.170.16", + "react": "19.2.0", + "react-dom": "19.2.0" } } diff --git a/evals/intent-discovery/fixtures/saved-transcripts.ts b/evals/intent-discovery/fixtures/saved-transcripts.ts index 3471cf8..69c05d7 100644 --- a/evals/intent-discovery/fixtures/saved-transcripts.ts +++ b/evals/intent-discovery/fixtures/saved-transcripts.ts @@ -1,6 +1,6 @@ +import { tasks } from '../corpus/tasks' import type { NormalizedMessage, ToolCallRecord } from 'vitest-evals' import type { IntentDiscoveryTask } from '../corpus/tasks' 
-import { tasks } from '../corpus/tasks' export type SavedTranscriptCase = IntentDiscoveryTask & { finalAnswer: string diff --git a/evals/intent-discovery/fixtures/start-basic/package.json b/evals/intent-discovery/fixtures/start-basic/package.json index bd9f7a5..3204434 100644 --- a/evals/intent-discovery/fixtures/start-basic/package.json +++ b/evals/intent-discovery/fixtures/start-basic/package.json @@ -3,9 +3,9 @@ "private": true, "type": "module", "dependencies": { - "@tanstack/react-router": "latest", + "@tanstack/react-router": "1.170.16", "@tanstack/react-start": "1.168.26", - "react": "latest", - "react-dom": "latest" + "react": "19.2.0", + "react-dom": "19.2.0" } } diff --git a/evals/intent-discovery/fixtures/table-v9-basic/package.json b/evals/intent-discovery/fixtures/table-v9-basic/package.json index e58f3c1..0a17270 100644 --- a/evals/intent-discovery/fixtures/table-v9-basic/package.json +++ b/evals/intent-discovery/fixtures/table-v9-basic/package.json @@ -4,7 +4,7 @@ "type": "module", "dependencies": { "@tanstack/react-table": "9.0.0-beta.16", - "react": "latest", - "react-dom": "latest" + "react": "19.2.0", + "react-dom": "19.2.0" } } diff --git a/evals/intent-discovery/graders/correct-skill-loaded.ts b/evals/intent-discovery/graders/correct-skill-loaded.ts index 99f3e33..b24e307 100644 --- a/evals/intent-discovery/graders/correct-skill-loaded.ts +++ b/evals/intent-discovery/graders/correct-skill-loaded.ts @@ -1,7 +1,7 @@ -import type { HarnessRun } from 'vitest-evals' -import type { ExpectedSkillArea } from '../corpus/tasks' import { loadedSkillUsesFromRun } from '../harness/parse-intent-commands' import { listIncludesExpectedSkillArea } from './skill-areas' +import type { HarnessRun } from 'vitest-evals' +import type { ExpectedSkillArea } from '../corpus/tasks' export type CorrectSkillLoadedResult = { passed: boolean diff --git a/evals/intent-discovery/graders/eval-metadata.ts b/evals/intent-discovery/graders/eval-metadata.ts index f666baf..d09aaf1 
100644 --- a/evals/intent-discovery/graders/eval-metadata.ts +++ b/evals/intent-discovery/graders/eval-metadata.ts @@ -1,5 +1,5 @@ -import type { HarnessRun, JudgeResult, JsonValue } from 'vitest-evals' import { toolCalls } from 'vitest-evals' +import type { HarnessRun, JsonValue, JudgeResult } from 'vitest-evals' export type NamedJudgeResult = JudgeResult & { name: string } @@ -34,7 +34,10 @@ export function attachEvalMetadata({ task: RuntimeTask }): void { const avgScore = - scores.reduce((total, item) => total + (item.score ?? 0), 0) / scores.length + scores.length === 0 + ? 0 + : scores.reduce((total, item) => total + (item.score ?? 0), 0) / + scores.length task.meta.harness = { name: harnessName, diff --git a/evals/intent-discovery/graders/failure-classifier.ts b/evals/intent-discovery/graders/failure-classifier.ts index 62ec9dd..82873c3 100644 --- a/evals/intent-discovery/graders/failure-classifier.ts +++ b/evals/intent-discovery/graders/failure-classifier.ts @@ -1,11 +1,11 @@ -import type { HarnessRun } from 'vitest-evals' +import { correctSkillLoaded } from './correct-skill-loaded' +import { referenceOnly } from './reference-only' +import { strictIntentInvocation } from './strict-invocation' import type { ExpectedSkillArea, IntentDiscoveryFailureClass, } from '../corpus/tasks' -import { correctSkillLoaded } from './correct-skill-loaded' -import { referenceOnly } from './reference-only' -import { strictIntentInvocation } from './strict-invocation' +import type { HarnessRun } from 'vitest-evals' export function classifyFailure( run: HarnessRun, diff --git a/evals/intent-discovery/graders/reference-only.ts b/evals/intent-discovery/graders/reference-only.ts index 1bcd2d6..a3d0f07 100644 --- a/evals/intent-discovery/graders/reference-only.ts +++ b/evals/intent-discovery/graders/reference-only.ts @@ -1,7 +1,7 @@ -import type { HarnessRun } from 'vitest-evals' -import type { ExpectedSkillArea } from '../corpus/tasks' import { jsonToSearchableText, 
textMatchesSkillArea } from './skill-areas' import { strictIntentInvocation } from './strict-invocation' +import type { HarnessRun } from 'vitest-evals' +import type { ExpectedSkillArea } from '../corpus/tasks' export function referenceOnly( run: HarnessRun, diff --git a/evals/intent-discovery/graders/skill-areas.ts b/evals/intent-discovery/graders/skill-areas.ts index cfd51cd..64c869b 100644 --- a/evals/intent-discovery/graders/skill-areas.ts +++ b/evals/intent-discovery/graders/skill-areas.ts @@ -4,7 +4,12 @@ import type { ExpectedSkillArea } from '../corpus/tasks' const skillAreaPatterns: Record> = { router: [/router/i, /routing/i, /@tanstack\/router/i], start: [/tanstack start/i, /react-start/i, /server function/i, /full-stack/i], - 'table-v9': [/tanstack table/i, /react-table/i, /table v9/i, /v9/i], + 'table-v9': [ + /tanstack table/i, + /react-table/i, + /@tanstack\/react-table/i, + /\btable[\s-]?v9\b/i, + ], } export function jsonToSearchableText(value: JsonValue | undefined): string { diff --git a/evals/intent-discovery/graders/strict-invocation.ts b/evals/intent-discovery/graders/strict-invocation.ts index 80eb487..06e4ff3 100644 --- a/evals/intent-discovery/graders/strict-invocation.ts +++ b/evals/intent-discovery/graders/strict-invocation.ts @@ -1,5 +1,5 @@ -import type { HarnessRun } from 'vitest-evals' import { intentCommandsFromRun } from '../harness/parse-intent-commands' +import type { HarnessRun } from 'vitest-evals' export type StrictInvocationResult = { passed: boolean diff --git a/evals/intent-discovery/harness-capture.eval.ts b/evals/intent-discovery/harness-capture.eval.ts index d664227..621d7f8 100644 --- a/evals/intent-discovery/harness-capture.eval.ts +++ b/evals/intent-discovery/harness-capture.eval.ts @@ -1,9 +1,7 @@ -import { existsSync, mkdirSync, readFileSync } from 'node:fs' -import { mkdtempSync, rmSync } from 'node:fs' +import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync } from 'node:fs' import { tmpdir } from 
'node:os' import { join } from 'node:path' import { describe, expect, it } from 'vitest' -import type { ToolCallRecord } from 'vitest-evals' import { fixtures } from './corpus/fixtures' import { tasks } from './corpus/tasks' import { @@ -11,6 +9,7 @@ import { parseIntentCommand, } from './harness/parse-intent-commands' import { prepareFixtureWorkspace } from './harness/prepare-fixture' +import type { ToolCallRecord } from 'vitest-evals' describe('Intent discovery harness capture', () => { it('parses accepted Intent command forms from tool calls', () => { @@ -46,6 +45,10 @@ describe('Intent discovery harness capture', () => { name: 'shell_command', arguments: { command: 'pnpm dlx @tanstack/intent@latest list' }, }, + { + name: 'shell_command', + arguments: { command: 'pnpm dlx @tanstack/intent list' }, + }, { name: 'shell_command', arguments: { @@ -57,6 +60,10 @@ describe('Intent discovery harness capture', () => { name: 'shell_command', arguments: { command: 'yarn dlx @tanstack/intent@latest list' }, }, + { + name: 'shell_command', + arguments: { command: 'yarn dlx @tanstack/intent list' }, + }, { name: 'shell_command', arguments: { @@ -67,6 +74,10 @@ describe('Intent discovery harness capture', () => { name: 'shell_command', arguments: { command: 'bunx @tanstack/intent@latest list' }, }, + { + name: 'shell_command', + arguments: { command: 'bunx @tanstack/intent list' }, + }, ] expect(intentCommandsFromToolCalls(calls)).toEqual([ @@ -110,6 +121,12 @@ describe('Intent discovery harness capture', () => { action: 'list', source: 'tool-call', }, + { + raw: 'pnpm dlx @tanstack/intent list', + executable: 'pnpm dlx @tanstack/intent', + action: 'list', + source: 'tool-call', + }, { raw: 'yarn dlx @tanstack/intent@latest load @tanstack/router#routing', executable: 'yarn dlx @tanstack/intent@latest', @@ -123,6 +140,12 @@ describe('Intent discovery harness capture', () => { action: 'list', source: 'tool-call', }, + { + raw: 'yarn dlx @tanstack/intent list', + executable: 
'yarn dlx @tanstack/intent', + action: 'list', + source: 'tool-call', + }, { raw: 'bunx @tanstack/intent@latest load @tanstack/router#routing', executable: 'bunx @tanstack/intent@latest', @@ -136,6 +159,12 @@ describe('Intent discovery harness capture', () => { action: 'list', source: 'tool-call', }, + { + raw: 'bunx @tanstack/intent list', + executable: 'bunx @tanstack/intent', + action: 'list', + source: 'tool-call', + }, ]) }) diff --git a/evals/intent-discovery/harness/live-copilot-harness.ts b/evals/intent-discovery/harness/live-copilot-harness.ts index 6e80365..decd58b 100644 --- a/evals/intent-discovery/harness/live-copilot-harness.ts +++ b/evals/intent-discovery/harness/live-copilot-harness.ts @@ -1,5 +1,4 @@ import { createHarness } from 'vitest-evals' -import type { IntentDiscoveryTask } from '../corpus/tasks' import { intentCommandsFromToolCalls } from './parse-intent-commands' import { prepareFixtureWorkspace } from './prepare-fixture' import { @@ -7,6 +6,7 @@ import { runCopilotTask, } from './run-copilot-task' import { applyIntentCondition } from './setup-intent-condition' +import type { IntentDiscoveryTask } from '../corpus/tasks' export type LiveCopilotOutput = { finalAnswer: string @@ -20,22 +20,24 @@ export const liveCopilotHarness = createHarness< name: 'intent-discovery-live-copilot', run: async ({ input, setArtifact }) => { const runId = `live:${input.id}` - const prepared = prepareFixtureWorkspace({ fixture: input.fixture }) - const appliedCondition = applyIntentCondition({ - condition: input.condition, - expectedSkillAreas: input.expectedSkillAreas, - workspacePath: prepared.workspacePath, - }) - - setCommonArtifacts({ - input, - runId, - setupFilesWritten: appliedCondition.filesWritten, - workspacePath: prepared.workspacePath, - setArtifact, - }) + let prepared: ReturnType | undefined try { + prepared = prepareFixtureWorkspace({ fixture: input.fixture }) + const appliedCondition = applyIntentCondition({ + condition: input.condition, + 
expectedSkillAreas: input.expectedSkillAreas, + workspacePath: prepared.workspacePath, + }) + + setCommonArtifacts({ + input, + runId, + setupFilesWritten: appliedCondition.filesWritten, + workspacePath: prepared.workspacePath, + setArtifact, + }) + const run = await runCopilotTask({ task: input, runId, @@ -140,7 +142,7 @@ export const liveCopilotHarness = createHarness< errors: [normalizedError], } } finally { - prepared.cleanup() + prepared?.cleanup() } }, }) diff --git a/evals/intent-discovery/harness/parse-intent-commands.ts b/evals/intent-discovery/harness/parse-intent-commands.ts index 0d2896a..fda01cd 100644 --- a/evals/intent-discovery/harness/parse-intent-commands.ts +++ b/evals/intent-discovery/harness/parse-intent-commands.ts @@ -5,12 +5,15 @@ import type { HarnessRun, ToolCallRecord } from 'vitest-evals' export type ParsedIntentCommand = { raw: string executable: + | 'bunx @tanstack/intent' | 'bunx @tanstack/intent@latest' | 'intent' | 'pnpm exec intent' + | 'pnpm dlx @tanstack/intent' | 'pnpm dlx @tanstack/intent@latest' | 'npx @tanstack/intent' | 'npx @tanstack/intent@latest' + | 'yarn dlx @tanstack/intent' | 'yarn dlx @tanstack/intent@latest' action: 'list' | 'load' skillUse?: string @@ -18,7 +21,7 @@ export type ParsedIntentCommand = { } const commandPattern = - /^\s*\$?\s*(?:(?:cd\s+.+?\s+&&\s+))?((?:bunx\s+@tanstack\/intent@latest)|(?:pnpm\s+exec\s+intent)|(?:pnpm\s+dlx\s+@tanstack\/intent@latest)|(?:npx\s+@tanstack\/intent(?:@latest)?)|(?:yarn\s+dlx\s+@tanstack\/intent@latest)|(?:intent))\s+(list|load)(?:\s+([^\s|;&]+))?/i + /^\s*\$?\s*(?:(?:cd\s+.+?\s+&&\s+))?((?:bunx\s+@tanstack\/intent(?:@latest)?)|(?:pnpm\s+exec\s+intent)|(?:pnpm\s+dlx\s+@tanstack\/intent(?:@latest)?)|(?:npx\s+@tanstack\/intent(?:@latest)?)|(?:yarn\s+dlx\s+@tanstack\/intent(?:@latest)?)|(?:intent))\s+(list|load)(?:\s+([^\s|;&]+))?/i export function parseIntentCommand( raw: string, diff --git a/evals/intent-discovery/harness/prepare-fixture.ts 
b/evals/intent-discovery/harness/prepare-fixture.ts index dccb6d8..59d23c4 100644 --- a/evals/intent-discovery/harness/prepare-fixture.ts +++ b/evals/intent-discovery/harness/prepare-fixture.ts @@ -7,7 +7,7 @@ import { rmSync, } from 'node:fs' import { tmpdir } from 'node:os' -import { basename, dirname, join } from 'node:path' +import { basename, dirname, join, sep } from 'node:path' import { fileURLToPath } from 'node:url' import type { IntentDiscoveryFixture } from '../corpus/tasks' @@ -47,7 +47,7 @@ export function prepareFixtureWorkspace({ cpSync(sourcePath, workspacePath, { recursive: true, verbatimSymlinks: true, - filter: (source) => !source.includes(`${fixturesDir}/runs/`), + filter: (source) => !source.includes(`${fixturesDir}${sep}runs${sep}`), }) return { diff --git a/evals/intent-discovery/harness/run-copilot-task.ts b/evals/intent-discovery/harness/run-copilot-task.ts index 5dcc21b..f1295aa 100644 --- a/evals/intent-discovery/harness/run-copilot-task.ts +++ b/evals/intent-discovery/harness/run-copilot-task.ts @@ -1,17 +1,20 @@ -import type { - NormalizedMessage, - ToolCallRecord, - UsageSummary, -} from 'vitest-evals' -import type { IntentDiscoveryTask } from '../corpus/tasks' import { mkdirSync, writeFileSync } from 'node:fs' import { dirname, join } from 'node:path' import { fileURLToPath } from 'node:url' import { spawn } from 'node:child_process' import { parseIntentCommand } from './parse-intent-commands' +import type { IntentDiscoveryTask } from '../corpus/tasks' +import type { + NormalizedMessage, + ToolCallRecord, + UsageSummary, +} from 'vitest-evals' const evalDir = dirname(dirname(fileURLToPath(import.meta.url))) const transcriptDir = join(evalDir, 'runs', 'latest', 'transcripts') +const commandTimeoutMs = Number( + process.env.INTENT_DISCOVERY_COMMAND_TIMEOUT_MS ?? 
'300000', +) export class LiveCopilotRunnerUnavailableError extends Error { constructor() { @@ -125,6 +128,7 @@ async function runCommand({ input: RunCopilotTaskInput }): Promise { return new Promise((resolve, reject) => { + let settled = false const child = spawn(command, { cwd: input.workspacePath, shell: true, @@ -139,11 +143,25 @@ async function runCommand({ }) const stdoutChunks: Array = [] const stderrChunks: Array = [] + const timeout = setTimeout(() => { + if (settled) return + settled = true + child.kill('SIGKILL') + reject(new Error(`Copilot command timed out after ${commandTimeoutMs}ms`)) + }, commandTimeoutMs) child.stdout.on('data', (chunk: Buffer) => stdoutChunks.push(chunk)) child.stderr.on('data', (chunk: Buffer) => stderrChunks.push(chunk)) - child.on('error', reject) + child.on('error', (error) => { + if (settled) return + settled = true + clearTimeout(timeout) + reject(error) + }) child.on('close', (exitCode) => { + if (settled) return + settled = true + clearTimeout(timeout) resolve({ stdout: Buffer.concat(stdoutChunks).toString('utf8'), stderr: Buffer.concat(stderrChunks).toString('utf8'), @@ -244,14 +262,29 @@ async function runDiff( workspacePath: string, ): Promise { return new Promise((resolve, reject) => { + let settled = false const child = spawn('diff', ['-ruN', sourcePath, workspacePath]) const stdoutChunks: Array = [] const stderrChunks: Array = [] + const timeout = setTimeout(() => { + if (settled) return + settled = true + child.kill('SIGKILL') + reject(new Error(`diff timed out after ${commandTimeoutMs}ms`)) + }, commandTimeoutMs) child.stdout.on('data', (chunk: Buffer) => stdoutChunks.push(chunk)) child.stderr.on('data', (chunk: Buffer) => stderrChunks.push(chunk)) - child.on('error', reject) + child.on('error', (error) => { + if (settled) return + settled = true + clearTimeout(timeout) + reject(error) + }) child.on('close', (exitCode) => { + if (settled) return + settled = true + clearTimeout(timeout) resolve({ stdout: 
Buffer.concat(stdoutChunks).toString('utf8'), stderr: Buffer.concat(stderrChunks).toString('utf8'), diff --git a/evals/intent-discovery/harness/saved-transcript-harness.ts b/evals/intent-discovery/harness/saved-transcript-harness.ts index e8c0731..49a51be 100644 --- a/evals/intent-discovery/harness/saved-transcript-harness.ts +++ b/evals/intent-discovery/harness/saved-transcript-harness.ts @@ -95,8 +95,31 @@ function messagesWithToolCalls( index === firstAssistantIndex ? { ...message, - toolCalls: [...(message.toolCalls ?? []), ...toolCalls], + toolCalls: mergeToolCalls(message.toolCalls ?? [], toolCalls), } : message, ) } + +function mergeToolCalls( + existing: Array, + incoming: Array, +): Array { + const seen = new Set( + existing.map((call) => `${call.name}:${JSON.stringify(call.arguments ?? {})}`), + ) + + return [ + ...existing, + ...incoming.filter((call) => { + const key = `${call.name}:${JSON.stringify(call.arguments ?? {})}` + + if (seen.has(key)) { + return false + } + + seen.add(key) + return true + }), + ] +} diff --git a/evals/intent-discovery/harness/setup-intent-condition.ts b/evals/intent-discovery/harness/setup-intent-condition.ts index cf91d6e..3ae9da5 100644 --- a/evals/intent-discovery/harness/setup-intent-condition.ts +++ b/evals/intent-discovery/harness/setup-intent-condition.ts @@ -4,13 +4,13 @@ import { buildIntentSkillGuidanceBlock, buildIntentSkillsBlock, } from '../../../packages/intent/src/commands/install-writer.js' -import type { IntentDiscoveryCondition } from '../corpus/conditions' -import type { ExpectedSkillArea } from '../corpus/tasks' -import type { ScanResult } from '../../../packages/intent/src/types.js' import { expectedSkillUseByArea, packageAllowlistByArea, } from '../corpus/skill-uses' +import type { IntentDiscoveryCondition } from '../corpus/conditions' +import type { ExpectedSkillArea } from '../corpus/tasks' +import type { ScanResult } from '../../../packages/intent/src/types.js' export type AppliedIntentCondition = { 
condition: IntentDiscoveryCondition diff --git a/evals/intent-discovery/intent-discovery.eval.ts b/evals/intent-discovery/intent-discovery.eval.ts index ff165ae..79bbceb 100644 --- a/evals/intent-discovery/intent-discovery.eval.ts +++ b/evals/intent-discovery/intent-discovery.eval.ts @@ -1,4 +1,3 @@ -import type { HarnessContext, HarnessRun } from 'vitest-evals' import { describe, expect, it } from 'vitest' import { failedSpans, toolCalls } from 'vitest-evals' import { countsTowardAutonomousScore } from './corpus/conditions' @@ -10,8 +9,8 @@ import { strictIntentInvocation } from './graders/strict-invocation' import { savedTranscriptCases } from './fixtures/saved-transcripts' import { savedTranscriptHarness, - type IntentDiscoveryOutput, } from './harness/saved-transcript-harness' +import type { HarnessContext } from 'vitest-evals' describe('Intent discovery saved transcripts', () => { for (const evalCase of savedTranscriptCases) { @@ -79,7 +78,5 @@ async function runSavedTranscript( }, } - return savedTranscriptHarness.run(evalCase, context) as Promise< - HarnessRun - > + return savedTranscriptHarness.run(evalCase, context) } diff --git a/evals/intent-discovery/live-copilot-harness.eval.ts b/evals/intent-discovery/live-copilot-harness.eval.ts index df5879b..7beca0a 100644 --- a/evals/intent-discovery/live-copilot-harness.eval.ts +++ b/evals/intent-discovery/live-copilot-harness.eval.ts @@ -1,22 +1,23 @@ -import { existsSync, writeFileSync } from 'node:fs' -import { mkdtempSync, rmSync } from 'node:fs' +import { existsSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs' import { tmpdir } from 'node:os' import { join } from 'node:path' -import type { HarnessContext, HarnessRun } from 'vitest-evals' import { describe, expect, it } from 'vitest' import { failedSpans, toolCalls } from 'vitest-evals' import { countsTowardAutonomousScore } from './corpus/conditions' import { liveTasks } from './corpus/live-tasks' -import { tasks, type IntentDiscoveryTask } from 
'./corpus/tasks' +import { tasks } from './corpus/tasks' import { correctSkillLoaded } from './graders/correct-skill-loaded' import { attachEvalMetadata, score } from './graders/eval-metadata' import { classifyFailure } from './graders/failure-classifier' import { referenceOnly } from './graders/reference-only' import { strictIntentInvocation } from './graders/strict-invocation' import { - liveCopilotHarness, - type LiveCopilotOutput, + + liveCopilotHarness } from './harness/live-copilot-harness' +import type {IntentDiscoveryTask} from './corpus/tasks'; +import type {LiveCopilotOutput} from './harness/live-copilot-harness'; +import type { HarnessContext, HarnessRun } from 'vitest-evals' const routerTask = tasks.find( (task) => task.id === 'router-current-intent-loads-router', @@ -221,7 +222,5 @@ async function runLiveHarness( }, } - return liveCopilotHarness.run(task, context) as Promise< - HarnessRun - > + return liveCopilotHarness.run(task, context) } From 7bb199b85da47999ed4694d3fda9c7407be23f4b Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Sat, 20 Jun 2026 23:15:22 +0000 Subject: [PATCH 12/13] ci: apply automated fixes --- evals/intent-discovery/bin/llm-judge.mjs | 54 +++++++++---------- evals/intent-discovery/fixture-corpus.eval.ts | 4 +- .../intent-discovery/harness-capture.eval.ts | 8 ++- .../harness/saved-transcript-harness.ts | 4 +- .../intent-discovery/intent-discovery.eval.ts | 4 +- .../live-copilot-harness.eval.ts | 13 ++--- 6 files changed, 45 insertions(+), 42 deletions(-) diff --git a/evals/intent-discovery/bin/llm-judge.mjs b/evals/intent-discovery/bin/llm-judge.mjs index c04538e..0e3bd57 100644 --- a/evals/intent-discovery/bin/llm-judge.mjs +++ b/evals/intent-discovery/bin/llm-judge.mjs @@ -75,33 +75,33 @@ async function judgeCase({ apiKey, item, model }) { try { response = await fetch('https://api.openai.com/v1/chat/completions', { - body: JSON.stringify({ - messages: [ - { - role: 
'system', - content: - 'You judge whether a coding agent output appears to apply loaded library skill guidance. You must not decide whether Intent was invoked; that is provided by deterministic scores. Return strict JSON only.', - }, - { - role: 'user', - content: JSON.stringify({ - instruction: - 'Assess final output quality only. Return {"appliedGuidance":"yes"|"no"|"unknown","rationale":"..."}. Use unknown when evidence is insufficient.', - item, - }), - }, - ], - model, - response_format: { type: 'json_object' }, - temperature: 0, - }), - headers: { - authorization: `Bearer ${apiKey}`, - 'content-type': 'application/json', - }, - method: 'POST', - signal: controller.signal, - }) + body: JSON.stringify({ + messages: [ + { + role: 'system', + content: + 'You judge whether a coding agent output appears to apply loaded library skill guidance. You must not decide whether Intent was invoked; that is provided by deterministic scores. Return strict JSON only.', + }, + { + role: 'user', + content: JSON.stringify({ + instruction: + 'Assess final output quality only. Return {"appliedGuidance":"yes"|"no"|"unknown","rationale":"..."}. 
Use unknown when evidence is insufficient.', + item, + }), + }, + ], + model, + response_format: { type: 'json_object' }, + temperature: 0, + }), + headers: { + authorization: `Bearer ${apiKey}`, + 'content-type': 'application/json', + }, + method: 'POST', + signal: controller.signal, + }) } catch (error) { return { deterministicScores: item.scores, diff --git a/evals/intent-discovery/fixture-corpus.eval.ts b/evals/intent-discovery/fixture-corpus.eval.ts index cf27a17..bba16b5 100644 --- a/evals/intent-discovery/fixture-corpus.eval.ts +++ b/evals/intent-discovery/fixture-corpus.eval.ts @@ -3,8 +3,8 @@ import { dirname, join } from 'node:path' import { fileURLToPath } from 'node:url' import { describe, expect, it } from 'vitest' import { fixtures } from './corpus/fixtures' -import { tasks } from './corpus/tasks' -import type {ExpectedSkillArea} from './corpus/tasks'; +import { tasks } from './corpus/tasks' +import type { ExpectedSkillArea } from './corpus/tasks' const fixturesDir = join(dirname(fileURLToPath(import.meta.url)), 'fixtures') diff --git a/evals/intent-discovery/harness-capture.eval.ts b/evals/intent-discovery/harness-capture.eval.ts index 621d7f8..75188a7 100644 --- a/evals/intent-discovery/harness-capture.eval.ts +++ b/evals/intent-discovery/harness-capture.eval.ts @@ -1,4 +1,10 @@ -import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync } from 'node:fs' +import { + existsSync, + mkdirSync, + mkdtempSync, + readFileSync, + rmSync, +} from 'node:fs' import { tmpdir } from 'node:os' import { join } from 'node:path' import { describe, expect, it } from 'vitest' diff --git a/evals/intent-discovery/harness/saved-transcript-harness.ts b/evals/intent-discovery/harness/saved-transcript-harness.ts index 49a51be..2263e61 100644 --- a/evals/intent-discovery/harness/saved-transcript-harness.ts +++ b/evals/intent-discovery/harness/saved-transcript-harness.ts @@ -106,7 +106,9 @@ function mergeToolCalls( incoming: Array, ): Array { const seen = new Set( - 
existing.map((call) => `${call.name}:${JSON.stringify(call.arguments ?? {})}`), + existing.map( + (call) => `${call.name}:${JSON.stringify(call.arguments ?? {})}`, + ), ) return [ diff --git a/evals/intent-discovery/intent-discovery.eval.ts b/evals/intent-discovery/intent-discovery.eval.ts index 79bbceb..76f4e66 100644 --- a/evals/intent-discovery/intent-discovery.eval.ts +++ b/evals/intent-discovery/intent-discovery.eval.ts @@ -7,9 +7,7 @@ import { classifyFailure } from './graders/failure-classifier' import { referenceOnly } from './graders/reference-only' import { strictIntentInvocation } from './graders/strict-invocation' import { savedTranscriptCases } from './fixtures/saved-transcripts' -import { - savedTranscriptHarness, -} from './harness/saved-transcript-harness' +import { savedTranscriptHarness } from './harness/saved-transcript-harness' import type { HarnessContext } from 'vitest-evals' describe('Intent discovery saved transcripts', () => { diff --git a/evals/intent-discovery/live-copilot-harness.eval.ts b/evals/intent-discovery/live-copilot-harness.eval.ts index 7beca0a..631f7cf 100644 --- a/evals/intent-discovery/live-copilot-harness.eval.ts +++ b/evals/intent-discovery/live-copilot-harness.eval.ts @@ -1,22 +1,19 @@ -import { existsSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs' +import { existsSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs' import { tmpdir } from 'node:os' import { join } from 'node:path' import { describe, expect, it } from 'vitest' import { failedSpans, toolCalls } from 'vitest-evals' import { countsTowardAutonomousScore } from './corpus/conditions' import { liveTasks } from './corpus/live-tasks' -import { tasks } from './corpus/tasks' +import { tasks } from './corpus/tasks' import { correctSkillLoaded } from './graders/correct-skill-loaded' import { attachEvalMetadata, score } from './graders/eval-metadata' import { classifyFailure } from './graders/failure-classifier' import { referenceOnly } from 
'./graders/reference-only' import { strictIntentInvocation } from './graders/strict-invocation' -import { - - liveCopilotHarness -} from './harness/live-copilot-harness' -import type {IntentDiscoveryTask} from './corpus/tasks'; -import type {LiveCopilotOutput} from './harness/live-copilot-harness'; +import { liveCopilotHarness } from './harness/live-copilot-harness' +import type { IntentDiscoveryTask } from './corpus/tasks' +import type { LiveCopilotOutput } from './harness/live-copilot-harness' import type { HarnessContext, HarnessRun } from 'vitest-evals' const routerTask = tasks.find( From 50724d5be1595321ee5d11f3fada747927023923 Mon Sep 17 00:00:00 2001 From: ladybluenotes Date: Sat, 20 Jun 2026 16:26:45 -0700 Subject: [PATCH 13/13] Refactor intent discovery types and functions: remove exports for internal types, update fixture handling, and enhance reference-only logic --- evals/intent-discovery/corpus/conditions.ts | 4 +-- evals/intent-discovery/corpus/tasks.ts | 4 +-- evals/intent-discovery/fixture-corpus.eval.ts | 12 ++++++--- .../graders/reference-only.ts | 1 + .../intent-discovery/harness-capture.eval.ts | 26 +++++++++++++++++++ .../harness/prepare-fixture.ts | 2 +- knip.json | 8 +++++- 7 files changed, 48 insertions(+), 9 deletions(-) diff --git a/evals/intent-discovery/corpus/conditions.ts b/evals/intent-discovery/corpus/conditions.ts index ec960c9..656d067 100644 --- a/evals/intent-discovery/corpus/conditions.ts +++ b/evals/intent-discovery/corpus/conditions.ts @@ -1,4 +1,4 @@ -export const intentDiscoveryConditions = [ +const intentDiscoveryConditions = [ { id: 'no-intent', countsTowardAutonomousScore: true, @@ -24,7 +24,7 @@ export const intentDiscoveryConditions = [ export type IntentDiscoveryCondition = (typeof intentDiscoveryConditions)[number]['id'] -export const promptExplicitnessLevels = [0, 1, 2, 3, 4] as const +const promptExplicitnessLevels = [0, 1, 2, 3, 4] as const export type PromptExplicitnessLevel = (typeof 
promptExplicitnessLevels)[number] diff --git a/evals/intent-discovery/corpus/tasks.ts b/evals/intent-discovery/corpus/tasks.ts index 29bfa63..ca425e3 100644 --- a/evals/intent-discovery/corpus/tasks.ts +++ b/evals/intent-discovery/corpus/tasks.ts @@ -3,7 +3,7 @@ import type { PromptExplicitnessLevel, } from './conditions' -export const expectedSkillAreas = ['router', 'start', 'table-v9'] as const +const expectedSkillAreas = ['router', 'start', 'table-v9'] as const export type ExpectedSkillArea = (typeof expectedSkillAreas)[number] @@ -27,7 +27,7 @@ export type IntentDiscoveryFailureClass = | 'prompt-too-vague' | 'harness-error' -export type IntentDiscoveryExpected = { +type IntentDiscoveryExpected = { strictInvocation: boolean correctSkillLoaded: boolean referenceOnly: boolean diff --git a/evals/intent-discovery/fixture-corpus.eval.ts b/evals/intent-discovery/fixture-corpus.eval.ts index bba16b5..1ab4d7a 100644 --- a/evals/intent-discovery/fixture-corpus.eval.ts +++ b/evals/intent-discovery/fixture-corpus.eval.ts @@ -4,7 +4,7 @@ import { fileURLToPath } from 'node:url' import { describe, expect, it } from 'vitest' import { fixtures } from './corpus/fixtures' import { tasks } from './corpus/tasks' -import type { ExpectedSkillArea } from './corpus/tasks' +import type { IntentDiscoveryFixtureDefinition } from './corpus/fixtures' const fixturesDir = join(dirname(fileURLToPath(import.meta.url)), 'fixtures') @@ -22,12 +22,18 @@ describe('Intent discovery fixture corpus', () => { it('points each task at a fixture that covers its expected skill areas', () => { for (const task of tasks) { - const fixture = fixtures[task.fixture] + const fixture = ( + fixtures as Partial> + )[task.fixture] expect(fixture, `${task.id} uses an unknown fixture`).toBeDefined() + if (!fixture) { + continue + } + expect( task.expectedSkillAreas.every((area) => - (fixture.skillAreas as Array).includes(area), + fixture.skillAreas.includes(area), ), `${task.id} expects 
${task.expectedSkillAreas.join(', ')} but ${fixture.id} covers ${fixture.skillAreas.join(', ')}`, ).toBe(true) diff --git a/evals/intent-discovery/graders/reference-only.ts b/evals/intent-discovery/graders/reference-only.ts index a3d0f07..38f2a87 100644 --- a/evals/intent-discovery/graders/reference-only.ts +++ b/evals/intent-discovery/graders/reference-only.ts @@ -12,6 +12,7 @@ export function referenceOnly( } const transcriptText = run.session.messages + .filter((message) => message.role !== 'user') .map((message) => jsonToSearchableText(message.content)) .join('\n') diff --git a/evals/intent-discovery/harness-capture.eval.ts b/evals/intent-discovery/harness-capture.eval.ts index 75188a7..a38dbb8 100644 --- a/evals/intent-discovery/harness-capture.eval.ts +++ b/evals/intent-discovery/harness-capture.eval.ts @@ -10,6 +10,7 @@ import { join } from 'node:path' import { describe, expect, it } from 'vitest' import { fixtures } from './corpus/fixtures' import { tasks } from './corpus/tasks' +import { referenceOnly } from './graders/reference-only' import { intentCommandsFromToolCalls, parseIntentCommand, @@ -183,6 +184,31 @@ describe('Intent discovery harness capture', () => { ).toBeUndefined() }) + it('does not treat user prompt skill mentions as reference-only evidence', () => { + expect( + referenceOnly( + { + errors: [], + output: { finalAnswer: 'Done.' 
}, + session: { + messages: [ + { + role: 'user', + content: 'Use TanStack Router if needed.', + }, + { + role: 'assistant', + content: 'Done.', + }, + ], + }, + usage: {}, + }, + ['router'], + ), + ).toBe(false) + }) + it('prepares an isolated workspace for every task fixture', () => { const parentDir = mkdtempSync(join(tmpdir(), 'intent-eval-fixtures-')) diff --git a/evals/intent-discovery/harness/prepare-fixture.ts b/evals/intent-discovery/harness/prepare-fixture.ts index 59d23c4..dabca4f 100644 --- a/evals/intent-discovery/harness/prepare-fixture.ts +++ b/evals/intent-discovery/harness/prepare-fixture.ts @@ -21,7 +21,7 @@ export type PreparedFixtureWorkspace = { cleanup: () => void } -export function fixturePath(fixture: IntentDiscoveryFixture): string { +function fixturePath(fixture: IntentDiscoveryFixture): string { return join(fixturesDir, fixture) } diff --git a/knip.json b/knip.json index 1622a2c..fb1b4d4 100644 --- a/knip.json +++ b/knip.json @@ -2,7 +2,13 @@ "$schema": "https://unpkg.com/knip@5/schema.json", "workspaces": { ".": { - "entry": ["scripts/*.ts"] + "entry": [ + "scripts/*.ts", + "evals/intent-discovery/*.eval.ts", + "evals/intent-discovery/bin/*.mjs" + ], + "ignoreBinaries": ["copilot", "diff"], + "ignoreFiles": ["evals/intent-discovery/fixtures/**/src/**/*"] }, "packages/intent": { "entry": ["src/index.ts", "src/cli.ts", "src/core.ts", "src/setup.ts"],