TanStack · LadyBluenotes · Jun 20, 2026 · Jun 20, 2026 · Jun 20, 2026 · Jun 20, 2026
diff --git a/.gitignore b/.gitignore
@@ -63,3 +63,10 @@ vite.config.ts.timestamp-*
 .angular
 
 docs/superpowers
+
+evals/intent-discovery/.vitest-evals/
+evals/intent-discovery/runs/*
+!evals/intent-discovery/runs/.gitkeep
+!evals/intent-discovery/runs/latest/
+evals/intent-discovery/runs/latest/*
+!evals/intent-discovery/runs/latest/.gitkeep
diff --git a/eslint.config.js b/eslint.config.js
@@ -6,6 +6,10 @@ import unusedImports from 'eslint-plugin-unused-imports'
 /** @type {import('eslint').Linter.Config[]} */
 const config = [
   ...tanstackConfig,
+  {
+    name: 'intent/eval-fixture-ignores',
+    ignores: ['evals/intent-discovery/fixtures/**/*.tsx'],
+  },
   {
     name: 'tanstack/temp',
     plugins: {
@@ -73,6 +77,15 @@ const config = [
       ],
     },
   },
+  {
+    name: 'intent/evals',
+    files: ['evals/intent-discovery/**/*.ts'],
+    languageOptions: {
+      parserOptions: {
+        project: './evals/intent-discovery/tsconfig.json',
+      },
+    },
+  },
 ]
 
 export default config
diff --git a/evals/intent-discovery/README.md b/evals/intent-discovery/README.md
@@ -0,0 +1,44 @@
+# Intent discovery eval
+
+Opt-in eval suite for measuring whether Copilot discovers and invokes Intent surfaces without direct user instruction.
+
+## Commands
+
+- `pnpm eval:intent-discovery` runs the saved-transcript eval suite.
+- `pnpm eval:intent-discovery:json` writes `evals/intent-discovery/runs/latest/vitest-results.json`.
+- `pnpm eval:intent-discovery:live` runs the eval suite with the local Copilot CLI adapter enabled.
+- `pnpm eval:intent-discovery:live:json` writes a JSON report that includes live Copilot condition cases.
+- `pnpm eval:intent-discovery:judge` runs the optional OpenAI-backed output-quality judge over the latest JSON report and writes its annotations to `llm-judge.json` in the same directory. It is skipped when `OPENAI_API_KEY` is not set.
+- `pnpm eval:intent-discovery:report` serves the saved JSON report.
+- `pnpm eval:intent-discovery:summary` writes `summary.json` and `summary.md` from the latest JSON report.
+
+The default JSON/report commands show saved-transcript efficacy cases only. To include the live Copilot condition matrix in the report artifact, run:
+
+```sh
+pnpm eval:intent-discovery:live:json
+pnpm eval:intent-discovery:summary
+pnpm eval:intent-discovery:report
+```
+
+Set `INTENT_DISCOVERY_RUN_COUNT=3` with the live commands to run each live condition three times and include `pass@k` / `pass^k` in the generated summary.
+
+The optional LLM judge is secondary. It can annotate whether final answers appear to apply loaded guidance, but it never changes deterministic scores such as `StrictIntentInvocation`, `CorrectSkillLoaded`, or `AutonomousDiscoverySuccess`.
+
+## Current scope
+
+This executable slice grades synthetic saved transcripts with Vitest plus `vitest-evals` harness normalization helpers. It attaches `vitest-evals`-compatible metadata to the Vitest JSON artifact for the local report UI because this repo's current Vitest runtime does not expose the APIs used by `vitest-evals/reporter` and `describeEval()`.
+
+The controlled fixture corpus is limited to current skill-backed surfaces. For this slice, that means TanStack Router, TanStack Start, and TanStack Table v9.
+
+Live Router runs compare four setup conditions:
+
+- `no-intent`: no Intent guidance or allowlist is added.
+- `current-intent`: `package.json#intent.skills` plus the current install-style `AGENTS.md` skill-loading guidance.
+- `mapped-intent`: `package.json#intent.skills` plus `AGENTS.md` task-to-skill mappings like `install --map`.
+- `explicit-intent-control`: current install-style setup plus a prompt that explicitly asks the agent to run Intent. This condition is diagnostic and excluded from autonomous scoring.
+
+The live Copilot harness can run an opt-in command backend through `INTENT_DISCOVERY_COPILOT_COMMAND`. When that environment variable is unset, it returns a normalized `unsupported` run with no tool calls and an explicit `LiveCopilotRunnerUnavailableError`. The command runs inside a prepared fixture workspace with task metadata in `INTENT_DISCOVERY_TASK_ID`, `INTENT_DISCOVERY_FIXTURE`, `INTENT_DISCOVERY_PROMPT`, `INTENT_DISCOVERY_RUN_ID`, and `INTENT_DISCOVERY_WORKSPACE`.
+
+`pnpm eval:intent-discovery:live` sets `INTENT_DISCOVERY_RUN_LIVE=1` and `INTENT_DISCOVERY_COPILOT_COMMAND` to the repo-local Copilot CLI adapter. The adapter calls `copilot -p` in the prepared fixture workspace, writes a Copilot share transcript to `.intent-eval/<run-id>.md` inside that workspace, and prints the transcript to stdout for command capture. Live runs attach the same strict efficacy scores as saved transcripts, so a passing harness run can still report `AutonomousDiscoverySuccess: 0` when Copilot did not invoke Intent or loaded the wrong skill. Do not put API keys or tokens in the command or prompt; provide credentials through the normal Copilot CLI login or secret environment configuration.
+
+Harness integrity failures fail the eval. Product findings such as reference-only behavior, no discovery attempt, or wrong skill selection are recorded as diagnostic failures, not passing scores. The headline success signal is strict Intent invocation plus the expected skill loaded for autonomous cases.
diff --git a/evals/intent-discovery/bin/copilot-cli-adapter.mjs b/evals/intent-discovery/bin/copilot-cli-adapter.mjs
@@ -0,0 +1,87 @@
+#!/usr/bin/env node
+
+import { existsSync, mkdirSync, readFileSync } from 'node:fs'
+import { dirname, join } from 'node:path'
+import { spawnSync } from 'node:child_process'
+
+// Adapter entry point: runs the Copilot CLI once for a single eval task and
+// echoes its output plus the share transcript so the caller can capture both.
+//
+// All task metadata comes from environment variables; requiredEnv() logs an
+// error and exits with status 1 when one of them is unset or empty.
+const workspace = requiredEnv('INTENT_DISCOVERY_WORKSPACE')
+const taskId = requiredEnv('INTENT_DISCOVERY_TASK_ID')
+const fixture = requiredEnv('INTENT_DISCOVERY_FIXTURE')
+const prompt = requiredEnv('INTENT_DISCOVERY_PROMPT')
+const runId = requiredEnv('INTENT_DISCOVERY_RUN_ID')
+// Transcript file handed to Copilot through `--share`:
+// <workspace>/.intent-eval/<sanitized run id>.md
+const sharePath = join(
+  workspace,
+  '.intent-eval',
+  `${sanitizeFileName(runId)}.md`,
+)
+
+// Create the transcript directory up front; `recursive` makes this a no-op
+// when it already exists.
+mkdirSync(dirname(sharePath), { recursive: true })
+
+// Prompt layout: task/fixture header, blank line, the task prompt, blank line,
+// then a fixed closing instruction.
+const copilotPrompt = [
+  `Task id: ${taskId}`,
+  `Fixture: ${fixture}`,
+  '',
+  prompt,
+  '',
+  'Work in the current repository. Use the available project context and tools as you normally would. Do not summarize this prompt; complete the task and report what you changed.',
+].join('\n')
+
+// NOTE(review): the exact meaning of each flag is defined by the Copilot CLI,
+// not by this file — confirm against `copilot --help` before changing them.
+const args = [
+  '-p',
+  copilotPrompt,
+  '-C',
+  workspace,
+  '--allow-all-tools',
+  '--add-dir',
+  workspace,
+  '--no-ask-user',
+  '--no-color',
+  '--plain-diff',
+  '--share',
+  sharePath,
+]
+
+// Run synchronously inside the workspace. stdin is ignored; stdout and stderr
+// are captured as UTF-8 strings. NO_COLOR=1 is added on top of the inherited
+// environment.
+const result = spawnSync('copilot', args, {
+  cwd: workspace,
+  encoding: 'utf8',
+  env: {
+    ...process.env,
+    NO_COLOR: '1',
+  },
+  stdio: ['ignore', 'pipe', 'pipe'],
+})
+
+// `result.error` is populated when the child process could not be run at all
+// (for example, the `copilot` binary is missing); nothing else is available
+// to report in that case.
+if (result.error) {
+  console.error(result.error.message)
+  process.exit(1)
+}
+
+// Echo Copilot's own stdout first (skipped when it is only whitespace).
+if (result.stdout.trim()) {
+  console.log(result.stdout.trim())
+}
+
+// Then append the share transcript, if Copilot produced one, preceded by a
+// TRANSCRIPT_PATH marker line.
+if (existsSync(sharePath)) {
+  console.log(`\nTRANSCRIPT_PATH: ${sharePath}`)
+  console.log(readFileSync(sharePath, 'utf8'))
+}
+
+// Forward Copilot's stderr to this process's stderr.
+if (result.stderr.trim()) {
+  console.error(result.stderr.trim())
+}
+
+// Mirror the child's exit code; fall back to 1 when no exit code is available.
+process.exit(result.status ?? 1)
+
+/**
+ * Reads an environment variable that the adapter cannot run without.
+ *
+ * When the variable is unset or empty, an error naming it is written to
+ * stderr and the process exits with status 1.
+ *
+ * @param {string} name - Name of the environment variable to read.
+ * @returns {string} The variable's value.
+ */
+function requiredEnv(name) {
+  const envValue = process.env[name]
+  const isMissing = !envValue
+
+  if (isMissing) {
+    console.error(`Missing required environment variable: ${name}`)
+    process.exit(1)
+  }
+
+  return envValue
+}
+
+/**
+ * Normalizes a string for use as part of a file name.
+ *
+ * Every run of characters other than ASCII letters, digits, `.` and `-` is
+ * collapsed into a single `-`.
+ *
+ * @param {string} value - Raw text, e.g. a run id.
+ * @returns {string} The text with each disallowed run replaced by `-`.
+ */
+function sanitizeFileName(value) {
+  const disallowedRun = /[^a-z0-9.-]+/gi
+
+  return value.replace(disallowedRun, '-')
+}
diff --git a/evals/intent-discovery/bin/llm-judge.mjs b/evals/intent-discovery/bin/llm-judge.mjs
@@ -0,0 +1,149 @@
+#!/usr/bin/env node
+
+import { mkdirSync, readFileSync, writeFileSync } from 'node:fs'
+import { dirname, join } from 'node:path'
+
+// Entry point: reads a Vitest JSON report, asks the judge model about every
+// eval case in it, and writes the collected judgments to `llm-judge.json`
+// next to the report (also echoing them to stdout).
+
+// Report to judge; the first CLI argument overrides the default location.
+const reportPath =
+  process.argv[2] ?? 'evals/intent-discovery/runs/latest/vitest-results.json'
+const apiKey = process.env.OPENAI_API_KEY
+const model = process.env.INTENT_DISCOVERY_LLM_JUDGE_MODEL ?? 'gpt-4o-mini'
+
+// setTimeout() coerces NaN, delays below 1 and delays above 2147483647 to a
+// 1 ms delay, which would abort every judge request almost immediately. Use
+// the default whenever the configured value falls outside the usable range;
+// this also covers the variable being set to an empty string (Number('') is 0).
+const defaultRequestTimeoutMs = 30000
+const maxTimerDelayMs = 2147483647
+const configuredTimeoutMs = Number(
+  process.env.INTENT_DISCOVERY_LLM_JUDGE_TIMEOUT_MS ?? defaultRequestTimeoutMs,
+)
+const hasUsableTimeout =
+  Number.isFinite(configuredTimeoutMs) &&
+  configuredTimeoutMs >= 1 &&
+  configuredTimeoutMs <= maxTimerDelayMs
+const requestTimeoutMs = hasUsableTimeout
+  ? configuredTimeoutMs
+  : defaultRequestTimeoutMs
+
+// The judge is optional: without an API key this is a successful no-op.
+if (!apiKey) {
+  console.log('Skipped LLM judge: OPENAI_API_KEY is not set.')
+  process.exit(0)
+}
+
+// Warn on stderr so the JSON echoed on stdout stays clean.
+if (!hasUsableTimeout) {
+  console.warn(
+    `Ignoring invalid INTENT_DISCOVERY_LLM_JUDGE_TIMEOUT_MS; using ${defaultRequestTimeoutMs} ms.`,
+  )
+}
+
+const report = JSON.parse(readFileSync(reportPath, 'utf8'))
+const cases = reportCases(report)
+const judgments = []
+
+// Cases are judged one at a time, in report order.
+for (const item of cases) {
+  judgments.push(await judgeCase({ apiKey, item, model }))
+}
+
+const output = {
+  generatedAt: new Date().toISOString(),
+  judgments,
+  model,
+}
+const outDir = dirname(reportPath)
+mkdirSync(outDir, { recursive: true })
+writeFileSync(
+  join(outDir, 'llm-judge.json'),
+  `${JSON.stringify(output, null, 2)}\n`,
+)
+console.log(JSON.stringify(output, null, 2))
+
+/**
+ * Collects the eval cases to judge from a parsed Vitest JSON report.
+ *
+ * Walks every assertion result of every suite and keeps only those carrying
+ * `meta.eval`. Missing `testResults`, `assertionResults`, scores or harness
+ * artifacts are treated as empty.
+ *
+ * @param {object} report - Parsed Vitest JSON report.
+ * @returns {Array<object>} One entry per eval case with `artifacts` (a fixed
+ *   subset of the harness run artifacts), `finalAnswer` (`''` when absent),
+ *   `scores` (score name mapped to its value, `0` when the value is missing)
+ *   and `title`.
+ */
+function reportCases(report) {
+  const artifactKeys = [
+    'condition',
+    'expectedSkillAreas',
+    'intentCommandsInvoked',
+    'loadedSkills',
+    'runnerStatus',
+    'taskId',
+  ]
+  const cases = []
+
+  for (const suite of report.testResults ?? []) {
+    for (const test of suite.assertionResults ?? []) {
+      // Plain tests without eval metadata are not judged.
+      if (!test.meta?.eval) {
+        continue
+      }
+
+      const { eval: evalMeta, harness } = test.meta
+      const artifacts = harness?.run?.artifacts ?? {}
+      const scoreEntries = (evalMeta.scores ?? []).map((entry) => [
+        entry.name,
+        entry.score ?? 0,
+      ])
+
+      cases.push({
+        artifacts: pick(artifacts, artifactKeys),
+        finalAnswer: evalMeta.output?.finalAnswer ?? '',
+        scores: Object.fromEntries(scoreEntries),
+        title: test.title,
+      })
+    }
+  }
+
+  return cases
+}
+
+/**
+ * Asks the judge model whether one eval case's final answer appears to apply
+ * the loaded skill guidance.
+ *
+ * Never throws for request-level problems: transport errors, timeouts,
+ * non-2xx responses, unreadable response bodies and non-JSON model output are
+ * all returned as an entry with an `error` field, so one bad case cannot
+ * abort the whole run. Every returned entry carries `deterministicScores`
+ * and `title`.
+ *
+ * @param {object} options
+ * @param {string} options.apiKey - OpenAI API key sent as a bearer token.
+ * @param {object} options.item - Case to judge; `item.scores` and `item.title`
+ *   are copied into the result and the whole item is sent to the model.
+ * @param {string} options.model - Chat completion model name.
+ * @returns {Promise<object>} `{ deterministicScores, judgment, title }` on
+ *   success, otherwise `{ deterministicScores, error, title }` plus `status`
+ *   for HTTP failures or `rawContent` for unparseable model output.
+ */
+async function judgeCase({ apiKey, item, model }) {
+  const controller = new AbortController()
+  const timeout = setTimeout(() => controller.abort(), requestTimeoutMs)
+  let content
+
+  try {
+    const response = await fetch('https://api.openai.com/v1/chat/completions', {
+      body: JSON.stringify({
+        messages: [
+          {
+            role: 'system',
+            content:
+              'You judge whether a coding agent output appears to apply loaded library skill guidance. You must not decide whether Intent was invoked; that is provided by deterministic scores. Return strict JSON only.',
+          },
+          {
+            role: 'user',
+            content: JSON.stringify({
+              instruction:
+                'Assess final output quality only. Return {"appliedGuidance":"yes"|"no"|"unknown","rationale":"..."}. Use unknown when evidence is insufficient.',
+              item,
+            }),
+          },
+        ],
+        model,
+        response_format: { type: 'json_object' },
+        temperature: 0,
+      }),
+      headers: {
+        authorization: `Bearer ${apiKey}`,
+        'content-type': 'application/json',
+      },
+      method: 'POST',
+      signal: controller.signal,
+    })
+
+    // The body is read inside the try block so that the abort timeout still
+    // covers it and a truncated or non-JSON body becomes an error entry
+    // instead of an unhandled rejection.
+    if (!response.ok) {
+      return {
+        deterministicScores: item.scores,
+        error: await response.text(),
+        status: response.status,
+        title: item.title,
+      }
+    }
+
+    const body = await response.json()
+    content = body?.choices?.[0]?.message?.content ?? '{}'
+  } catch (error) {
+    return {
+      deterministicScores: item.scores,
+      error: `LLM judge request failed: ${String(error)}`,
+      title: item.title,
+    }
+  } finally {
+    clearTimeout(timeout)
+  }
+
+  let judgment
+  try {
+    judgment = JSON.parse(content)
+  } catch (error) {
+    return {
+      deterministicScores: item.scores,
+      error: `Invalid JSON from model: ${String(error)}`,
+      rawContent: content,
+      title: item.title,
+    }
+  }
+
+  return {
+    deterministicScores: item.scores,
+    judgment,
+    title: item.title,
+  }
+}
+
+/**
+ * Builds a new object holding only the listed keys that are own properties
+ * of `value`.
+ *
+ * Inherited properties and keys absent from `value` are skipped; the result
+ * follows the order of `keys`.
+ *
+ * @param {object} value - Object to copy properties from.
+ * @param {string[]} keys - Property names to keep.
+ * @returns {object} A new object with the selected own properties.
+ */
+function pick(value, keys) {
+  const selectedEntries = keys.flatMap((key) =>
+    Object.hasOwn(value, key) ? [[key, value[key]]] : [],
+  )
+
+  return Object.fromEntries(selectedEntries)
+}