From 52bddba1957503c4a841fc2cb76a24efc2d023ba Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 6 Jun 2026 21:15:12 +0300 Subject: [PATCH 1/2] feat(belief-state): project runtime benchmark corpus evidence --- src/belief-state/index.ts | 1 + src/belief-state/runtime-benchmark-corpus.ts | 193 ++++++++++++++++++ .../runtime-benchmark-corpus.test.ts | 189 +++++++++++++++++ 3 files changed, 383 insertions(+) create mode 100644 src/belief-state/runtime-benchmark-corpus.ts create mode 100644 tests/belief-state/runtime-benchmark-corpus.test.ts diff --git a/src/belief-state/index.ts b/src/belief-state/index.ts index e8686ad..7ed7972 100644 --- a/src/belief-state/index.ts +++ b/src/belief-state/index.ts @@ -6,6 +6,7 @@ export * from './ope' export * from './phase0-measurement' export * from './report' export * from './research-evidence' +export * from './runtime-benchmark-corpus' export * from './runtime-hooks' export * from './selective' export * from './shadow-probe' diff --git a/src/belief-state/runtime-benchmark-corpus.ts b/src/belief-state/runtime-benchmark-corpus.ts new file mode 100644 index 0000000..576774f --- /dev/null +++ b/src/belief-state/runtime-benchmark-corpus.ts @@ -0,0 +1,193 @@ +import type { RunSplitTag } from '../run-record' +import { + type BuildRuntimeBeliefPhase0MeasurementOptions, + buildRuntimeBeliefPhase0Measurement, + type RuntimeBeliefDecisionLabel, + type RuntimeBeliefPhase0Measurement, + type RuntimeBeliefPhase0RunRecord, +} from './phase0-measurement' +import type { RuntimeBeliefDecisionPoint, RuntimeBeliefHookEvent } from './runtime-hooks' + +export interface RuntimeBenchmarkBeliefAttemptRecord { + round: number + prompt?: string + output?: string + valid?: boolean + score?: number + costUsd?: number + tokensIn?: number + tokensOut?: number + wallMs?: number + eventCount?: number + eventTypes?: Record + traceTail?: string + error?: string +} + +export interface RuntimeBenchmarkBeliefRecord { + benchmark: string + instanceId: string + condition: string + model?: string + blindResolved?: boolean + resolved?: boolean + attempts?: RuntimeBenchmarkBeliefAttemptRecord[] + infraError?: boolean + seed?: number + splitTag?: RunSplitTag + commitSha?: string + runtimeEvents?: unknown[] +} + +export interface BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions + extends Omit< + BuildRuntimeBeliefPhase0MeasurementOptions, + 'runs' | 'events' | 'decisions' | 'labels' + > { + records: RuntimeBenchmarkBeliefRecord[] + decisions?: RuntimeBeliefDecisionPoint[] + labels?: RuntimeBeliefDecisionLabel[] + defaultSplitTag?: RunSplitTag +} + +export interface RuntimeBenchmarkBeliefPhase0Summary { + recordCount: number + recordWithRuntimeEventsCount: number + runtimeRunCount: number + lifecycleEventCount: number + decisionCount: number + labelCount: number + defaultedSplitCount: number +} + +export interface RuntimeBenchmarkBeliefPhase0Measurement { + runs: RuntimeBeliefPhase0RunRecord[] + events: RuntimeBeliefHookEvent[] + decisions: RuntimeBeliefDecisionPoint[] + labels: RuntimeBeliefDecisionLabel[] + measurement: RuntimeBeliefPhase0Measurement + summary: RuntimeBenchmarkBeliefPhase0Summary + diagnostics: string[] +} + +const DEFAULT_SPLIT_TAG: RunSplitTag = 'search' + +export function buildRuntimeBenchmarkBeliefPhase0Measurement( + options: BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions, +): RuntimeBenchmarkBeliefPhase0Measurement { + const diagnostics: string[] = [] + const runsById = new Map() + const events: RuntimeBeliefHookEvent[] = [] + let recordWithRuntimeEventsCount = 0 + let defaultedSplitCount = 0 + + for (const record of options.records) { + const key = benchmarkRecordKey(record) + const splitTag = record.splitTag ?? options.defaultSplitTag ?? DEFAULT_SPLIT_TAG + if (record.splitTag === undefined) defaultedSplitCount += 1 + + const rawEvents = record.runtimeEvents ?? [] + if (rawEvents.length === 0) { + diagnostics.push(`${key}: no runtimeEvents; no runtime run join can be extracted`) + continue + } + recordWithRuntimeEventsCount += 1 + + for (let index = 0; index < rawEvents.length; index += 1) { + const event = parseRuntimeHookEvent(rawEvents[index]) + if (!event) { + diagnostics.push(`${key}: runtimeEvents[${index}] is not a RuntimeHookEvent`) + continue + } + events.push(event) + + const scenarioId = event.scenarioId ?? record.instanceId + const prior = runsById.get(event.runId) + if (!prior) { + runsById.set(event.runId, { runId: event.runId, scenarioId, splitTag }) + continue + } + if (prior.scenarioId !== scenarioId || prior.splitTag !== splitTag) { + diagnostics.push(`${key}: runId ${event.runId} has conflicting scenario/split metadata`) + } + } + } + + const decisions = options.decisions ?? [] + const labels = options.labels ?? [] + if (decisions.length === 0) { + diagnostics.push( + 'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows', + ) + } + if (labels.length === 0 && decisions.length > 0) { + diagnostics.push( + 'no decision labels supplied; observed action/outcome joins will be incomplete', + ) + } + + const runs = [...runsById.values()] + const measurement = buildRuntimeBeliefPhase0Measurement({ + ...options, + runs, + events, + decisions, + labels, + }) + + return { + runs, + events, + decisions, + labels, + measurement, + summary: { + recordCount: options.records.length, + recordWithRuntimeEventsCount, + runtimeRunCount: runs.length, + lifecycleEventCount: events.length, + decisionCount: decisions.length, + labelCount: labels.length, + defaultedSplitCount, + }, + diagnostics: [...diagnostics, ...measurement.diagnostics], + } +} + +function benchmarkRecordKey(record: RuntimeBenchmarkBeliefRecord): string { + return `${record.benchmark}:${record.instanceId}:${record.condition}` +} + +function parseRuntimeHookEvent(input: unknown): RuntimeBeliefHookEvent | null { + if (!isRecord(input)) return null + if (typeof input.id !== 'string' || input.id.length === 0) return null + if (typeof input.runId !== 'string' || input.runId.length === 0) return null + if (typeof input.target !== 'string' || input.target.length === 0) return null + if (typeof input.phase !== 'string' || input.phase.length === 0) return null + if (typeof input.timestamp !== 'number' || !Number.isFinite(input.timestamp)) return null + + return { + id: input.id, + runId: input.runId, + scenarioId: stringOrUndefined(input.scenarioId), + target: input.target, + phase: input.phase, + timestamp: input.timestamp, + stepIndex: finiteNumberOrUndefined(input.stepIndex), + parentId: stringOrUndefined(input.parentId), + payload: input.payload, + metadata: isRecord(input.metadata) ? { ...input.metadata } : undefined, + } +} + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value) +} + +function stringOrUndefined(value: unknown): string | undefined { + return typeof value === 'string' && value.length > 0 ? value : undefined +} + +function finiteNumberOrUndefined(value: unknown): number | undefined { + return typeof value === 'number' && Number.isFinite(value) ? value : undefined +} diff --git a/tests/belief-state/runtime-benchmark-corpus.test.ts b/tests/belief-state/runtime-benchmark-corpus.test.ts new file mode 100644 index 0000000..a8aeeeb --- /dev/null +++ b/tests/belief-state/runtime-benchmark-corpus.test.ts @@ -0,0 +1,189 @@ +import { describe, expect, it } from 'vitest' +import { buildRuntimeBenchmarkBeliefPhase0Measurement } from '../../src/belief-state/runtime-benchmark-corpus' +import type { RuntimeBeliefDecisionPoint } from '../../src/belief-state/runtime-hooks' + +describe('runtime benchmark corpus belief-state projection', () => { + it('extracts lifecycle run evidence while keeping policy claims blocked without decisions', () => { + const report = buildRuntimeBenchmarkBeliefPhase0Measurement({ + records: [ + { + benchmark: 'commit0', + instanceId: 'task-1', + condition: 'random@2', + model: 'gpt-5', + runtimeEvents: runtimeEvents('commit0:task-1:0', 'task-1', 0), + }, + { + benchmark: 'commit0', + instanceId: 'task-1', + condition: 'random@2', + model: 'gpt-5', + runtimeEvents: runtimeEvents('commit0:task-1:1', 'task-1', 1), + }, + ], + targetId: 'failure-recovery', + minN: 2, + claimScope: 'selective', + }) + + expect(report.runs).toEqual([ + { runId: 'commit0:task-1:0', scenarioId: 'task-1', splitTag: 'search' }, + { runId: 'commit0:task-1:1', scenarioId: 'task-1', splitTag: 'search' }, + ]) + expect(report.events).toHaveLength(6) + expect(report.summary).toMatchObject({ + recordCount: 2, + recordWithRuntimeEventsCount: 2, + runtimeRunCount: 2, + lifecycleEventCount: 6, + decisionCount: 0, + labelCount: 0, + defaultedSplitCount: 2, + }) + expect(report.measurement.points).toEqual([]) + expect(report.measurement.summary.packetStatus).toBe('blocked') + expect(report.diagnostics).toContain( + 'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows', + ) + }) + + it('feeds explicit runtime decisions and labels into the existing Phase 0 measurement', () => { + const decisions = Array.from({ length: 12 }, (_, index) => decision(index)) + const labels = decisions.map((item, index) => ({ + decisionId: item.id, + chosenAction: index % 2 === 0 ? 'verify' : 'stop', + confidence: index % 2 === 0 ? 0.8 : 0.4, + outcome: { success: index % 2 === 0, score: index % 2 === 0 ? 1 : 0 }, + })) + + const report = buildRuntimeBenchmarkBeliefPhase0Measurement({ + records: [ + { + benchmark: 'commit0', + instanceId: 'task-1', + condition: 'random@2', + model: 'gpt-5', + splitTag: 'holdout', + runtimeEvents: [...runtimeEvents('commit0:task-1:0', 'task-1', 0)], + }, + ], + decisions, + labels, + targetId: 'failure-recovery', + minN: 12, + minAccepted: 6, + confidenceThreshold: 0.6, + claimScope: 'selective', + }) + + expect(report.diagnostics).toEqual([]) + expect(report.measurement.points).toHaveLength(12) + expect(report.measurement.summary).toMatchObject({ + completedPointCount: 12, + lifecycleEventCount: 3, + packetStatus: 'supported', + claimScope: 'selective', + }) + expect(report.measurement.points[0]).toMatchObject({ + id: 'commit0:task-1:0:agent.turn:0:failure-recovery', + runId: 'commit0:task-1:0', + scenarioId: 'task-1', + chosenAction: 'verify', + metadata: { splitTag: 'holdout', lifecycleEventCount: 3 }, + }) + expect(report.measurement.points[0]?.evidence.map((ref) => ref.id)).toEqual( + expect.arrayContaining([ + 'commit0:task-1:0:agent.run:before', + 'commit0:task-1:0:agent.turn:after:0', + ]), + ) + expect(report.measurement.packet.analysis.evaluation?.selectiveStatus).toBe('ship') + }) + + it('diagnoses missing or malformed runtime event arrays without inventing runs', () => { + const report = buildRuntimeBenchmarkBeliefPhase0Measurement({ + records: [ + { + benchmark: 'swe-bench', + instanceId: 'case-empty', + condition: 'blind@1', + runtimeEvents: [], + }, + { + benchmark: 'swe-bench', + instanceId: 'case-bad', + condition: 'blind@1', + runtimeEvents: [{ id: 'bad' }], + }, + ], + targetId: 'failure-recovery', + minN: 1, + claimScope: 'selective', + }) + + expect(report.runs).toEqual([]) + expect(report.events).toEqual([]) + expect(report.diagnostics).toEqual([ + 'swe-bench:case-empty:blind@1: no runtimeEvents; no runtime run join can be extracted', + 'swe-bench:case-bad:blind@1: runtimeEvents[0] is not a RuntimeHookEvent', + 'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows', + ]) + }) +}) + +function runtimeEvents(runId: string, scenarioId: string, stepIndex: number) { + return [ + { + id: `${runId}:agent.run:before`, + runId, + scenarioId, + target: 'agent.run', + phase: 'before', + timestamp: 1_788_624_000_000, + metadata: { producer: 'openSandboxRun' }, + }, + { + id: `${runId}:agent.turn:before:${stepIndex}`, + runId, + scenarioId, + target: 'agent.turn', + phase: 'before', + timestamp: 1_788_624_000_001, + stepIndex, + payload: { promptHash: 'abc123' }, + metadata: { producer: 'openSandboxRun' }, + }, + { + id: `${runId}:agent.turn:after:${stepIndex}`, + runId, + scenarioId, + target: 'agent.turn', + phase: 'after', + timestamp: 1_788_624_000_002, + stepIndex, + payload: { eventCount: 1, eventTypes: { result: 1 } }, + metadata: { producer: 'openSandboxRun' }, + }, + ] +} + +function decision(index: number): RuntimeBeliefDecisionPoint { + return { + id: `commit0:task-1:0:agent.turn:${index}:failure-recovery`, + runId: 'commit0:task-1:0', + scenarioId: 'task-1', + stepIndex: index, + kind: 'retry', + candidateActions: ['retry', 'verify', 'continue', 'stop'], + context: `failed patch attempt ${index}`, + evidence: [ + { + source: 'event', + id: `attempt-${index}`, + detail: `attempt ${index}`, + quality: 'direct', + }, + ], + metadata: { target: 'failure-recovery' }, + } +} From d88dd940f49ba148dd8f2f88e95595a52ae85f65 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 6 Jun 2026 21:46:10 +0300 Subject: [PATCH 2/2] refactor(runtime-trajectory): split generic runtime evidence projection --- src/belief-state/runtime-benchmark-corpus.ts | 153 ++++------------ src/index.ts | 12 ++ src/runtime-trajectory.ts | 166 ++++++++++++++++++ .../runtime-benchmark-corpus.test.ts | 6 +- tests/runtime-trajectory.test.ts | 127 ++++++++++++++ 5 files changed, 344 insertions(+), 120 deletions(-) create mode 100644 src/runtime-trajectory.ts create mode 100644 tests/runtime-trajectory.test.ts diff --git a/src/belief-state/runtime-benchmark-corpus.ts b/src/belief-state/runtime-benchmark-corpus.ts index 576774f..788603f 100644 --- a/src/belief-state/runtime-benchmark-corpus.ts +++ b/src/belief-state/runtime-benchmark-corpus.ts @@ -1,4 +1,9 @@ -import type { RunSplitTag } from '../run-record' +import { + type ProjectRuntimeTrajectoryEvidenceOptions, + projectRuntimeTrajectoryEvidence, + type RuntimeTrajectoryEvidenceProjection, + type RuntimeTrajectoryRecord, +} from '../runtime-trajectory' import { type BuildRuntimeBeliefPhase0MeasurementOptions, buildRuntimeBeliefPhase0Measurement, @@ -8,35 +13,10 @@ import { } from './phase0-measurement' import type { RuntimeBeliefDecisionPoint, RuntimeBeliefHookEvent } from './runtime-hooks' -export interface RuntimeBenchmarkBeliefAttemptRecord { - round: number - prompt?: string - output?: string - valid?: boolean - score?: number - costUsd?: number - tokensIn?: number - tokensOut?: number - wallMs?: number - eventCount?: number - eventTypes?: Record - traceTail?: string - error?: string -} - -export interface RuntimeBenchmarkBeliefRecord { - benchmark: string - instanceId: string - condition: string - model?: string - blindResolved?: boolean - resolved?: boolean - attempts?: RuntimeBenchmarkBeliefAttemptRecord[] - infraError?: boolean - seed?: number - splitTag?: RunSplitTag - commitSha?: string - runtimeEvents?: unknown[] +type RuntimeBenchmarkTrajectoryRecord = RuntimeTrajectoryRecord & { + benchmark?: unknown + condition?: unknown + instanceId?: unknown } export interface BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions @@ -44,20 +24,15 @@ export interface BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions BuildRuntimeBeliefPhase0MeasurementOptions, 'runs' | 'events' | 'decisions' | 'labels' > { - records: RuntimeBenchmarkBeliefRecord[] + records: RuntimeBenchmarkTrajectoryRecord[] decisions?: RuntimeBeliefDecisionPoint[] + defaultSplitTag?: ProjectRuntimeTrajectoryEvidenceOptions['defaultSplitTag'] labels?: RuntimeBeliefDecisionLabel[] - defaultSplitTag?: RunSplitTag } export interface RuntimeBenchmarkBeliefPhase0Summary { - recordCount: number - recordWithRuntimeEventsCount: number - runtimeRunCount: number - lifecycleEventCount: number decisionCount: number labelCount: number - defaultedSplitCount: number } export interface RuntimeBenchmarkBeliefPhase0Measurement { @@ -65,54 +40,22 @@ export interface RuntimeBenchmarkBeliefPhase0Measurement { events: RuntimeBeliefHookEvent[] decisions: RuntimeBeliefDecisionPoint[] labels: RuntimeBeliefDecisionLabel[] + trajectory: RuntimeTrajectoryEvidenceProjection measurement: RuntimeBeliefPhase0Measurement summary: RuntimeBenchmarkBeliefPhase0Summary diagnostics: string[] } -const DEFAULT_SPLIT_TAG: RunSplitTag = 'search' - export function buildRuntimeBenchmarkBeliefPhase0Measurement( options: BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions, ): RuntimeBenchmarkBeliefPhase0Measurement { const diagnostics: string[] = [] - const runsById = new Map() - const events: RuntimeBeliefHookEvent[] = [] - let recordWithRuntimeEventsCount = 0 - let defaultedSplitCount = 0 - - for (const record of options.records) { - const key = benchmarkRecordKey(record) - const splitTag = record.splitTag ?? options.defaultSplitTag ?? DEFAULT_SPLIT_TAG - if (record.splitTag === undefined) defaultedSplitCount += 1 - - const rawEvents = record.runtimeEvents ?? [] - if (rawEvents.length === 0) { - diagnostics.push(`${key}: no runtimeEvents; no runtime run join can be extracted`) - continue - } - recordWithRuntimeEventsCount += 1 - - for (let index = 0; index < rawEvents.length; index += 1) { - const event = parseRuntimeHookEvent(rawEvents[index]) - if (!event) { - diagnostics.push(`${key}: runtimeEvents[${index}] is not a RuntimeHookEvent`) - continue - } - events.push(event) - - const scenarioId = event.scenarioId ?? record.instanceId - const prior = runsById.get(event.runId) - if (!prior) { - runsById.set(event.runId, { runId: event.runId, scenarioId, splitTag }) - continue - } - if (prior.scenarioId !== scenarioId || prior.splitTag !== splitTag) { - diagnostics.push(`${key}: runId ${event.runId} has conflicting scenario/split metadata`) - } - } - } - + const trajectory = projectRuntimeTrajectoryEvidence({ + records: options.records, + defaultSplitTag: options.defaultSplitTag, + recordIdOf: runtimeBenchmarkRecordId, + scenarioIdOf: runtimeBenchmarkScenarioId, + }) const decisions = options.decisions ?? [] const labels = options.labels ?? [] if (decisions.length === 0) { @@ -126,68 +69,42 @@ export function buildRuntimeBenchmarkBeliefPhase0Measurement( ) } - const runs = [...runsById.values()] const measurement = buildRuntimeBeliefPhase0Measurement({ ...options, - runs, - events, + runs: trajectory.runs, + events: trajectory.events, decisions, labels, }) return { - runs, - events, + runs: trajectory.runs, + events: trajectory.events, decisions, labels, + trajectory, measurement, summary: { - recordCount: options.records.length, - recordWithRuntimeEventsCount, - runtimeRunCount: runs.length, - lifecycleEventCount: events.length, decisionCount: decisions.length, labelCount: labels.length, - defaultedSplitCount, }, - diagnostics: [...diagnostics, ...measurement.diagnostics], + diagnostics: [...trajectory.diagnostics, ...diagnostics, ...measurement.diagnostics], } } -function benchmarkRecordKey(record: RuntimeBenchmarkBeliefRecord): string { - return `${record.benchmark}:${record.instanceId}:${record.condition}` +function runtimeBenchmarkRecordId(record: RuntimeBenchmarkTrajectoryRecord): string | undefined { + const parts = [ + nonEmptyString(record.benchmark), + nonEmptyString(record.instanceId), + nonEmptyString(record.condition), + ].filter((part): part is string => part !== undefined) + return parts.length > 0 ? parts.join(':') : undefined } -function parseRuntimeHookEvent(input: unknown): RuntimeBeliefHookEvent | null { - if (!isRecord(input)) return null - if (typeof input.id !== 'string' || input.id.length === 0) return null - if (typeof input.runId !== 'string' || input.runId.length === 0) return null - if (typeof input.target !== 'string' || input.target.length === 0) return null - if (typeof input.phase !== 'string' || input.phase.length === 0) return null - if (typeof input.timestamp !== 'number' || !Number.isFinite(input.timestamp)) return null - - return { - id: input.id, - runId: input.runId, - scenarioId: stringOrUndefined(input.scenarioId), - target: input.target, - phase: input.phase, - timestamp: input.timestamp, - stepIndex: finiteNumberOrUndefined(input.stepIndex), - parentId: stringOrUndefined(input.parentId), - payload: input.payload, - metadata: isRecord(input.metadata) ? { ...input.metadata } : undefined, - } +function runtimeBenchmarkScenarioId(record: RuntimeBenchmarkTrajectoryRecord): string | undefined { + return nonEmptyString(record.instanceId) } -function isRecord(value: unknown): value is Record { - return typeof value === 'object' && value !== null && !Array.isArray(value) -} - -function stringOrUndefined(value: unknown): string | undefined { +function nonEmptyString(value: unknown): string | undefined { return typeof value === 'string' && value.length > 0 ? value : undefined } - -function finiteNumberOrUndefined(value: unknown): number | undefined { - return typeof value === 'number' && Number.isFinite(value) ? value : undefined -} diff --git a/src/index.ts b/src/index.ts index 4497623..1f7f9ba 100644 --- a/src/index.ts +++ b/src/index.ts @@ -554,6 +554,18 @@ export { DEFAULT_RULES as DEFAULT_FAILURE_RULES, FAILURE_CLASSES, } from './failure-taxonomy' +export type { + ProjectRuntimeTrajectoryEvidenceOptions, + RuntimeTrajectoryEvidenceProjection, + RuntimeTrajectoryEvidenceSummary, + RuntimeTrajectoryHookEvent, + RuntimeTrajectoryRecord, + RuntimeTrajectoryRunRecord, +} from './runtime-trajectory' +export { + parseRuntimeTrajectoryHookEvent, + projectRuntimeTrajectoryEvidence, +} from './runtime-trajectory' export type { HarnessConfig, SandboxDriver, diff --git a/src/runtime-trajectory.ts b/src/runtime-trajectory.ts new file mode 100644 index 0000000..f4435f3 --- /dev/null +++ b/src/runtime-trajectory.ts @@ -0,0 +1,166 @@ +import type { RunSplitTag } from './run-record' + +export interface RuntimeTrajectoryHookEvent { + id: string + runId: string + scenarioId?: string + target: string + phase: string + timestamp: number + stepIndex?: number + parentId?: string + payload?: unknown + metadata?: Record +} + +export interface RuntimeTrajectoryRecord { + id?: string + scenarioId?: string + splitTag?: RunSplitTag + runtimeEvents?: unknown + [key: string]: unknown +} + +export interface RuntimeTrajectoryRunRecord { + runId: string + scenarioId?: string + splitTag: RunSplitTag +} + +export interface RuntimeTrajectoryEvidenceSummary { + recordCount: number + recordWithRuntimeEventsCount: number + runtimeRunCount: number + lifecycleEventCount: number + defaultedSplitCount: number +} + +export interface RuntimeTrajectoryEvidenceProjection { + runs: RuntimeTrajectoryRunRecord[] + events: RuntimeTrajectoryHookEvent[] + summary: RuntimeTrajectoryEvidenceSummary + diagnostics: string[] +} + +export interface ProjectRuntimeTrajectoryEvidenceOptions< + TRecord extends RuntimeTrajectoryRecord = RuntimeTrajectoryRecord, +> { + records: TRecord[] + defaultSplitTag?: RunSplitTag + recordIdOf?: (record: TRecord, index: number) => string | undefined + scenarioIdOf?: (record: TRecord, index: number) => string | undefined +} + +const DEFAULT_SPLIT_TAG: RunSplitTag = 'search' + +export function projectRuntimeTrajectoryEvidence( + options: ProjectRuntimeTrajectoryEvidenceOptions, +): RuntimeTrajectoryEvidenceProjection { + const diagnostics: string[] = [] + const runsById = new Map() + const events: RuntimeTrajectoryHookEvent[] = [] + let recordWithRuntimeEventsCount = 0 + let defaultedSplitCount = 0 + + for (let recordIndex = 0; recordIndex < options.records.length; recordIndex += 1) { + const record = options.records[recordIndex]! + const key = runtimeTrajectoryRecordKey(record, recordIndex, options.recordIdOf) + const splitTag = record.splitTag ?? options.defaultSplitTag ?? DEFAULT_SPLIT_TAG + if (record.splitTag === undefined) defaultedSplitCount += 1 + + const rawEvents = record.runtimeEvents + if (!Array.isArray(rawEvents)) { + diagnostics.push( + `${key}: runtimeEvents is not an array; no runtime run join can be extracted`, + ) + continue + } + if (rawEvents.length === 0) { + diagnostics.push(`${key}: no runtimeEvents; no runtime run join can be extracted`) + continue + } + recordWithRuntimeEventsCount += 1 + + for (let index = 0; index < rawEvents.length; index += 1) { + const event = parseRuntimeTrajectoryHookEvent(rawEvents[index]) + if (!event) { + diagnostics.push(`${key}: runtimeEvents[${index}] is not a RuntimeHookEvent`) + continue + } + events.push(event) + + const scenarioId = + event.scenarioId ?? + stringOrUndefined(options.scenarioIdOf?.(record, recordIndex)) ?? + stringOrUndefined(record.scenarioId) + const prior = runsById.get(event.runId) + if (!prior) { + runsById.set(event.runId, { runId: event.runId, scenarioId, splitTag }) + continue + } + if (prior.scenarioId !== scenarioId || prior.splitTag !== splitTag) { + diagnostics.push(`${key}: runId ${event.runId} has conflicting scenario/split metadata`) + } + } + } + + const runs = [...runsById.values()] + return { + runs, + events, + summary: { + recordCount: options.records.length, + recordWithRuntimeEventsCount, + runtimeRunCount: runs.length, + lifecycleEventCount: events.length, + defaultedSplitCount, + }, + diagnostics, + } +} + +export function parseRuntimeTrajectoryHookEvent(input: unknown): RuntimeTrajectoryHookEvent | null { + if (!isRecord(input)) return null + if (typeof input.id !== 'string' || input.id.length === 0) return null + if (typeof input.runId !== 'string' || input.runId.length === 0) return null + if (typeof input.target !== 'string' || input.target.length === 0) return null + if (typeof input.phase !== 'string' || input.phase.length === 0) return null + if (typeof input.timestamp !== 'number' || !Number.isFinite(input.timestamp)) return null + + return { + id: input.id, + runId: input.runId, + scenarioId: stringOrUndefined(input.scenarioId), + target: input.target, + phase: input.phase, + timestamp: input.timestamp, + stepIndex: finiteNumberOrUndefined(input.stepIndex), + parentId: stringOrUndefined(input.parentId), + payload: input.payload, + metadata: isRecord(input.metadata) ? { ...input.metadata } : undefined, + } +} + +function runtimeTrajectoryRecordKey( + record: TRecord, + index: number, + recordIdOf?: (record: TRecord, index: number) => string | undefined, +): string { + return ( + stringOrUndefined(recordIdOf?.(record, index)) ?? + stringOrUndefined(record.id) ?? + `record[${index}]` + ) +} + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value) +} + +function stringOrUndefined(value: unknown): string | undefined { + return typeof value === 'string' && value.length > 0 ? value : undefined +} + +function finiteNumberOrUndefined(value: unknown): number | undefined { + return typeof value === 'number' && Number.isFinite(value) ? value : undefined +} diff --git a/tests/belief-state/runtime-benchmark-corpus.test.ts b/tests/belief-state/runtime-benchmark-corpus.test.ts index a8aeeeb..caee096 100644 --- a/tests/belief-state/runtime-benchmark-corpus.test.ts +++ b/tests/belief-state/runtime-benchmark-corpus.test.ts @@ -31,14 +31,16 @@ describe('runtime benchmark corpus belief-state projection', () => { { runId: 'commit0:task-1:1', scenarioId: 'task-1', splitTag: 'search' }, ]) expect(report.events).toHaveLength(6) - expect(report.summary).toMatchObject({ + expect(report.trajectory.summary).toEqual({ recordCount: 2, recordWithRuntimeEventsCount: 2, runtimeRunCount: 2, lifecycleEventCount: 6, + defaultedSplitCount: 2, + }) + expect(report.summary).toEqual({ decisionCount: 0, labelCount: 0, - defaultedSplitCount: 2, }) expect(report.measurement.points).toEqual([]) expect(report.measurement.summary.packetStatus).toBe('blocked') diff --git a/tests/runtime-trajectory.test.ts b/tests/runtime-trajectory.test.ts new file mode 100644 index 0000000..e9b2c0e --- /dev/null +++ b/tests/runtime-trajectory.test.ts @@ -0,0 +1,127 @@ +import { describe, expect, it } from 'vitest' +import { + parseRuntimeTrajectoryHookEvent, + projectRuntimeTrajectoryEvidence, +} from '../src/runtime-trajectory' + +describe('runtime trajectory evidence projection', () => { + it('extracts run joins and lifecycle events from benchmark records', () => { + const projection = projectRuntimeTrajectoryEvidence({ + records: [ + { + id: 'commit0:task-1:0', + scenarioId: 'task-1', + runtimeEvents: runtimeEvents('commit0:task-1:0', undefined, 0), + }, + { + id: 'commit0:task-1:1', + scenarioId: 'task-1', + runtimeEvents: runtimeEvents('commit0:task-1:1', undefined, 1), + }, + ], + defaultSplitTag: 'holdout', + }) + + expect(projection.runs).toEqual([ + { runId: 'commit0:task-1:0', scenarioId: 'task-1', splitTag: 'holdout' }, + { runId: 'commit0:task-1:1', scenarioId: 'task-1', splitTag: 'holdout' }, + ]) + expect(projection.events).toHaveLength(6) + expect(projection.summary).toEqual({ + recordCount: 2, + recordWithRuntimeEventsCount: 2, + runtimeRunCount: 2, + lifecycleEventCount: 6, + defaultedSplitCount: 2, + }) + expect(projection.diagnostics).toEqual([]) + }) + + it('diagnoses missing and malformed runtime event arrays without inventing runs', () => { + const projection = projectRuntimeTrajectoryEvidence({ + records: [ + { + id: 'case-empty', + runtimeEvents: [], + }, + { + id: 'case-not-array', + runtimeEvents: 'bad', + }, + { + id: 'case-bad', + runtimeEvents: [{ id: 'bad' }], + }, + ], + }) + + expect(projection.runs).toEqual([]) + expect(projection.events).toEqual([]) + expect(projection.diagnostics).toEqual([ + 'case-empty: no runtimeEvents; no runtime run join can be extracted', + 'case-not-array: runtimeEvents is not an array; no runtime run join can be extracted', + 'case-bad: runtimeEvents[0] is not a RuntimeHookEvent', + ]) + }) + + it('parses only structurally complete runtime hook events', () => { + expect(parseRuntimeTrajectoryHookEvent({ id: 'bad' })).toBeNull() + expect( + parseRuntimeTrajectoryHookEvent({ + id: 'event-1', + runId: 'run-1', + target: 'agent.turn', + phase: 'after', + timestamp: 1, + stepIndex: 0, + payload: { eventCount: 1 }, + metadata: { producer: 'openSandboxRun' }, + }), + ).toEqual({ + id: 'event-1', + runId: 'run-1', + target: 'agent.turn', + phase: 'after', + timestamp: 1, + stepIndex: 0, + payload: { eventCount: 1 }, + metadata: { producer: 'openSandboxRun' }, + }) + }) +}) + +function runtimeEvents(runId: string, scenarioId: string | undefined, stepIndex: number) { + return [ + { + id: `${runId}:agent.run:before`, + runId, + scenarioId, + target: 'agent.run', + phase: 'before', + timestamp: 1_788_624_000_000, + metadata: { producer: 'openSandboxRun' }, + }, + { + id: `${runId}:agent.turn:before:${stepIndex}`, + runId, + scenarioId, + target: 'agent.turn', + phase: 'before', + timestamp: 1_788_624_000_001, + stepIndex, + payload: { promptHash: 'abc123' }, + metadata: { producer: 'openSandboxRun' }, + }, + { + id: `${runId}:agent.turn:after:${stepIndex}`, + runId, + scenarioId, + target: 'agent.turn', + phase: 'after', + timestamp: 1_788_624_000_002, + stepIndex, + payload: { eventCount: 1, eventTypes: { result: 1 } }, + metadata: { producer: 'openSandboxRun' }, + }, + ] +}