/**
 * Trajectory record shape accepted from a benchmark corpus: a
 * `RuntimeTrajectoryRecord` plus optional benchmark identity fields.
 *
 * The identity fields are typed `unknown`; this module's id helpers only use
 * them when they hold non-empty strings.
 */
type RuntimeBenchmarkTrajectoryRecord = RuntimeTrajectoryRecord & {
  /** First segment of the derived record id. */
  benchmark?: unknown
  /** Last segment of the derived record id. */
  condition?: unknown
  /** Middle segment of the derived record id; also offered as the scenario id. */
  instanceId?: unknown
}

/**
 * Options for `buildRuntimeBenchmarkBeliefPhase0Measurement`.
 *
 * Inherits every Phase 0 measurement option except `runs`, `events`,
 * `decisions` and `labels`. `runs` and `events` are derived from `records`;
 * `decisions` and `labels` are re-declared here as optional.
 */
export interface BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions
  extends Omit<
    BuildRuntimeBeliefPhase0MeasurementOptions,
    'runs' | 'events' | 'decisions' | 'labels'
  > {
  /** Benchmark records whose `runtimeEvents` are projected into runs and events. */
  records: RuntimeBenchmarkTrajectoryRecord[]
  /** Decision points to measure; treated as empty when omitted. */
  decisions?: RuntimeBeliefDecisionPoint[]
  /** Split tag handed to the trajectory projection for records without one. */
  defaultSplitTag?: ProjectRuntimeTrajectoryEvidenceOptions['defaultSplitTag']
  /** Labels for the decision points; treated as empty when omitted. */
  labels?: RuntimeBeliefDecisionLabel[]
}

/** Counts of the decisions and labels that were fed into the measurement. */
export interface RuntimeBenchmarkBeliefPhase0Summary {
  decisionCount: number
  labelCount: number
}

/** Result of `buildRuntimeBenchmarkBeliefPhase0Measurement`. */
export interface RuntimeBenchmarkBeliefPhase0Measurement {
  /** Runs taken from the trajectory projection. */
  runs: RuntimeBeliefPhase0RunRecord[]
  /** Lifecycle events taken from the trajectory projection. */
  events: RuntimeBeliefHookEvent[]
  /** Decision points used for the measurement (caller-supplied or empty). */
  decisions: RuntimeBeliefDecisionPoint[]
  /** Decision labels used for the measurement (caller-supplied or empty). */
  labels: RuntimeBeliefDecisionLabel[]
  /** The full trajectory projection, including its own summary and diagnostics. */
  trajectory: RuntimeTrajectoryEvidenceProjection
  /** The Phase 0 measurement built from the fields above. */
  measurement: RuntimeBeliefPhase0Measurement
  summary: RuntimeBenchmarkBeliefPhase0Summary
  /** Trajectory diagnostics, then this builder's own, then the measurement's. */
  diagnostics: string[]
}
/**
 * Builds a Phase 0 belief measurement from benchmark trajectory records.
 *
 * Runs and lifecycle events are projected from `options.records`; decision
 * points and labels are taken from the caller as-is and default to empty
 * arrays. Missing decisions or labels are reported as diagnostics rather than
 * thrown.
 *
 * @param options - Benchmark records plus the Phase 0 measurement settings.
 * @returns The projected runs and events, the decisions and labels that were
 *   used, the raw trajectory projection, the Phase 0 measurement, a count
 *   summary, and the combined diagnostics (trajectory first, then this
 *   function's own, then the measurement's).
 */
export function buildRuntimeBenchmarkBeliefPhase0Measurement(
  options: BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions,
): RuntimeBenchmarkBeliefPhase0Measurement {
  // Peel off the corpus-only keys so that only real Phase 0 options are
  // forwarded to the measurement builder further down.
  const {
    records,
    defaultSplitTag,
    decisions: suppliedDecisions,
    labels: suppliedLabels,
    ...measurementOptions
  } = options

  const trajectory = projectRuntimeTrajectoryEvidence({
    records,
    defaultSplitTag,
    recordIdOf: runtimeBenchmarkRecordId,
    scenarioIdOf: runtimeBenchmarkScenarioId,
  })

  const decisions = suppliedDecisions ?? []
  const labels = suppliedLabels ?? []

  const diagnostics: string[] = []
  if (decisions.length === 0) {
    diagnostics.push(
      'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows',
    )
  }
  // Only worth flagging when there are decisions that the labels could have joined to.
  if (labels.length === 0 && decisions.length > 0) {
    diagnostics.push(
      'no decision labels supplied; observed action/outcome joins will be incomplete',
    )
  }

  const measurement = buildRuntimeBeliefPhase0Measurement({
    ...measurementOptions,
    runs: trajectory.runs,
    events: trajectory.events,
    decisions,
    labels,
  })

  return {
    runs: trajectory.runs,
    events: trajectory.events,
    decisions,
    labels,
    trajectory,
    measurement,
    summary: {
      decisionCount: decisions.length,
      labelCount: labels.length,
    },
    diagnostics: [...trajectory.diagnostics, ...diagnostics, ...measurement.diagnostics],
  }
}

/**
 * Derives a record id by joining the non-empty string values of `benchmark`,
 * `instanceId` and `condition` (in that order) with `:`.
 *
 * @returns The joined id, or `undefined` when none of the three is a non-empty string.
 */
function runtimeBenchmarkRecordId(record: RuntimeBenchmarkTrajectoryRecord): string | undefined {
  const parts = [
    nonEmptyString(record.benchmark),
    nonEmptyString(record.instanceId),
    nonEmptyString(record.condition),
  ].filter((part): part is string => part !== undefined)
  return parts.length > 0 ? parts.join(':') : undefined
}

/** Uses the record's `instanceId` as its scenario id when it is a non-empty string. */
function runtimeBenchmarkScenarioId(record: RuntimeBenchmarkTrajectoryRecord): string | undefined {
  return nonEmptyString(record.instanceId)
}

/** Returns `value` when it is a string of length > 0, otherwise `undefined`. */
function nonEmptyString(value: unknown): string | undefined {
  return typeof value === 'string' && value.length > 0 ? value : undefined
}
/**
 * A single runtime lifecycle hook event after validation by
 * `parseRuntimeTrajectoryHookEvent`.
 */
export interface RuntimeTrajectoryHookEvent {
  id: string
  /** Run the event belongs to; events are grouped into runs by this value. */
  runId: string
  scenarioId?: string
  target: string
  phase: string
  /** Finite number; the unit is not established by this module. */
  timestamp: number
  stepIndex?: number
  parentId?: string
  /** Passed through unvalidated. */
  payload?: unknown
  /** Shallow copy of the input's `metadata` when that was a plain object. */
  metadata?: Record<string, unknown>
}

/**
 * Loosely typed input record. Only `id`, `scenarioId`, `splitTag` and
 * `runtimeEvents` are read by the projection; any other keys are allowed.
 */
export interface RuntimeTrajectoryRecord {
  id?: string
  scenarioId?: string
  splitTag?: RunSplitTag
  /** Expected to be an array of hook-event-like objects; validated at projection time. */
  runtimeEvents?: unknown
  [key: string]: unknown
}

/** One distinct `runId` seen in the events, with the scenario and split it was first seen under. */
export interface RuntimeTrajectoryRunRecord {
  runId: string
  scenarioId?: string
  splitTag: RunSplitTag
}

/** Counters describing one projection pass. */
export interface RuntimeTrajectoryEvidenceSummary {
  /** Number of input records. */
  recordCount: number
  /** Records whose `runtimeEvents` was a non-empty array. */
  recordWithRuntimeEventsCount: number
  /** Distinct `runId` values among the parsed events. */
  runtimeRunCount: number
  /** Events that parsed successfully. */
  lifecycleEventCount: number
  /** Records that carried no `splitTag` of their own. */
  defaultedSplitCount: number
}

/** Output of `projectRuntimeTrajectoryEvidence`. */
export interface RuntimeTrajectoryEvidenceProjection {
  runs: RuntimeTrajectoryRunRecord[]
  events: RuntimeTrajectoryHookEvent[]
  summary: RuntimeTrajectoryEvidenceSummary
  /** Human-readable notes about skipped records, rejected events and conflicting run metadata. */
  diagnostics: string[]
}
/**
 * Options for `projectRuntimeTrajectoryEvidence`.
 *
 * @typeParam TRecord - Concrete record type, so the id callbacks receive the
 *   caller's own record shape.
 */
export interface ProjectRuntimeTrajectoryEvidenceOptions<
  TRecord extends RuntimeTrajectoryRecord = RuntimeTrajectoryRecord,
> {
  /** Records to scan for `runtimeEvents`. */
  records: TRecord[]
  /** Split tag applied to records that carry no `splitTag` of their own. */
  defaultSplitTag?: RunSplitTag
  /** Optional override for the record label used in diagnostics. */
  recordIdOf?: (record: TRecord, index: number) => string | undefined
  /** Optional scenario id, consulted when an event has no `scenarioId` of its own. */
  scenarioIdOf?: (record: TRecord, index: number) => string | undefined
}

/** Split tag used when neither the record nor the options supply one. */
const DEFAULT_SPLIT_TAG: RunSplitTag = 'search'

/**
 * Extracts runtime runs and lifecycle events from trajectory records.
 *
 * Records whose `runtimeEvents` is not an array, or is an empty array, are
 * skipped with a diagnostic. Individual entries that fail
 * `parseRuntimeTrajectoryHookEvent` are skipped with a diagnostic as well.
 * Each distinct `runId` yields one run; the first event seen for a run fixes
 * its scenario id and split tag, and later disagreement is reported as a
 * diagnostic without changing the stored run.
 *
 * @param options - Records plus optional split default and id callbacks.
 * @returns Runs, parsed events, summary counters and diagnostics.
 */
export function projectRuntimeTrajectoryEvidence<TRecord extends RuntimeTrajectoryRecord>(
  options: ProjectRuntimeTrajectoryEvidenceOptions<TRecord>,
): RuntimeTrajectoryEvidenceProjection {
  const diagnostics: string[] = []
  const runsById = new Map<string, RuntimeTrajectoryRunRecord>()
  const events: RuntimeTrajectoryHookEvent[] = []
  let recordWithRuntimeEventsCount = 0
  let defaultedSplitCount = 0

  for (const [recordIndex, record] of options.records.entries()) {
    const key = runtimeTrajectoryRecordKey(record, recordIndex, options.recordIdOf)
    const splitTag = record.splitTag ?? options.defaultSplitTag ?? DEFAULT_SPLIT_TAG
    // Counted before the runtimeEvents checks, so skipped records are included.
    if (record.splitTag === undefined) defaultedSplitCount += 1

    const rawEvents = record.runtimeEvents
    if (!Array.isArray(rawEvents)) {
      diagnostics.push(
        `${key}: runtimeEvents is not an array; no runtime run join can be extracted`,
      )
      continue
    }
    if (rawEvents.length === 0) {
      diagnostics.push(`${key}: no runtimeEvents; no runtime run join can be extracted`)
      continue
    }
    recordWithRuntimeEventsCount += 1

    for (const [eventIndex, rawEvent] of rawEvents.entries()) {
      const event = parseRuntimeTrajectoryHookEvent(rawEvent)
      if (!event) {
        diagnostics.push(`${key}: runtimeEvents[${eventIndex}] is not a RuntimeHookEvent`)
        continue
      }
      events.push(event)

      // Precedence: the event's own scenario, then the callback, then the record field.
      const scenarioId =
        event.scenarioId ??
        stringOrUndefined(options.scenarioIdOf?.(record, recordIndex)) ??
        stringOrUndefined(record.scenarioId)
      const prior = runsById.get(event.runId)
      if (!prior) {
        runsById.set(event.runId, { runId: event.runId, scenarioId, splitTag })
        continue
      }
      if (prior.scenarioId !== scenarioId || prior.splitTag !== splitTag) {
        diagnostics.push(`${key}: runId ${event.runId} has conflicting scenario/split metadata`)
      }
    }
  }

  const runs = [...runsById.values()]
  return {
    runs,
    events,
    summary: {
      recordCount: options.records.length,
      recordWithRuntimeEventsCount,
      runtimeRunCount: runs.length,
      lifecycleEventCount: events.length,
      defaultedSplitCount,
    },
    diagnostics,
  }
}

/**
 * Validates an unknown value as a runtime hook event.
 *
 * @param input - Candidate event, typically one entry of `runtimeEvents`.
 * @returns A normalized event, or `null` unless `id`, `runId`, `target` and
 *   `phase` are non-empty strings and `timestamp` is a finite number.
 *   Optional fields with the wrong type become `undefined`, `payload` is
 *   passed through, and `metadata` is shallow-copied when it is a plain object.
 */
export function parseRuntimeTrajectoryHookEvent(input: unknown): RuntimeTrajectoryHookEvent | null {
  if (!isRecord(input)) return null
  if (typeof input.id !== 'string' || input.id.length === 0) return null
  if (typeof input.runId !== 'string' || input.runId.length === 0) return null
  if (typeof input.target !== 'string' || input.target.length === 0) return null
  if (typeof input.phase !== 'string' || input.phase.length === 0) return null
  if (typeof input.timestamp !== 'number' || !Number.isFinite(input.timestamp)) return null

  return {
    id: input.id,
    runId: input.runId,
    scenarioId: stringOrUndefined(input.scenarioId),
    target: input.target,
    phase: input.phase,
    timestamp: input.timestamp,
    stepIndex: finiteNumberOrUndefined(input.stepIndex),
    parentId: stringOrUndefined(input.parentId),
    payload: input.payload,
    metadata: isRecord(input.metadata) ? { ...input.metadata } : undefined,
  }
}

/**
 * Picks the label used for a record in diagnostics: the `recordIdOf` result,
 * else the record's own `id`, else `record[<index>]`. Empty strings are
 * treated as missing.
 */
function runtimeTrajectoryRecordKey<TRecord extends RuntimeTrajectoryRecord>(
  record: TRecord,
  index: number,
  recordIdOf?: (record: TRecord, index: number) => string | undefined,
): string {
  return (
    stringOrUndefined(recordIdOf?.(record, index)) ??
    stringOrUndefined(record.id) ??
    `record[${index}]`
  )
}

/** True for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value)
}

/** Returns `value` when it is a non-empty string, otherwise `undefined`. */
function stringOrUndefined(value: unknown): string | undefined {
  return typeof value === 'string' && value.length > 0 ? value : undefined
}

/** Returns `value` when it is a finite number, otherwise `undefined`. */
function finiteNumberOrUndefined(value: unknown): number | undefined {
  return typeof value === 'number' && Number.isFinite(value) ? value : undefined
}
/**
 * Test fixture: the three lifecycle events recorded for one agent run —
 * run start, turn start and turn end — all carrying the same run and
 * scenario ids, with timestamps one unit apart.
 */
function runtimeEvents(runId: string, scenarioId: string, stepIndex: number) {
  const startedAt = 1_788_624_000_000

  const runBefore = {
    id: `${runId}:agent.run:before`,
    runId,
    scenarioId,
    target: 'agent.run',
    phase: 'before',
    timestamp: startedAt,
    metadata: { producer: 'openSandboxRun' },
  }

  const turnBefore = {
    id: `${runId}:agent.turn:before:${stepIndex}`,
    runId,
    scenarioId,
    target: 'agent.turn',
    phase: 'before',
    timestamp: startedAt + 1,
    stepIndex,
    payload: { promptHash: 'abc123' },
    metadata: { producer: 'openSandboxRun' },
  }

  const turnAfter = {
    id: `${runId}:agent.turn:after:${stepIndex}`,
    runId,
    scenarioId,
    target: 'agent.turn',
    phase: 'after',
    timestamp: startedAt + 2,
    stepIndex,
    payload: { eventCount: 1, eventTypes: { result: 1 } },
    metadata: { producer: 'openSandboxRun' },
  }

  return [runBefore, turnBefore, turnAfter]
}
/**
 * Test fixture: builds the `index`-th retry decision point for run
 * `commit0:task-1:0` / scenario `task-1`.
 *
 * The id, step index, context string and single evidence reference all vary
 * with `index`; everything else is constant.
 */
function decision(index: number): RuntimeBeliefDecisionPoint {
  return {
    // Same run id the suite passes to runtimeEvents() for its lifecycle events.
    id: `commit0:task-1:0:agent.turn:${index}:failure-recovery`,
    runId: 'commit0:task-1:0',
    scenarioId: 'task-1',
    stepIndex: index,
    kind: 'retry',
    candidateActions: ['retry', 'verify', 'continue', 'stop'],
    context: `failed patch attempt ${index}`,
    evidence: [
      {
        source: 'event',
        id: `attempt-${index}`,
        detail: `attempt ${index}`,
        quality: 'direct',
      },
    ],
    // Matches the `targetId` the suite passes to the measurement builder.
    metadata: { target: 'failure-recovery' },
  }
}
/**
 * Test fixture: run-start, turn-start and turn-end events for a single run.
 * `scenarioId` may be `undefined`; the key is still present on every event.
 */
function runtimeEvents(runId: string, scenarioId: string | undefined, stepIndex: number) {
  const runJoin = { runId, scenarioId }
  const producedBy = () => ({ producer: 'openSandboxRun' })
  const firstTimestamp = 1_788_624_000_000

  return [
    {
      id: `${runId}:agent.run:before`,
      ...runJoin,
      target: 'agent.run',
      phase: 'before',
      timestamp: firstTimestamp,
      metadata: producedBy(),
    },
    {
      id: `${runId}:agent.turn:before:${stepIndex}`,
      ...runJoin,
      target: 'agent.turn',
      phase: 'before',
      timestamp: firstTimestamp + 1,
      stepIndex,
      payload: { promptHash: 'abc123' },
      metadata: producedBy(),
    },
    {
      id: `${runId}:agent.turn:after:${stepIndex}`,
      ...runJoin,
      target: 'agent.turn',
      phase: 'after',
      timestamp: firstTimestamp + 2,
      stepIndex,
      payload: { eventCount: 1, eventTypes: { result: 1 } },
      metadata: producedBy(),
    },
  ]
}