From 52bddba1957503c4a841fc2cb76a24efc2d023ba Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 6 Jun 2026 21:15:12 +0300
Subject: [PATCH 1/2] feat(belief-state): project runtime benchmark corpus
 evidence

---
 src/belief-state/index.ts                     |   1 +
 src/belief-state/runtime-benchmark-corpus.ts  | 193 ++++++++++++++++++
 .../runtime-benchmark-corpus.test.ts          | 189 +++++++++++++++++
 3 files changed, 383 insertions(+)
 create mode 100644 src/belief-state/runtime-benchmark-corpus.ts
 create mode 100644 tests/belief-state/runtime-benchmark-corpus.test.ts

diff --git a/src/belief-state/index.ts b/src/belief-state/index.ts
index e8686ad..7ed7972 100644
--- a/src/belief-state/index.ts
+++ b/src/belief-state/index.ts
@@ -6,6 +6,7 @@ export * from './ope'
 export * from './phase0-measurement'
 export * from './report'
 export * from './research-evidence'
+export * from './runtime-benchmark-corpus'
 export * from './runtime-hooks'
 export * from './selective'
 export * from './shadow-probe'
diff --git a/src/belief-state/runtime-benchmark-corpus.ts b/src/belief-state/runtime-benchmark-corpus.ts
new file mode 100644
index 0000000..576774f
--- /dev/null
+++ b/src/belief-state/runtime-benchmark-corpus.ts
@@ -0,0 +1,193 @@
+import type { RunSplitTag } from '../run-record'
+import {
+  type BuildRuntimeBeliefPhase0MeasurementOptions,
+  buildRuntimeBeliefPhase0Measurement,
+  type RuntimeBeliefDecisionLabel,
+  type RuntimeBeliefPhase0Measurement,
+  type RuntimeBeliefPhase0RunRecord,
+} from './phase0-measurement'
+import type { RuntimeBeliefDecisionPoint, RuntimeBeliefHookEvent } from './runtime-hooks'
+
+export interface RuntimeBenchmarkBeliefAttemptRecord {
+  round: number
+  prompt?: string
+  output?: string
+  valid?: boolean
+  score?: number
+  costUsd?: number
+  tokensIn?: number
+  tokensOut?: number
+  wallMs?: number
+  eventCount?: number
+  eventTypes?: Record<string, number>
+  traceTail?: string
+  error?: string
+}
+
+export interface RuntimeBenchmarkBeliefRecord {
+  benchmark: string
+  instanceId: string
+  condition: string
+  model?: string
+  blindResolved?: boolean
+  resolved?: boolean
+  attempts?: RuntimeBenchmarkBeliefAttemptRecord[]
+  infraError?: boolean
+  seed?: number
+  splitTag?: RunSplitTag
+  commitSha?: string
+  runtimeEvents?: unknown[]
+}
+
+export interface BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions
+  extends Omit<
+    BuildRuntimeBeliefPhase0MeasurementOptions,
+    'runs' | 'events' | 'decisions' | 'labels'
+  > {
+  records: RuntimeBenchmarkBeliefRecord[]
+  decisions?: RuntimeBeliefDecisionPoint[]
+  labels?: RuntimeBeliefDecisionLabel[]
+  defaultSplitTag?: RunSplitTag
+}
+
+export interface RuntimeBenchmarkBeliefPhase0Summary {
+  recordCount: number
+  recordWithRuntimeEventsCount: number
+  runtimeRunCount: number
+  lifecycleEventCount: number
+  decisionCount: number
+  labelCount: number
+  defaultedSplitCount: number
+}
+
+export interface RuntimeBenchmarkBeliefPhase0Measurement {
+  runs: RuntimeBeliefPhase0RunRecord[]
+  events: RuntimeBeliefHookEvent[]
+  decisions: RuntimeBeliefDecisionPoint[]
+  labels: RuntimeBeliefDecisionLabel[]
+  measurement: RuntimeBeliefPhase0Measurement
+  summary: RuntimeBenchmarkBeliefPhase0Summary
+  diagnostics: string[]
+}
+
+const DEFAULT_SPLIT_TAG: RunSplitTag = 'search'
+
+export function buildRuntimeBenchmarkBeliefPhase0Measurement(
+  options: BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions,
+): RuntimeBenchmarkBeliefPhase0Measurement {
+  const diagnostics: string[] = []
+  const runsById = new Map<string, RuntimeBeliefPhase0RunRecord>()
+  const events: RuntimeBeliefHookEvent[] = []
+  let recordWithRuntimeEventsCount = 0
+  let defaultedSplitCount = 0
+
+  for (const record of options.records) {
+    const key = benchmarkRecordKey(record)
+    const splitTag = record.splitTag ?? options.defaultSplitTag ?? DEFAULT_SPLIT_TAG
+    if (record.splitTag === undefined) defaultedSplitCount += 1
+
+    const rawEvents = record.runtimeEvents ?? []
+    if (rawEvents.length === 0) {
+      diagnostics.push(`${key}: no runtimeEvents; no runtime run join can be extracted`)
+      continue
+    }
+    recordWithRuntimeEventsCount += 1
+
+    for (let index = 0; index < rawEvents.length; index += 1) {
+      const event = parseRuntimeHookEvent(rawEvents[index])
+      if (!event) {
+        diagnostics.push(`${key}: runtimeEvents[${index}] is not a RuntimeHookEvent`)
+        continue
+      }
+      events.push(event)
+
+      const scenarioId = event.scenarioId ?? record.instanceId
+      const prior = runsById.get(event.runId)
+      if (!prior) {
+        runsById.set(event.runId, { runId: event.runId, scenarioId, splitTag })
+        continue
+      }
+      if (prior.scenarioId !== scenarioId || prior.splitTag !== splitTag) {
+        diagnostics.push(`${key}: runId ${event.runId} has conflicting scenario/split metadata`)
+      }
+    }
+  }
+
+  const decisions = options.decisions ?? []
+  const labels = options.labels ?? []
+  if (decisions.length === 0) {
+    diagnostics.push(
+      'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows',
+    )
+  }
+  if (labels.length === 0 && decisions.length > 0) {
+    diagnostics.push(
+      'no decision labels supplied; observed action/outcome joins will be incomplete',
+    )
+  }
+
+  const runs = [...runsById.values()]
+  const measurement = buildRuntimeBeliefPhase0Measurement({
+    ...options,
+    runs,
+    events,
+    decisions,
+    labels,
+  })
+
+  return {
+    runs,
+    events,
+    decisions,
+    labels,
+    measurement,
+    summary: {
+      recordCount: options.records.length,
+      recordWithRuntimeEventsCount,
+      runtimeRunCount: runs.length,
+      lifecycleEventCount: events.length,
+      decisionCount: decisions.length,
+      labelCount: labels.length,
+      defaultedSplitCount,
+    },
+    diagnostics: [...diagnostics, ...measurement.diagnostics],
+  }
+}
+
+function benchmarkRecordKey(record: RuntimeBenchmarkBeliefRecord): string {
+  return `${record.benchmark}:${record.instanceId}:${record.condition}`
+}
+
+function parseRuntimeHookEvent(input: unknown): RuntimeBeliefHookEvent | null {
+  if (!isRecord(input)) return null
+  if (typeof input.id !== 'string' || input.id.length === 0) return null
+  if (typeof input.runId !== 'string' || input.runId.length === 0) return null
+  if (typeof input.target !== 'string' || input.target.length === 0) return null
+  if (typeof input.phase !== 'string' || input.phase.length === 0) return null
+  if (typeof input.timestamp !== 'number' || !Number.isFinite(input.timestamp)) return null
+
+  return {
+    id: input.id,
+    runId: input.runId,
+    scenarioId: stringOrUndefined(input.scenarioId),
+    target: input.target,
+    phase: input.phase,
+    timestamp: input.timestamp,
+    stepIndex: finiteNumberOrUndefined(input.stepIndex),
+    parentId: stringOrUndefined(input.parentId),
+    payload: input.payload,
+    metadata: isRecord(input.metadata) ? { ...input.metadata } : undefined,
+  }
+}
+
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === 'object' && value !== null && !Array.isArray(value)
+}
+
+function stringOrUndefined(value: unknown): string | undefined {
+  return typeof value === 'string' && value.length > 0 ? value : undefined
+}
+
+function finiteNumberOrUndefined(value: unknown): number | undefined {
+  return typeof value === 'number' && Number.isFinite(value) ? value : undefined
+}
diff --git a/tests/belief-state/runtime-benchmark-corpus.test.ts b/tests/belief-state/runtime-benchmark-corpus.test.ts
new file mode 100644
index 0000000..a8aeeeb
--- /dev/null
+++ b/tests/belief-state/runtime-benchmark-corpus.test.ts
@@ -0,0 +1,189 @@
+import { describe, expect, it } from 'vitest'
+import { buildRuntimeBenchmarkBeliefPhase0Measurement } from '../../src/belief-state/runtime-benchmark-corpus'
+import type { RuntimeBeliefDecisionPoint } from '../../src/belief-state/runtime-hooks'
+
+describe('runtime benchmark corpus belief-state projection', () => {
+  it('extracts lifecycle run evidence while keeping policy claims blocked without decisions', () => {
+    const report = buildRuntimeBenchmarkBeliefPhase0Measurement({
+      records: [
+        {
+          benchmark: 'commit0',
+          instanceId: 'task-1',
+          condition: 'random@2',
+          model: 'gpt-5',
+          runtimeEvents: runtimeEvents('commit0:task-1:0', 'task-1', 0),
+        },
+        {
+          benchmark: 'commit0',
+          instanceId: 'task-1',
+          condition: 'random@2',
+          model: 'gpt-5',
+          runtimeEvents: runtimeEvents('commit0:task-1:1', 'task-1', 1),
+        },
+      ],
+      targetId: 'failure-recovery',
+      minN: 2,
+      claimScope: 'selective',
+    })
+
+    expect(report.runs).toEqual([
+      { runId: 'commit0:task-1:0', scenarioId: 'task-1', splitTag: 'search' },
+      { runId: 'commit0:task-1:1', scenarioId: 'task-1', splitTag: 'search' },
+    ])
+    expect(report.events).toHaveLength(6)
+    expect(report.summary).toMatchObject({
+      recordCount: 2,
+      recordWithRuntimeEventsCount: 2,
+      runtimeRunCount: 2,
+      lifecycleEventCount: 6,
+      decisionCount: 0,
+      labelCount: 0,
+      defaultedSplitCount: 2,
+    })
+    expect(report.measurement.points).toEqual([])
+    expect(report.measurement.summary.packetStatus).toBe('blocked')
+    expect(report.diagnostics).toContain(
+      'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows',
+    )
+  })
+
+  it('feeds explicit runtime decisions and labels into the existing Phase 0 measurement', () => {
+    const decisions = Array.from({ length: 12 }, (_, index) => decision(index))
+    const labels = decisions.map((item, index) => ({
+      decisionId: item.id,
+      chosenAction: index % 2 === 0 ? 'verify' : 'stop',
+      confidence: index % 2 === 0 ? 0.8 : 0.4,
+      outcome: { success: index % 2 === 0, score: index % 2 === 0 ? 1 : 0 },
+    }))
+
+    const report = buildRuntimeBenchmarkBeliefPhase0Measurement({
+      records: [
+        {
+          benchmark: 'commit0',
+          instanceId: 'task-1',
+          condition: 'random@2',
+          model: 'gpt-5',
+          splitTag: 'holdout',
+          runtimeEvents: [...runtimeEvents('commit0:task-1:0', 'task-1', 0)],
+        },
+      ],
+      decisions,
+      labels,
+      targetId: 'failure-recovery',
+      minN: 12,
+      minAccepted: 6,
+      confidenceThreshold: 0.6,
+      claimScope: 'selective',
+    })
+
+    expect(report.diagnostics).toEqual([])
+    expect(report.measurement.points).toHaveLength(12)
+    expect(report.measurement.summary).toMatchObject({
+      completedPointCount: 12,
+      lifecycleEventCount: 3,
+      packetStatus: 'supported',
+      claimScope: 'selective',
+    })
+    expect(report.measurement.points[0]).toMatchObject({
+      id: 'commit0:task-1:0:agent.turn:0:failure-recovery',
+      runId: 'commit0:task-1:0',
+      scenarioId: 'task-1',
+      chosenAction: 'verify',
+      metadata: { splitTag: 'holdout', lifecycleEventCount: 3 },
+    })
+    expect(report.measurement.points[0]?.evidence.map((ref) => ref.id)).toEqual(
+      expect.arrayContaining([
+        'commit0:task-1:0:agent.run:before',
+        'commit0:task-1:0:agent.turn:after:0',
+      ]),
+    )
+    expect(report.measurement.packet.analysis.evaluation?.selectiveStatus).toBe('ship')
+  })
+
+  it('diagnoses missing or malformed runtime event arrays without inventing runs', () => {
+    const report = buildRuntimeBenchmarkBeliefPhase0Measurement({
+      records: [
+        {
+          benchmark: 'swe-bench',
+          instanceId: 'case-empty',
+          condition: 'blind@1',
+          runtimeEvents: [],
+        },
+        {
+          benchmark: 'swe-bench',
+          instanceId: 'case-bad',
+          condition: 'blind@1',
+          runtimeEvents: [{ id: 'bad' }],
+        },
+      ],
+      targetId: 'failure-recovery',
+      minN: 1,
+      claimScope: 'selective',
+    })
+
+    expect(report.runs).toEqual([])
+    expect(report.events).toEqual([])
+    expect(report.diagnostics).toEqual([
+      'swe-bench:case-empty:blind@1: no runtimeEvents; no runtime run join can be extracted',
+      'swe-bench:case-bad:blind@1: runtimeEvents[0] is not a RuntimeHookEvent',
+      'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows',
+    ])
+  })
+})
+
+/** Fixture: the run-start, turn-start, and turn-end hook events of a single-turn run. */
+function runtimeEvents(runId: string, scenarioId: string, stepIndex: number) {
+  const runStarted = {
+    id: `${runId}:agent.run:before`,
+    runId,
+    scenarioId,
+    target: 'agent.run',
+    phase: 'before',
+    timestamp: 1_788_624_000_000,
+    metadata: { producer: 'openSandboxRun' },
+  }
+  const turnStarted = {
+    id: `${runId}:agent.turn:before:${stepIndex}`,
+    runId,
+    scenarioId,
+    target: 'agent.turn',
+    phase: 'before',
+    timestamp: 1_788_624_000_001,
+    stepIndex,
+    payload: { promptHash: 'abc123' },
+    metadata: { producer: 'openSandboxRun' },
+  }
+  const turnEnded = {
+    id: `${runId}:agent.turn:after:${stepIndex}`,
+    runId,
+    scenarioId,
+    target: 'agent.turn',
+    phase: 'after',
+    timestamp: 1_788_624_000_002,
+    stepIndex,
+    payload: { eventCount: 1, eventTypes: { result: 1 } },
+    metadata: { producer: 'openSandboxRun' },
+  }
+  return [runStarted, turnStarted, turnEnded]
+}
+
+/**
+ * Fixture: the retry decision point at step `index` of run `commit0:task-1:0`.
+ */
+function decision(index: number): RuntimeBeliefDecisionPoint {
+  const runId = 'commit0:task-1:0'
+  const point: RuntimeBeliefDecisionPoint = {
+    id: `${runId}:agent.turn:${index}:failure-recovery`,
+    runId,
+    scenarioId: 'task-1',
+    stepIndex: index,
+    kind: 'retry',
+    candidateActions: ['retry', 'verify', 'continue', 'stop'],
+    context: `failed patch attempt ${index}`,
+    evidence: [
+      { source: 'event', id: `attempt-${index}`, detail: `attempt ${index}`, quality: 'direct' },
+    ],
+    metadata: { target: 'failure-recovery' },
+  }
+  return point
+}

From d88dd940f49ba148dd8f2f88e95595a52ae85f65 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 6 Jun 2026 21:46:10 +0300
Subject: [PATCH 2/2] refactor(runtime-trajectory): split generic runtime
 evidence projection

---
 src/belief-state/runtime-benchmark-corpus.ts  | 153 ++++------------
 src/index.ts                                  |  12 ++
 src/runtime-trajectory.ts                     | 166 ++++++++++++++++++
 .../runtime-benchmark-corpus.test.ts          |   6 +-
 tests/runtime-trajectory.test.ts              | 127 ++++++++++++++
 5 files changed, 344 insertions(+), 120 deletions(-)
 create mode 100644 src/runtime-trajectory.ts
 create mode 100644 tests/runtime-trajectory.test.ts

diff --git a/src/belief-state/runtime-benchmark-corpus.ts b/src/belief-state/runtime-benchmark-corpus.ts
index 576774f..788603f 100644
--- a/src/belief-state/runtime-benchmark-corpus.ts
+++ b/src/belief-state/runtime-benchmark-corpus.ts
@@ -1,4 +1,9 @@
-import type { RunSplitTag } from '../run-record'
+import {
+  type ProjectRuntimeTrajectoryEvidenceOptions,
+  projectRuntimeTrajectoryEvidence,
+  type RuntimeTrajectoryEvidenceProjection,
+  type RuntimeTrajectoryRecord,
+} from '../runtime-trajectory'
 import {
   type BuildRuntimeBeliefPhase0MeasurementOptions,
   buildRuntimeBeliefPhase0Measurement,
@@ -8,35 +13,10 @@ import {
 } from './phase0-measurement'
 import type { RuntimeBeliefDecisionPoint, RuntimeBeliefHookEvent } from './runtime-hooks'
 
-export interface RuntimeBenchmarkBeliefAttemptRecord {
-  round: number
-  prompt?: string
-  output?: string
-  valid?: boolean
-  score?: number
-  costUsd?: number
-  tokensIn?: number
-  tokensOut?: number
-  wallMs?: number
-  eventCount?: number
-  eventTypes?: Record<string, number>
-  traceTail?: string
-  error?: string
-}
-
-export interface RuntimeBenchmarkBeliefRecord {
-  benchmark: string
-  instanceId: string
-  condition: string
-  model?: string
-  blindResolved?: boolean
-  resolved?: boolean
-  attempts?: RuntimeBenchmarkBeliefAttemptRecord[]
-  infraError?: boolean
-  seed?: number
-  splitTag?: RunSplitTag
-  commitSha?: string
-  runtimeEvents?: unknown[]
+type RuntimeBenchmarkTrajectoryRecord = RuntimeTrajectoryRecord & {
+  benchmark?: unknown
+  condition?: unknown
+  instanceId?: unknown
 }
 
 export interface BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions
@@ -44,20 +24,15 @@ export interface BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions
     BuildRuntimeBeliefPhase0MeasurementOptions,
     'runs' | 'events' | 'decisions' | 'labels'
   > {
-  records: RuntimeBenchmarkBeliefRecord[]
+  records: RuntimeBenchmarkTrajectoryRecord[]
   decisions?: RuntimeBeliefDecisionPoint[]
+  defaultSplitTag?: ProjectRuntimeTrajectoryEvidenceOptions['defaultSplitTag']
   labels?: RuntimeBeliefDecisionLabel[]
-  defaultSplitTag?: RunSplitTag
 }
 
 export interface RuntimeBenchmarkBeliefPhase0Summary {
-  recordCount: number
-  recordWithRuntimeEventsCount: number
-  runtimeRunCount: number
-  lifecycleEventCount: number
   decisionCount: number
   labelCount: number
-  defaultedSplitCount: number
 }
 
 export interface RuntimeBenchmarkBeliefPhase0Measurement {
@@ -65,54 +40,22 @@ export interface RuntimeBenchmarkBeliefPhase0Measurement {
   events: RuntimeBeliefHookEvent[]
   decisions: RuntimeBeliefDecisionPoint[]
   labels: RuntimeBeliefDecisionLabel[]
+  trajectory: RuntimeTrajectoryEvidenceProjection
   measurement: RuntimeBeliefPhase0Measurement
   summary: RuntimeBenchmarkBeliefPhase0Summary
   diagnostics: string[]
 }
 
-const DEFAULT_SPLIT_TAG: RunSplitTag = 'search'
-
 export function buildRuntimeBenchmarkBeliefPhase0Measurement(
   options: BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions,
 ): RuntimeBenchmarkBeliefPhase0Measurement {
   const diagnostics: string[] = []
-  const runsById = new Map<string, RuntimeBeliefPhase0RunRecord>()
-  const events: RuntimeBeliefHookEvent[] = []
-  let recordWithRuntimeEventsCount = 0
-  let defaultedSplitCount = 0
-
-  for (const record of options.records) {
-    const key = benchmarkRecordKey(record)
-    const splitTag = record.splitTag ?? options.defaultSplitTag ?? DEFAULT_SPLIT_TAG
-    if (record.splitTag === undefined) defaultedSplitCount += 1
-
-    const rawEvents = record.runtimeEvents ?? []
-    if (rawEvents.length === 0) {
-      diagnostics.push(`${key}: no runtimeEvents; no runtime run join can be extracted`)
-      continue
-    }
-    recordWithRuntimeEventsCount += 1
-
-    for (let index = 0; index < rawEvents.length; index += 1) {
-      const event = parseRuntimeHookEvent(rawEvents[index])
-      if (!event) {
-        diagnostics.push(`${key}: runtimeEvents[${index}] is not a RuntimeHookEvent`)
-        continue
-      }
-      events.push(event)
-
-      const scenarioId = event.scenarioId ?? record.instanceId
-      const prior = runsById.get(event.runId)
-      if (!prior) {
-        runsById.set(event.runId, { runId: event.runId, scenarioId, splitTag })
-        continue
-      }
-      if (prior.scenarioId !== scenarioId || prior.splitTag !== splitTag) {
-        diagnostics.push(`${key}: runId ${event.runId} has conflicting scenario/split metadata`)
-      }
-    }
-  }
-
+  const trajectory = projectRuntimeTrajectoryEvidence({
+    records: options.records,
+    defaultSplitTag: options.defaultSplitTag,
+    recordIdOf: runtimeBenchmarkRecordId,
+    scenarioIdOf: runtimeBenchmarkScenarioId,
+  })
   const decisions = options.decisions ?? []
   const labels = options.labels ?? []
   if (decisions.length === 0) {
@@ -126,68 +69,42 @@ export function buildRuntimeBenchmarkBeliefPhase0Measurement(
     )
   }
 
-  const runs = [...runsById.values()]
   const measurement = buildRuntimeBeliefPhase0Measurement({
     ...options,
-    runs,
-    events,
+    runs: trajectory.runs,
+    events: trajectory.events,
     decisions,
     labels,
   })
 
   return {
-    runs,
-    events,
+    runs: trajectory.runs,
+    events: trajectory.events,
     decisions,
     labels,
+    trajectory,
     measurement,
     summary: {
-      recordCount: options.records.length,
-      recordWithRuntimeEventsCount,
-      runtimeRunCount: runs.length,
-      lifecycleEventCount: events.length,
       decisionCount: decisions.length,
       labelCount: labels.length,
-      defaultedSplitCount,
     },
-    diagnostics: [...diagnostics, ...measurement.diagnostics],
+    diagnostics: [...trajectory.diagnostics, ...diagnostics, ...measurement.diagnostics],
   }
 }
 
-function benchmarkRecordKey(record: RuntimeBenchmarkBeliefRecord): string {
-  return `${record.benchmark}:${record.instanceId}:${record.condition}`
+function runtimeBenchmarkRecordId(record: RuntimeBenchmarkTrajectoryRecord): string | undefined {
+  const parts = [
+    nonEmptyString(record.benchmark),
+    nonEmptyString(record.instanceId),
+    nonEmptyString(record.condition),
+  ].filter((part): part is string => part !== undefined)
+  return parts.length > 0 ? parts.join(':') : undefined
 }
 
-function parseRuntimeHookEvent(input: unknown): RuntimeBeliefHookEvent | null {
-  if (!isRecord(input)) return null
-  if (typeof input.id !== 'string' || input.id.length === 0) return null
-  if (typeof input.runId !== 'string' || input.runId.length === 0) return null
-  if (typeof input.target !== 'string' || input.target.length === 0) return null
-  if (typeof input.phase !== 'string' || input.phase.length === 0) return null
-  if (typeof input.timestamp !== 'number' || !Number.isFinite(input.timestamp)) return null
-
-  return {
-    id: input.id,
-    runId: input.runId,
-    scenarioId: stringOrUndefined(input.scenarioId),
-    target: input.target,
-    phase: input.phase,
-    timestamp: input.timestamp,
-    stepIndex: finiteNumberOrUndefined(input.stepIndex),
-    parentId: stringOrUndefined(input.parentId),
-    payload: input.payload,
-    metadata: isRecord(input.metadata) ? { ...input.metadata } : undefined,
-  }
+function runtimeBenchmarkScenarioId(record: RuntimeBenchmarkTrajectoryRecord): string | undefined {
+  return nonEmptyString(record.instanceId)
 }
 
-function isRecord(value: unknown): value is Record<string, unknown> {
-  return typeof value === 'object' && value !== null && !Array.isArray(value)
-}
-
-function stringOrUndefined(value: unknown): string | undefined {
+function nonEmptyString(value: unknown): string | undefined {
   return typeof value === 'string' && value.length > 0 ? value : undefined
 }
-
-function finiteNumberOrUndefined(value: unknown): number | undefined {
-  return typeof value === 'number' && Number.isFinite(value) ? value : undefined
-}
diff --git a/src/index.ts b/src/index.ts
index 4497623..1f7f9ba 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -554,6 +554,18 @@ export {
   DEFAULT_RULES as DEFAULT_FAILURE_RULES,
   FAILURE_CLASSES,
 } from './failure-taxonomy'
+export type {
+  ProjectRuntimeTrajectoryEvidenceOptions,
+  RuntimeTrajectoryEvidenceProjection,
+  RuntimeTrajectoryEvidenceSummary,
+  RuntimeTrajectoryHookEvent,
+  RuntimeTrajectoryRecord,
+  RuntimeTrajectoryRunRecord,
+} from './runtime-trajectory'
+export {
+  parseRuntimeTrajectoryHookEvent,
+  projectRuntimeTrajectoryEvidence,
+} from './runtime-trajectory'
 export type {
   HarnessConfig,
   SandboxDriver,
diff --git a/src/runtime-trajectory.ts b/src/runtime-trajectory.ts
new file mode 100644
index 0000000..f4435f3
--- /dev/null
+++ b/src/runtime-trajectory.ts
@@ -0,0 +1,166 @@
+import type { RunSplitTag } from './run-record'
+
+export interface RuntimeTrajectoryHookEvent { // shape produced by parseRuntimeTrajectoryHookEvent
+  id: string // parser requires a non-empty string
+  runId: string // parser requires a non-empty string; the projection groups runs by this value
+  scenarioId?: string // parser drops empty or non-string values
+  target: string // parser requires a non-empty string
+  phase: string // parser requires a non-empty string
+  timestamp: number // parser requires a finite number; unit is not established here (TODO confirm)
+  stepIndex?: number // parser drops non-number or non-finite values
+  parentId?: string // parser drops empty or non-string values
+  payload?: unknown // passed through by the parser without validation
+  metadata?: Record<string, unknown> // parser shallow-copies plain objects and drops anything else
+}
+
+export interface RuntimeTrajectoryRecord { // one input record for projectRuntimeTrajectoryEvidence
+  id?: string // diagnostic label when `recordIdOf` yields nothing; otherwise `record[<index>]`
+  scenarioId?: string // last-resort scenario id for events that carry none
+  splitTag?: RunSplitTag // when absent, `defaultSplitTag` and then 'search' apply
+  runtimeEvents?: unknown // checked at runtime: only a non-empty array contributes events
+  [key: string]: unknown // other fields are permitted; the projection itself does not read them
+}
+
+export interface RuntimeTrajectoryRunRecord { // one entry per distinct event `runId`
+  runId: string // copied from the first parsed event seen for the run
+  scenarioId?: string // event scenarioId, else `scenarioIdOf`, else record.scenarioId; may be absent
+  splitTag: RunSplitTag // record.splitTag, else `defaultSplitTag`, else 'search'
+}
+
+export interface RuntimeTrajectoryEvidenceSummary { // counts reported by the projection
+  recordCount: number // all input records
+  recordWithRuntimeEventsCount: number // records with a non-empty runtimeEvents array, parseable or not
+  runtimeRunCount: number // distinct runIds among parsed events
+  lifecycleEventCount: number // events that passed parsing
+  defaultedSplitCount: number // records without their own splitTag, including ones with no events
+}
+
+export interface RuntimeTrajectoryEvidenceProjection { // result of projectRuntimeTrajectoryEvidence
+  runs: RuntimeTrajectoryRunRecord[] // ordered by first appearance of each runId
+  events: RuntimeTrajectoryHookEvent[] // parsed events in input order, across all records
+  summary: RuntimeTrajectoryEvidenceSummary
+  diagnostics: string[] // notes about skipped records, rejected events, and metadata conflicts
+}
+
+export interface ProjectRuntimeTrajectoryEvidenceOptions<
+  TRecord extends RuntimeTrajectoryRecord = RuntimeTrajectoryRecord,
+> {
+  records: TRecord[] // input records, processed in order
+  defaultSplitTag?: RunSplitTag // split for records without their own; falls back to 'search'
+  recordIdOf?: (record: TRecord, index: number) => string | undefined // label used in diagnostics
+  scenarioIdOf?: (record: TRecord, index: number) => string | undefined // beats record.scenarioId
+}
+
+const DEFAULT_SPLIT_TAG: RunSplitTag = 'search' // used when neither record nor options name a split
+
+/**
+ * Projects raw trajectory records into run joins and parsed lifecycle hook events.
+ * The first event seen for a `runId` fixes that run's scenario and split; a later
+ * mismatch is diagnosed once per record and run rather than once per event.
+ */
+export function projectRuntimeTrajectoryEvidence<TRecord extends RuntimeTrajectoryRecord>(
+  options: ProjectRuntimeTrajectoryEvidenceOptions<TRecord>,
+): RuntimeTrajectoryEvidenceProjection {
+  const diagnostics: string[] = []
+  const runsById = new Map<string, RuntimeTrajectoryRunRecord>()
+  const events: RuntimeTrajectoryHookEvent[] = []
+  let recordWithRuntimeEventsCount = 0
+  let defaultedSplitCount = 0
+
+  for (const [recordIndex, record] of options.records.entries()) {
+    const key = runtimeTrajectoryRecordKey(record, recordIndex, options.recordIdOf)
+    const splitTag = record.splitTag ?? options.defaultSplitTag ?? DEFAULT_SPLIT_TAG
+    if (record.splitTag === undefined) defaultedSplitCount += 1
+
+    const rawEvents = record.runtimeEvents
+    if (!Array.isArray(rawEvents) || rawEvents.length === 0) {
+      const why = Array.isArray(rawEvents) ? 'no runtimeEvents' : 'runtimeEvents is not an array'
+      diagnostics.push(`${key}: ${why}; no runtime run join can be extracted`)
+      continue
+    }
+    recordWithRuntimeEventsCount += 1
+
+    // The record-level scenario fallback is the same for every event, so resolve it once.
+    const resolved = options.scenarioIdOf?.(record, recordIndex)
+    const recordScenarioId = stringOrUndefined(resolved) ?? stringOrUndefined(record.scenarioId)
+    const conflictedRunIds = new Set<string>()
+
+    for (const [index, rawEvent] of rawEvents.entries()) {
+      const event = parseRuntimeTrajectoryHookEvent(rawEvent)
+      if (!event) {
+        diagnostics.push(`${key}: runtimeEvents[${index}] is not a RuntimeHookEvent`)
+        continue
+      }
+      events.push(event)
+      const scenarioId = event.scenarioId ?? recordScenarioId
+      const prior = runsById.get(event.runId)
+      if (!prior) {
+        runsById.set(event.runId, { runId: event.runId, scenarioId, splitTag })
+        continue
+      }
+      const conflicts = prior.scenarioId !== scenarioId || prior.splitTag !== splitTag
+      if (!conflicts || conflictedRunIds.has(event.runId)) continue
+      conflictedRunIds.add(event.runId)
+      diagnostics.push(`${key}: runId ${event.runId} has conflicting scenario/split metadata`)
+    }
+  }
+
+  return {
+    runs: [...runsById.values()],
+    events,
+    summary: {
+      recordCount: options.records.length,
+      recordWithRuntimeEventsCount,
+      runtimeRunCount: runsById.size,
+      lifecycleEventCount: events.length,
+      defaultedSplitCount,
+    },
+    diagnostics,
+  }
+}
+
+/**
+ * Validates an arbitrary value as a runtime hook event. Yields `null` unless `id`, `runId`,
+ * `target`, and `phase` are non-empty strings and `timestamp` is a finite number. Optional
+ * fields that fail their check come back `undefined`; plain-object metadata is shallow-copied.
+ */
+export function parseRuntimeTrajectoryHookEvent(input: unknown): RuntimeTrajectoryHookEvent | null {
+  if (!isRecord(input)) return null
+  const { id, runId, target, phase, timestamp } = input
+  if (typeof id !== 'string' || id === '') return null
+  if (typeof runId !== 'string' || runId === '') return null
+  if (typeof target !== 'string' || target === '') return null
+  if (typeof phase !== 'string' || phase === '') return null
+  if (typeof timestamp !== 'number' || !Number.isFinite(timestamp)) return null
+
+  const scenarioId = stringOrUndefined(input.scenarioId)
+  const stepIndex = finiteNumberOrUndefined(input.stepIndex)
+  const parentId = stringOrUndefined(input.parentId)
+  const { payload } = input
+  const metadata = isRecord(input.metadata) ? { ...input.metadata } : undefined
+  return { id, runId, scenarioId, target, phase, timestamp, stepIndex, parentId, payload, metadata }
+}
+
+/** Diagnostic label for a record: the resolver's id, else `record.id`, else `record[<index>]`. */
+function runtimeTrajectoryRecordKey<TRecord extends RuntimeTrajectoryRecord>(
+  record: TRecord,
+  index: number,
+  recordIdOf?: (record: TRecord, index: number) => string | undefined,
+): string {
+  const resolved = recordIdOf ? stringOrUndefined(recordIdOf(record, index)) : undefined
+  if (resolved !== undefined) return resolved
+  // Empty or non-string ids count as missing, so the label is never blank.
+  return stringOrUndefined(record.id) ?? `record[${index}]`
+}
+
+function isRecord(value: unknown): value is Record<string, unknown> { // null and arrays excluded
+  return value !== null && typeof value === 'object' && !Array.isArray(value)
+}
+
+function stringOrUndefined(value: unknown): string | undefined { // empty strings count as missing
+  return typeof value !== 'string' || value === '' ? undefined : value
+}
+
+function finiteNumberOrUndefined(value: unknown): number | undefined { // NaN/Infinity are dropped
+  return typeof value !== 'number' || !Number.isFinite(value) ? undefined : value
+}
diff --git a/tests/belief-state/runtime-benchmark-corpus.test.ts b/tests/belief-state/runtime-benchmark-corpus.test.ts
index a8aeeeb..caee096 100644
--- a/tests/belief-state/runtime-benchmark-corpus.test.ts
+++ b/tests/belief-state/runtime-benchmark-corpus.test.ts
@@ -31,14 +31,16 @@ describe('runtime benchmark corpus belief-state projection', () => {
       { runId: 'commit0:task-1:1', scenarioId: 'task-1', splitTag: 'search' },
     ])
     expect(report.events).toHaveLength(6)
-    expect(report.summary).toMatchObject({
+    expect(report.trajectory.summary).toEqual({
       recordCount: 2,
       recordWithRuntimeEventsCount: 2,
       runtimeRunCount: 2,
       lifecycleEventCount: 6,
+      defaultedSplitCount: 2,
+    })
+    expect(report.summary).toEqual({
       decisionCount: 0,
       labelCount: 0,
-      defaultedSplitCount: 2,
     })
     expect(report.measurement.points).toEqual([])
     expect(report.measurement.summary.packetStatus).toBe('blocked')
diff --git a/tests/runtime-trajectory.test.ts b/tests/runtime-trajectory.test.ts
new file mode 100644
index 0000000..e9b2c0e
--- /dev/null
+++ b/tests/runtime-trajectory.test.ts
@@ -0,0 +1,127 @@
+import { describe, expect, it } from 'vitest'
+import {
+  parseRuntimeTrajectoryHookEvent,
+  projectRuntimeTrajectoryEvidence,
+} from '../src/runtime-trajectory'
+
+describe('runtime trajectory evidence projection', () => { // covers the projection and the parser
+  it('extracts run joins and lifecycle events from benchmark records', () => {
+    const projection = projectRuntimeTrajectoryEvidence({
+      records: [
+        {
+          id: 'commit0:task-1:0',
+          scenarioId: 'task-1', // record-level fallback: the events below carry no scenarioId
+          runtimeEvents: runtimeEvents('commit0:task-1:0', undefined, 0),
+        },
+        {
+          id: 'commit0:task-1:1',
+          scenarioId: 'task-1',
+          runtimeEvents: runtimeEvents('commit0:task-1:1', undefined, 1),
+        },
+      ],
+      defaultSplitTag: 'holdout', // neither record sets splitTag, so both runs take this value
+    })
+
+    expect(projection.runs).toEqual([
+      { runId: 'commit0:task-1:0', scenarioId: 'task-1', splitTag: 'holdout' },
+      { runId: 'commit0:task-1:1', scenarioId: 'task-1', splitTag: 'holdout' },
+    ])
+    expect(projection.events).toHaveLength(6)
+    expect(projection.summary).toEqual({
+      recordCount: 2,
+      recordWithRuntimeEventsCount: 2,
+      runtimeRunCount: 2,
+      lifecycleEventCount: 6,
+      defaultedSplitCount: 2,
+    })
+    expect(projection.diagnostics).toEqual([])
+  })
+
+  it('diagnoses missing and malformed runtime event arrays without inventing runs', () => {
+    const projection = projectRuntimeTrajectoryEvidence({
+      records: [
+        {
+          id: 'case-empty',
+          runtimeEvents: [],
+        },
+        {
+          id: 'case-not-array',
+          runtimeEvents: 'bad', // not an array
+        },
+        {
+          id: 'case-bad',
+          runtimeEvents: [{ id: 'bad' }], // lacks runId, target, phase, and timestamp
+        },
+      ],
+    })
+
+    expect(projection.runs).toEqual([])
+    expect(projection.events).toEqual([])
+    expect(projection.diagnostics).toEqual([
+      'case-empty: no runtimeEvents; no runtime run join can be extracted',
+      'case-not-array: runtimeEvents is not an array; no runtime run join can be extracted',
+      'case-bad: runtimeEvents[0] is not a RuntimeHookEvent',
+    ])
+  })
+
+  it('parses only structurally complete runtime hook events', () => {
+    expect(parseRuntimeTrajectoryHookEvent({ id: 'bad' })).toBeNull()
+    expect(
+      parseRuntimeTrajectoryHookEvent({
+        id: 'event-1',
+        runId: 'run-1',
+        target: 'agent.turn',
+        phase: 'after',
+        timestamp: 1,
+        stepIndex: 0,
+        payload: { eventCount: 1 },
+        metadata: { producer: 'openSandboxRun' },
+      }),
+    ).toEqual({ // toEqual disregards the undefined scenarioId/parentId keys the parser emits
+      id: 'event-1',
+      runId: 'run-1',
+      target: 'agent.turn',
+      phase: 'after',
+      timestamp: 1,
+      stepIndex: 0,
+      payload: { eventCount: 1 },
+      metadata: { producer: 'openSandboxRun' },
+    })
+  })
+})
+
+/** Fixture: the run-start, turn-start, and turn-end hook events of a single-turn run. */
+function runtimeEvents(runId: string, scenarioId: string | undefined, stepIndex: number) {
+  const runStarted = {
+    id: `${runId}:agent.run:before`,
+    runId,
+    scenarioId,
+    target: 'agent.run',
+    phase: 'before',
+    timestamp: 1_788_624_000_000,
+    metadata: { producer: 'openSandboxRun' },
+  }
+  const turnStarted = {
+    id: `${runId}:agent.turn:before:${stepIndex}`,
+    runId,
+    scenarioId,
+    target: 'agent.turn',
+    phase: 'before',
+    timestamp: 1_788_624_000_001,
+    stepIndex,
+    payload: { promptHash: 'abc123' },
+    metadata: { producer: 'openSandboxRun' },
+  }
+  const turnEnded = {
+    id: `${runId}:agent.turn:after:${stepIndex}`,
+    runId,
+    scenarioId,
+    target: 'agent.turn',
+    phase: 'after',
+    timestamp: 1_788_624_000_002,
+    stepIndex,
+    payload: { eventCount: 1, eventTypes: { result: 1 } },
+    metadata: { producer: 'openSandboxRun' },
+  }
+  return [runStarted, turnStarted, turnEnded]
+}