Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/belief-state/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export * from './ope'
export * from './phase0-measurement'
export * from './report'
export * from './research-evidence'
export * from './runtime-benchmark-corpus'
export * from './runtime-hooks'
export * from './selective'
export * from './shadow-probe'
Expand Down
110 changes: 110 additions & 0 deletions src/belief-state/runtime-benchmark-corpus.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import {
type ProjectRuntimeTrajectoryEvidenceOptions,
projectRuntimeTrajectoryEvidence,
type RuntimeTrajectoryEvidenceProjection,
type RuntimeTrajectoryRecord,
} from '../runtime-trajectory'
import {
type BuildRuntimeBeliefPhase0MeasurementOptions,
buildRuntimeBeliefPhase0Measurement,
type RuntimeBeliefDecisionLabel,
type RuntimeBeliefPhase0Measurement,
type RuntimeBeliefPhase0RunRecord,
} from './phase0-measurement'
import type { RuntimeBeliefDecisionPoint, RuntimeBeliefHookEvent } from './runtime-hooks'

/**
 * A trajectory record widened with the optional benchmark fields that the
 * record-id and scenario-id helpers in this module read. Every field is typed
 * `unknown`; the helpers only use a field when it is a non-empty string.
 */
type RuntimeBenchmarkTrajectoryRecord = RuntimeTrajectoryRecord & {
// First segment of the ':'-joined record id.
benchmark?: unknown
// Third segment of the ':'-joined record id.
condition?: unknown
// Second segment of the ':'-joined record id; also used alone as the scenario id.
instanceId?: unknown
}

/**
 * Options for `buildRuntimeBenchmarkBeliefPhase0Measurement`.
 *
 * Inherits every field of `BuildRuntimeBeliefPhase0MeasurementOptions` except
 * `runs`, `events`, `decisions` and `labels`: `runs` and `events` are derived
 * from `records`, while `decisions` and `labels` are redeclared here as
 * optional.
 */
export interface BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions
extends Omit<
BuildRuntimeBeliefPhase0MeasurementOptions,
'runs' | 'events' | 'decisions' | 'labels'
> {
// Benchmark records handed to projectRuntimeTrajectoryEvidence.
records: RuntimeBenchmarkTrajectoryRecord[]
// Treated as an empty list when omitted.
decisions?: RuntimeBeliefDecisionPoint[]
// Forwarded unchanged to the trajectory projection.
defaultSplitTag?: ProjectRuntimeTrajectoryEvidenceOptions['defaultSplitTag']
// Treated as an empty list when omitted.
labels?: RuntimeBeliefDecisionLabel[]
}

/**
 * Counts reported by `buildRuntimeBenchmarkBeliefPhase0Measurement` about the
 * decision inputs it was given.
 */
export interface RuntimeBenchmarkBeliefPhase0Summary {
// Length of the decisions list after defaulting to empty.
decisionCount: number
// Length of the labels list after defaulting to empty.
labelCount: number
}

/**
 * Result of `buildRuntimeBenchmarkBeliefPhase0Measurement`: the inputs that were
 * fed to the phase-0 measurement together with the measurement itself.
 */
export interface RuntimeBenchmarkBeliefPhase0Measurement {
// Runs taken from the trajectory projection.
runs: RuntimeBeliefPhase0RunRecord[]
// Events taken from the trajectory projection.
events: RuntimeBeliefHookEvent[]
// The caller's decisions, or an empty list when none were supplied.
decisions: RuntimeBeliefDecisionPoint[]
// The caller's labels, or an empty list when none were supplied.
labels: RuntimeBeliefDecisionLabel[]
// Full projection returned by projectRuntimeTrajectoryEvidence.
trajectory: RuntimeTrajectoryEvidenceProjection
// Value returned by buildRuntimeBeliefPhase0Measurement.
measurement: RuntimeBeliefPhase0Measurement
summary: RuntimeBenchmarkBeliefPhase0Summary
// Projection diagnostics, then this builder's own, then the measurement's.
diagnostics: string[]
}

/**
 * Projects benchmark trajectory records into runs and lifecycle events, then
 * feeds them, together with the caller's decisions and labels, to
 * `buildRuntimeBeliefPhase0Measurement`.
 *
 * @param options - Benchmark records plus the pass-through measurement options.
 *   Missing `decisions` / `labels` are treated as empty lists.
 * @returns The projected runs and events, the effective decisions and labels,
 *   the raw projection, the measurement, input counts, and all diagnostics in
 *   the order: projection, this function, measurement.
 */
export function buildRuntimeBenchmarkBeliefPhase0Measurement(
  options: BuildRuntimeBenchmarkBeliefPhase0MeasurementOptions,
): RuntimeBenchmarkBeliefPhase0Measurement {
  const ownDiagnostics: string[] = []

  const trajectory = projectRuntimeTrajectoryEvidence({
    records: options.records,
    defaultSplitTag: options.defaultSplitTag,
    recordIdOf: runtimeBenchmarkRecordId,
    scenarioIdOf: runtimeBenchmarkScenarioId,
  })
  const { runs, events } = trajectory

  const decisions = options.decisions ?? []
  const labels = options.labels ?? []

  // The missing-labels warning is only meaningful once there are decisions to
  // join against, so the two warnings are mutually exclusive.
  if (decisions.length === 0) {
    ownDiagnostics.push(
      'no runtime decision points supplied; benchmark lifecycle events alone cannot produce belief decision rows',
    )
  } else if (labels.length === 0) {
    ownDiagnostics.push(
      'no decision labels supplied; observed action/outcome joins will be incomplete',
    )
  }

  // Explicit keys come after the spread so they override same-named option fields.
  const measurement = buildRuntimeBeliefPhase0Measurement({
    ...options,
    runs,
    events,
    decisions,
    labels,
  })

  const summary: RuntimeBenchmarkBeliefPhase0Summary = {
    decisionCount: decisions.length,
    labelCount: labels.length,
  }

  return {
    runs,
    events,
    decisions,
    labels,
    trajectory,
    measurement,
    summary,
    diagnostics: [...trajectory.diagnostics, ...ownDiagnostics, ...measurement.diagnostics],
  }
}

/**
 * Builds a record id by joining the record's `benchmark`, `instanceId` and
 * `condition` with ':', in that order, skipping any that are not non-empty
 * strings.
 *
 * @returns The joined id, or `undefined` when none of the three fields is usable.
 */
function runtimeBenchmarkRecordId(record: RuntimeBenchmarkTrajectoryRecord): string | undefined {
  const segments: string[] = []
  for (const candidate of [record.benchmark, record.instanceId, record.condition]) {
    const segment = nonEmptyString(candidate)
    if (segment !== undefined) segments.push(segment)
  }
  return segments.length === 0 ? undefined : segments.join(':')
}

/**
 * Uses the record's `instanceId` as its scenario id.
 *
 * @returns `instanceId` when it is a non-empty string, otherwise `undefined`.
 */
function runtimeBenchmarkScenarioId(record: RuntimeBenchmarkTrajectoryRecord): string | undefined {
  const { instanceId } = record
  return nonEmptyString(instanceId)
}

/**
 * Narrows an arbitrary value to a usable string.
 *
 * @returns `value` itself when it is a primitive string with at least one
 *   character; `undefined` for everything else, including `''`.
 */
function nonEmptyString(value: unknown): string | undefined {
  if (typeof value !== 'string') return undefined
  return value === '' ? undefined : value
}
12 changes: 12 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,18 @@ export {
DEFAULT_RULES as DEFAULT_FAILURE_RULES,
FAILURE_CLASSES,
} from './failure-taxonomy'
export type {
ProjectRuntimeTrajectoryEvidenceOptions,
RuntimeTrajectoryEvidenceProjection,
RuntimeTrajectoryEvidenceSummary,
RuntimeTrajectoryHookEvent,
RuntimeTrajectoryRecord,
RuntimeTrajectoryRunRecord,
} from './runtime-trajectory'
export {
parseRuntimeTrajectoryHookEvent,
projectRuntimeTrajectoryEvidence,
} from './runtime-trajectory'
export type {
HarnessConfig,
SandboxDriver,
Expand Down
166 changes: 166 additions & 0 deletions src/runtime-trajectory.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import type { RunSplitTag } from './run-record'

/**
 * A validated runtime hook event, as returned by
 * `parseRuntimeTrajectoryHookEvent`.
 *
 * The parser only returns a value when `id`, `runId`, `target` and `phase` are
 * non-empty strings and `timestamp` is a finite number. Optional fields are
 * `undefined` when the raw input did not hold a value of the expected type.
 */
export interface RuntimeTrajectoryHookEvent {
id: string
// Key used to group events into run records during projection.
runId: string
scenarioId?: string
target: string
phase: string
// Finite number; the unit is not established in this file.
timestamp: number
stepIndex?: number
parentId?: string
// Passed through from the raw input without validation.
payload?: unknown
// Shallow copy of the raw `metadata` object when one was present.
metadata?: Record<string, unknown>
}

/**
 * Input record consumed by `projectRuntimeTrajectoryEvidence`. Arbitrary extra
 * keys are allowed through the index signature.
 */
export interface RuntimeTrajectoryRecord {
// Used in diagnostics when the `recordIdOf` callback yields nothing usable.
id?: string
// Last-resort scenario id, after the event's own and the `scenarioIdOf` callback.
scenarioId?: string
// When absent, the projection falls back to its default split tag.
splitTag?: RunSplitTag
// Expected to be a non-empty array of raw hook events; checked at runtime.
runtimeEvents?: unknown
[key: string]: unknown
}

/**
 * One entry per distinct `runId` seen by `projectRuntimeTrajectoryEvidence`.
 * `scenarioId` and `splitTag` come from the first event observed for that run.
 */
export interface RuntimeTrajectoryRunRecord {
runId: string
scenarioId?: string
splitTag: RunSplitTag
}

/** Counters produced by `projectRuntimeTrajectoryEvidence`. */
export interface RuntimeTrajectoryEvidenceSummary {
// Number of input records.
recordCount: number
// Records whose `runtimeEvents` was a non-empty array.
recordWithRuntimeEventsCount: number
// Number of distinct run ids among the parsed events.
runtimeRunCount: number
// Number of events that parsed successfully.
lifecycleEventCount: number
// Records with `splitTag === undefined`; counted even when the record is
// later skipped for having no usable runtime events.
defaultedSplitCount: number
}

/** Return value of `projectRuntimeTrajectoryEvidence`. */
export interface RuntimeTrajectoryEvidenceProjection {
// One record per distinct run id, in first-seen order.
runs: RuntimeTrajectoryRunRecord[]
// Every successfully parsed event, in input order.
events: RuntimeTrajectoryHookEvent[]
summary: RuntimeTrajectoryEvidenceSummary
// Human-readable notes about skipped records, unparseable events and
// conflicting run metadata.
diagnostics: string[]
}

/**
 * Options for `projectRuntimeTrajectoryEvidence`.
 *
 * @typeParam TRecord - Concrete record type; lets the callbacks read fields
 *   beyond those declared on `RuntimeTrajectoryRecord`.
 */
export interface ProjectRuntimeTrajectoryEvidenceOptions<
TRecord extends RuntimeTrajectoryRecord = RuntimeTrajectoryRecord,
> {
records: TRecord[]
// Used for records without their own `splitTag`, ahead of the module default.
defaultSplitTag?: RunSplitTag
// Preferred source of the record label used in diagnostics.
recordIdOf?: (record: TRecord, index: number) => string | undefined
// Consulted for an event's scenario id when the event itself carries none.
scenarioIdOf?: (record: TRecord, index: number) => string | undefined
}

// Final fallback split tag, used when neither the record nor the caller's
// `defaultSplitTag` option supplies one.
const DEFAULT_SPLIT_TAG: RunSplitTag = 'search'

/**
 * Flattens the `runtimeEvents` of each record into a list of validated hook
 * events and a list of run records, one per distinct `runId`.
 *
 * Records whose `runtimeEvents` is not an array, or is an empty array, are
 * skipped with a diagnostic. Individual events that fail
 * `parseRuntimeTrajectoryHookEvent` are skipped with a diagnostic. The first
 * event seen for a run id fixes that run's scenario id and split tag; a later
 * event resolving to different values adds a diagnostic and leaves the run
 * record unchanged.
 *
 * @param options - Records to project, plus an optional default split tag and
 *   optional record-id / scenario-id callbacks.
 * @returns The runs, events, summary counters and diagnostics.
 */
export function projectRuntimeTrajectoryEvidence<TRecord extends RuntimeTrajectoryRecord>(
  options: ProjectRuntimeTrajectoryEvidenceOptions<TRecord>,
): RuntimeTrajectoryEvidenceProjection {
  const diagnostics: string[] = []
  const events: RuntimeTrajectoryHookEvent[] = []
  const runsById = new Map<string, RuntimeTrajectoryRunRecord>()
  const tally = { withEvents: 0, defaultedSplit: 0 }

  for (const [recordIndex, record] of options.records.entries()) {
    const label = runtimeTrajectoryRecordKey(record, recordIndex, options.recordIdOf)
    const splitTag = record.splitTag ?? options.defaultSplitTag ?? DEFAULT_SPLIT_TAG
    // Counted before the runtimeEvents checks, so skipped records count too.
    if (record.splitTag === undefined) tally.defaultedSplit += 1

    const rawEvents = record.runtimeEvents
    if (!Array.isArray(rawEvents)) {
      diagnostics.push(
        `${label}: runtimeEvents is not an array; no runtime run join can be extracted`,
      )
      continue
    }
    if (rawEvents.length === 0) {
      diagnostics.push(`${label}: no runtimeEvents; no runtime run join can be extracted`)
      continue
    }
    tally.withEvents += 1

    for (const [eventIndex, rawEvent] of rawEvents.entries()) {
      const event = parseRuntimeTrajectoryHookEvent(rawEvent)
      if (!event) {
        diagnostics.push(`${label}: runtimeEvents[${eventIndex}] is not a RuntimeHookEvent`)
        continue
      }
      events.push(event)

      // Precedence: the event's own scenario id, then the callback, then the
      // record's field. Later sources are only evaluated when needed.
      const scenarioId =
        event.scenarioId ??
        stringOrUndefined(options.scenarioIdOf?.(record, recordIndex)) ??
        stringOrUndefined(record.scenarioId)

      const known = runsById.get(event.runId)
      if (known === undefined) {
        runsById.set(event.runId, { runId: event.runId, scenarioId, splitTag })
      } else if (known.scenarioId !== scenarioId || known.splitTag !== splitTag) {
        diagnostics.push(`${label}: runId ${event.runId} has conflicting scenario/split metadata`)
      }
    }
  }

  const runs = [...runsById.values()]
  const summary: RuntimeTrajectoryEvidenceSummary = {
    recordCount: options.records.length,
    recordWithRuntimeEventsCount: tally.withEvents,
    runtimeRunCount: runs.length,
    lifecycleEventCount: events.length,
    defaultedSplitCount: tally.defaultedSplit,
  }
  return { runs, events, summary, diagnostics }
}

/**
 * Validates an untyped value as a runtime hook event.
 *
 * @param input - Any value, typically one element of a record's `runtimeEvents`.
 * @returns A new event object, or `null` unless `input` is a non-array object
 *   whose `id`, `runId`, `target` and `phase` are non-empty strings and whose
 *   `timestamp` is a finite number. `scenarioId` and `parentId` are kept only
 *   when they are non-empty strings, `stepIndex` only when it is a finite
 *   number, `metadata` is shallow-copied when it is a non-array object, and
 *   `payload` is passed through as-is.
 */
export function parseRuntimeTrajectoryHookEvent(input: unknown): RuntimeTrajectoryHookEvent | null {
  if (!isRecord(input)) return null

  const isFilledString = (value: unknown): value is string =>
    typeof value === 'string' && value.length > 0

  const { id, runId, target, phase, timestamp } = input
  if (!isFilledString(id)) return null
  if (!isFilledString(runId)) return null
  if (!isFilledString(target)) return null
  if (!isFilledString(phase)) return null
  if (typeof timestamp !== 'number' || !Number.isFinite(timestamp)) return null

  const rawMetadata = input.metadata
  const event: RuntimeTrajectoryHookEvent = {
    id,
    runId,
    scenarioId: stringOrUndefined(input.scenarioId),
    target,
    phase,
    timestamp,
    stepIndex: finiteNumberOrUndefined(input.stepIndex),
    parentId: stringOrUndefined(input.parentId),
    payload: input.payload,
    // Copy so later changes to the raw object's top-level keys do not leak in.
    metadata: isRecord(rawMetadata) ? { ...rawMetadata } : undefined,
  }
  return event
}

/**
 * Picks the label used for a record in diagnostics.
 *
 * @param record - The record being labelled.
 * @param index - Position of the record in the input list.
 * @param recordIdOf - Optional caller-supplied id function, tried first.
 * @returns The callback's result when it is a non-empty string, else the
 *   record's `id` when that is a non-empty string, else `record[<index>]`.
 */
function runtimeTrajectoryRecordKey<TRecord extends RuntimeTrajectoryRecord>(
  record: TRecord,
  index: number,
  recordIdOf?: (record: TRecord, index: number) => string | undefined,
): string {
  const fromCallback = stringOrUndefined(recordIdOf?.(record, index))
  if (fromCallback !== undefined) return fromCallback

  const fromRecord = stringOrUndefined(record.id)
  if (fromRecord !== undefined) return fromRecord

  return `record[${index}]`
}

/**
 * Type guard for object values that can be read by string key.
 *
 * @returns `true` for any non-null value whose `typeof` is `'object'` and which
 *   is not an array; `false` otherwise.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== 'object') return false
  return !Array.isArray(value)
}

/**
 * Keeps a value only when it is a primitive string with at least one character.
 *
 * @returns The string itself, or `undefined` for empty strings and non-strings.
 */
function stringOrUndefined(value: unknown): string | undefined {
  if (typeof value === 'string' && value !== '') {
    return value
  }
  return undefined
}

/**
 * Keeps a value only when it is a finite primitive number.
 *
 * @returns The number itself, or `undefined` for `NaN`, the infinities and
 *   non-numbers.
 */
function finiteNumberOrUndefined(value: unknown): number | undefined {
  if (typeof value !== 'number') return undefined
  return Number.isFinite(value) ? value : undefined
}
Loading
Loading