Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 40 additions & 17 deletions apps/cli/src/commands/results/remote.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ import {
directPushResults,
directorySizeBytes,
getResultsRepoStatus,
listGitRuns,
loadConfig,
normalizeResultsConfig,
resolveResultsRepoRunsDir,
syncResultsRepo,
} from '@agentv/core';
Expand Down Expand Up @@ -59,15 +61,6 @@ function getStatusMessage(error: unknown): string {
return error instanceof Error ? error.message : String(error);
}

/**
 * Produces a fully-populated results configuration from a partial one.
 *
 * `repo` and `path` are passed through untouched. `auto_push` is enabled only
 * when it is explicitly `true`. `branch_prefix` is trimmed and falls back to
 * `'eval-results'` when it is missing or blank.
 *
 * @param config - The results configuration as loaded.
 * @returns A copy of the configuration with every field set.
 */
function normalizeResultsConfig(config: ResultsConfig): Required<ResultsConfig> {
  // Anything other than a literal `true` (undefined, false, ...) disables auto-push.
  const autoPush = config.auto_push === true;

  // An empty string after trimming is falsy, so blank prefixes use the default too.
  const trimmedPrefix = config.branch_prefix?.trim();
  const branchPrefix = trimmedPrefix ? trimmedPrefix : 'eval-results';

  return {
    repo: config.repo,
    path: config.path,
    auto_push: autoPush,
    branch_prefix: branchPrefix,
  };
}

function statusForResult(result: EvaluationResult): 'PASS' | 'FAIL' | 'ERROR' {
if (result.executionStatus === 'execution_error' || result.error) {
return 'ERROR';
Expand Down Expand Up @@ -185,15 +178,45 @@ export async function listMergedResultFiles(
};
}

const remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
(meta) =>
({
...meta,
filename: encodeRemoteRunId(meta.filename),
raw_filename: meta.filename,
let remoteRuns: SourcedResultFileMeta[] = [];
if (config.mode === 'github') {
try {
const gitRuns = await listGitRuns(resolveResultsRepoRunsDir(config));
remoteRuns = gitRuns.map((r) => ({
filename: encodeRemoteRunId(r.run_id),
raw_filename: r.run_id,
source: 'remote' as const,
}) satisfies SourcedResultFileMeta,
);
path: r.manifest_path,
displayName: r.display_name,
timestamp: r.timestamp,
testCount: r.test_count,
passRate: r.pass_rate || 0,
avgScore: r.avg_score || 0,
sizeBytes: r.size_bytes || 0,
}));
} catch (error) {
console.error('git-native listing failed, falling back', error);
remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
(meta) =>
({
...meta,
filename: encodeRemoteRunId(meta.filename),
raw_filename: meta.filename,
source: 'remote' as const,
}) satisfies SourcedResultFileMeta,
);
}
} else {
remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
(meta) =>
({
...meta,
filename: encodeRemoteRunId(meta.filename),
raw_filename: meta.filename,
source: 'remote' as const,
}) satisfies SourcedResultFileMeta,
);
}

const merged = [...localRuns, ...remoteRuns].sort((a, b) =>
b.timestamp.localeCompare(a.timestamp),
Expand Down
58 changes: 34 additions & 24 deletions apps/cli/test/commands/results/serve.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -501,18 +501,15 @@ describe('serve app', () => {
writeFileSync(
path.join(tempDir, '.agentv', 'config.yaml'),
`results:
mode: github
repo: EntityProcess/agentv-evals
path: autopilot-dev/runs
`,
);

const remoteRunDir = path.join(
process.env.AGENTV_HOME,
'cache',
'results-repo',
'results',
'EntityProcess-agentv-evals',
'repo',
'autopilot-dev',
'runs',
'default',
'2026-03-26T10-00-00-000Z',
Expand Down Expand Up @@ -581,29 +578,42 @@ describe('serve app', () => {

describe('GET /api/remote/status', () => {
it('reports configured remote status with graceful local-only fallback', async () => {
mkdirSync(path.join(tempDir, '.agentv'), { recursive: true });
writeFileSync(
path.join(tempDir, '.agentv', 'config.yaml'),
`results:
const previousHome = process.env.AGENTV_HOME;
process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home-status');

try {
mkdirSync(path.join(tempDir, '.agentv'), { recursive: true });
writeFileSync(
path.join(tempDir, '.agentv', 'config.yaml'),
`results:
mode: github
repo: EntityProcess/agentv-evals
path: autopilot-dev/runs
`,
);
);

const app = createApp([], tempDir, tempDir, undefined, { studioDir });
const res = await app.request('/api/remote/status');
const app = createApp([], tempDir, tempDir, undefined, { studioDir });
const res = await app.request('/api/remote/status');

expect(res.status).toBe(200);
const data = (await res.json()) as {
configured: boolean;
available: boolean;
repo: string;
path: string;
};
expect(data.configured).toBe(true);
expect(data.available).toBe(false);
expect(data.repo).toBe('EntityProcess/agentv-evals');
expect(data.path).toBe('autopilot-dev/runs');
expect(res.status).toBe(200);
const data = (await res.json()) as {
configured: boolean;
available: boolean;
repo: string;
path: string;
};
expect(data.configured).toBe(true);
expect(data.available).toBe(false);
expect(data.repo).toBe('EntityProcess/agentv-evals');
expect(data.path).toBe(
path.join(tempDir, 'agentv-home-status', 'results', 'EntityProcess-agentv-evals'),
);
} finally {
if (previousHome === undefined) {
process.env.AGENTV_HOME = undefined;
} else {
process.env.AGENTV_HOME = previousHome;
}
}
});
});

Expand Down
2 changes: 1 addition & 1 deletion apps/studio/src/lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ export interface RemoteStatusResponse {
configured: boolean;
available: boolean;
repo?: string;
cache_dir?: string;
local_dir?: string;
path?: string;
auto_push?: boolean;
branch_prefix?: string;
Expand Down
42 changes: 42 additions & 0 deletions docs/plans/git-native-results-goal.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Goal: Complete git-native-results PR (#1261)

## Objective
Implement the git-native results storage architecture and land PR #1261 as a clean, tested, manually verified change.

## Success Criteria
- All implementation passes completed per design doc
- Full test suite green (unit + integration + existing 1782 core + 553 CLI tests)
- Manual E2E test using agent-browser against a real (test-scoped) results repo
- Red/green UAT documented before review
- No regressions

## Work Location
- Worktree: `agentv.worktrees/git-native-results/`
- Branch: `feat/git-native-results`

## Key Decisions Confirmed
- Dedicated results repo model → write directly to `main` of results repo (no separate branch needed)
- Use raw `git` subprocess (not go-git) for ls-tree / cat-file path
- Follow exact order in design doc

## Non-Goals
- P5 zero-config mode
- Caching
- Multi-mode support (any `mode` other than `github`)

## Verification
1. Automated tests
2. Manual agent-browser E2E in Studio
3. Performance check against a results repo with 500+ runs
4. Lint + typecheck clean

Owner: Agent + Chris T

## Latest Progress (2026-05-21)

- Docker ownership fix implemented in docker-compose.yml (`user: "${UID}:${GID}"`)
- Write path (`commitAndPushRun`) largely complete via parallel work
- Read path functional but needs hardening
- Bun dependencies reinstalled in worktree
- GitHub Actions CI is currently failing on dependency resolution
- Next focus: Fix CI, add tests, implement pagination
162 changes: 162 additions & 0 deletions docs/plans/git-native-results.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# Git-native results storage

**Status**: design approved, implementation pending
**Tracks**: issue #1259 (supersedes closed PR #1260)
**Scope**: single PR; breaking changes accepted (no production users yet)

---

## Why

`/api/runs` polls every 5s and does O(N) per-manifest reads (`readdir` + `statSync` + `loadResultFile` per run). At hundreds of runs it stalls; at thousands it falls over. The original PR #1260 tried to fix this with an append-only `index/runs.jsonl` file, which works but adds a second source of truth that can drift, grows forever, and requires a sha-amend dance plus a `reindex` migration command.

After comparing with **entireio** (single-ref + git tree as index) and **skillfully** (explicit `sourceMode = github_import` pattern with PR-based writes for human-curated content), the cleaner architecture treats **git as the canonical store**, not as a transport layer.

## Core idea

The git tree IS the index. `git ls-tree -r origin/main -- runs/` lists every run path without reading any blob. `git cat-file --batch` reads existing `benchmark.json` blobs in one subprocess call. No separate index file. No drift. Natural pruning when runs are deleted. With a `--filter=blob:none` clone, blobs are fetched on demand: listing only needs the `benchmark.json` blobs, and the remaining per-run blobs are fetched lazily when a user opens the detail view.

## Architecture

### Storage

- The configured remote `results.repo` is **the** storage location.
- The local clone at `results.path` (filesystem path) is the working copy.
- No more `.agentv/results/runs/` writes in the source project. No more gitignored results.

```yaml
# config.yaml
results:
  mode: github                    # required, only valid value today
  repo: myorg/eval-results        # remote
  path: ~/data/agentv-results     # optional; default ~/.agentv/results/<slug>/
  auto_push: true                 # default
```

`mode: github` is explicit (it is the extension point for future modes and mirrors skillfully's `sourceMode` pattern). `path` is the **local filesystem location** of the clone (breaking change — it was previously the subdirectory within the remote repo). Runs always land at `<clone>/runs/<experiment>/<timestamp>/`, regardless of where `path` points.

### Writes

Every `agentv eval` is one atomic operation:

1. `git fetch origin --prune` (refresh; no checkout)
2. Write artifacts into working tree at `<clone>/runs/<experiment>/<timestamp>/`
3. `git add runs/<experiment>/<timestamp>/`
4. `git commit -m "<title>" -m "Agentv-Run: <run-id>"` (P6 trailer baked in)
5. If `auto_push`: `git push origin HEAD:main` with retry-on-non-fast-forward (rebase + retry)

Each run is one commit. Files are unique to that run, so rebases never content-conflict.

### Reads

**Listing** (replaces `listResultFilesFromRunsDir`):
- `git ls-tree -r origin/main -- runs/` → filter for `benchmark.json` paths
- `git cat-file --batch` → read those blobs in one subprocess
- Derive `run_id` from path (same logic as current `buildRunId`)
- Sort by timestamp descending
- Apply cursor pagination

**Detail view file reads** (replaces `readFileSync(meta.path)`):
- Committed: `git cat-file -p origin/main:runs/.../<file>`
- In-progress (post-write, pre-commit): `readFileSync(<path>)` from working tree

**In-progress detection**: between artifact write and commit, files exist only in the working tree. `git status --porcelain runs/` surfaces them; merge with the committed list for the Studio runs view.

### Sync

- `agentv eval` does its own fetch + push (no separate sync needed for own work)
- `agentv results sync` = `git fetch origin --prune` (refresh view of others' work)
- No more `git checkout`, no more `git pull --ff-only`
- Studio polls `/api/runs` which reads from git object DB (already current after the most recent fetch)

### Pagination

`/api/runs?limit=50&cursor=<run_id>`:
- Cursor is the `run_id` of the last item from the previous page
- Server reads the full sorted list (one `git ls-tree` + one `git cat-file --batch`), finds the cursor, slices `[cursorIdx+1 : cursorIdx+1+limit]`, returns `next_cursor` if more remain
- Studio uses `useInfiniteQuery` + an `IntersectionObserver` sentinel row

## Implementation passes

The PR is large but bounded. Suggested order within the single PR:

### Pass 1 — config + paths

- Update `ResultsConfig` schema: require `mode: github`, repurpose `path` as filesystem location
- Rename `getResultsRepoCachePaths` → `getResultsRepoLocalPaths`
- Rename `cache_dir` → `local_dir` in `ResultsRepoStatus` (wire format too)
- Add config validation: refuse old-style `path: runs` values with migration message

### Pass 2 — write path

- Replace `.agentv/results/runs/` writes with direct writes to `<results.path>/runs/...`
- `directPushResults` becomes the only write path (rename to `commitAndPushRun` since it's no longer just a "direct push" mode)
- Add `Agentv-Run:` commit trailer
- Drop `git checkout` from `updateCacheRepo` — only `git fetch --prune` remains
- Rename `updateCacheRepo` → `fetchResultsRepo`

### Pass 3 — read path

- New `listResultFilesFromGitTree(repoDir, baseBranch)` using `git ls-tree` + `git cat-file --batch` on `benchmark.json` blobs
- Replace `listResultFilesFromRunsDir` calls for remote runs with the new function
- Detail view reads in `serve.ts` use `git cat-file -p <ref>:<path>` for committed runs
- Working-tree readdir for in-progress runs (detected via `git status --porcelain`)
- Drop `loadLightweightResults` enrichment loop in `handleRuns` — `benchmark.json` already has `target`, `experiment`, and `pass_rate`

### Pass 4 — pagination

- `/api/runs` accepts `limit` and `cursor` query params
- Server slices the sorted list by cursor, returns `next_cursor`
- `RunListResponse` gets `next_cursor?: string`
- Studio: `runListOptions` → `infiniteQueryOptions`
- `RunList.tsx`: flatten pages, add `IntersectionObserver` sentinel

### Pass 5 — cleanup

- Remove the entire P1 PR scope (closed PR #1260): `RunIndexEntry`, `appendToRunIndex`, `readRunIndex`, `reindexResultsRepo`, `agentv results reindex` command, `index/runs.jsonl` writes
- Remove `localResults` listing — local-only mode is no longer supported
- Remove `SourcedResultFileMeta.source` field — runs are no longer "local" or "remote", they're either committed or in-progress
- Update docs site (`apps/web/src/content/docs/`)
- Update skill files (`plugins/agentv-dev/skills/agentv-eval-builder/`)
- Update examples that hardcoded `.agentv/results/runs/` paths

## Breaking changes

| Change | Impact |
|--------|--------|
| `results.repo` becomes required | Users without a results repo can't run evals until they configure one |
| `results.path` repurposed (subdir → filesystem path) | Existing configs with `path: runs` fail loudly with migration message |
| No more `.agentv/results/runs/` writes | Project-local results no longer exist; everything lives in the configured `path` |
| `cache_dir` → `local_dir` in status responses | Studio + any external scripts reading status need to update |
| `SourcedResultFileMeta.source` removed | Studio "source" badge becomes "in progress / shared" |

These breaking changes are accepted because there are no production users yet. Document them in the release notes; upgrading requires a fresh config.

## Test plan

- Unit tests for `git ls-tree` + `git cat-file --batch` parsing helpers
- Integration test that spins up a tmp git repo, writes runs via the new write path, lists via the new read path, asserts results
- Pagination unit tests (cursor in/out of bounds, exact-boundary cases)
- E2E: run an actual eval against a real (test-scoped) results repo, verify the commit lands with the `Agentv-Run:` trailer, `git ls-tree` shows the run, Studio renders it

## Deferred to future PRs

- **P5 zero-config same-repo mode** — write to `refs/agentv/runs/v1` in the source repo when no `results.repo` is configured. Independent feature; design pattern works the same.
- **Multi-mode support** — if a cloud Studio gets built later, `mode: cloud` would mirror skillfully's "managed in Skillfully" mode. The current explicit `mode: github` field is the extension point.
- **PR-based publishing** — for human-curated content. Eval results are machine-generated, so direct commit is correct. If users want review-before-merge for sensitive evals (e.g., regulatory benchmarks), add `share: auto-pr` later.
- **In-memory list caching** — P2 from #1259. The git-object-DB read path is fast enough that caching is not needed today. Revisit if profiling shows it's a bottleneck.

## Open implementation questions

1. **Branch model**: `origin/main` or a dedicated `origin/agentv-runs/main`? Current vote: `main`, since this is a dedicated results repo.
2. **What to do on `git fetch` failures during `agentv eval`**? Current vote: warn, proceed with stale local state, surface the error in Studio. Don't block the eval — local commit always works.
3. **`gh` CLI dependency**: stays scoped to existing PR-related code paths. The new git-native flow uses raw `git` only.

## What this PR does NOT do

- Doesn't add a separate index file (the index IS the git tree)
- Doesn't ship a `reindex` migration command (nothing to backfill — `benchmark.json` already exists per run)
- Doesn't change the artifact format (`benchmark.json`, `index.jsonl`, per-test dirs stay as-is)
- Doesn't add server-side caching (deferred)
- Doesn't add PR-based publishing (deferred)
- Doesn't touch the source repo's commit history (only the configured `results.repo`)
Loading
Loading