Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 23 additions & 7 deletions src/runtime/sandbox-acquire.ts
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ export async function acquireSandbox(
const sleep = acquire.sleep ?? ((ms: number) => abortableSleep(ms, acquire.signal))
const pollMs = acquire.pollIntervalMs ?? 3000
const deadline = now() + (acquire.readyTimeoutMs ?? 600_000)
// After a retryable create error (commonly a gateway/request timeout on a cold
// scale-from-zero), the orchestrator has usually ACCEPTED the request and is
// still provisioning the NAMED box — which appears in list() a few seconds
// AFTER the create call gave up. Scan list() up to this many times (one poll
// interval apart, while the deadline allows) for it to appear before
// re-POSTing: re-creating immediately restarts a fresh cold
// provision and hits the same wall — that thrash is why a cold acquire never
// converges within the budget; attaching to the in-flight box does.
const appearScans = 5
// crypto.randomUUID is collision-resistant — find-by-name recovery scans
// list() for this exact name, so two concurrent acquires must never collide.
const name = options.name ?? acquire.name ?? `loop-sbx-${randomUuid()}`
Expand All @@ -97,14 +105,22 @@ export async function acquireSandbox(
// Non-retryable (auth/validation/budget) fails loud immediately.
if (!isRetryable(err)) throw err
lastErr = err
// Two recoveries for a gateway-timed-out create, in order:
// (a) some orchestrators leave a pending sandbox behind — attach to it;
// (b) others roll the create back — so retry create with backoff (a
// retry lands once a warm host exists / the autoscaler caught up).
// Recovery for a gateway-timed-out create, in order:
// (a) the orchestrator usually ACCEPTED the create and is provisioning
// the named box — it appears in list() a few seconds later, so poll
// list() up to `appearScans` times, `pollMs` apart, and attach (this
// is the cold-start fix: a single scan misses a row not yet written and
// the loop would otherwise re-POST a fresh cold provision every backoff);
// (b) only if it never appears did the create truly roll back — retry
// create with backoff (lands once a warm host exists / autoscaler
// caught up).
if (typeof c.list === 'function') {
const found = (await c.list().catch(() => []))?.find((b) => b.name === name)
if (found)
return await waitReadyOrDestroy(found, deadline, pollMs, acquire.signal, now, sleep)
for (let scan = 0; scan < appearScans && now() < deadline; scan += 1) {
const found = (await c.list().catch(() => []))?.find((b) => b.name === name)
if (found)
return await waitReadyOrDestroy(found, deadline, pollMs, acquire.signal, now, sleep)
if (scan < appearScans - 1) await sleep(pollMs)
}
}
attempt += 1
await sleep(Math.min(pollMs * attempt, 15_000))
Expand Down
6 changes: 5 additions & 1 deletion tests/loops/sandbox-acquire.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,12 @@ describe('acquireSandbox — cold-start resilience', () => {
},
list: async () => [], // orchestrator rolled back — nothing to attach to
}
// Budget spans several create→appear-scan→re-create cycles (fake clock, so
// instant): each cold create fails, list() never shows the box (true
// rollback), so after scanning it re-creates — proving it doesn't give up
// after one attempt and still times out loud when no host ever comes up.
await expect(
acquireSandbox(client, OPTS, { ...clock(), readyTimeoutMs: 10_000, pollIntervalMs: 3000 }),
acquireSandbox(client, OPTS, { ...clock(), readyTimeoutMs: 120_000, pollIntervalMs: 3000 }),
).rejects.toThrow(/could not acquire a running sandbox "sbx-1"/)
expect(creates).toBeGreaterThan(1) // retried create, not a single wait-then-fail
})
Expand Down
Loading