From 4e864179f3e9c4d74712fb8080011b306f2de15a Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Wed, 3 Jun 2026 14:34:38 -0400 Subject: [PATCH] feat: opt-in control-plane fallback on browser_gone Replace the prior broad-trigger fallback (any 5xx on any idempotent GET) with a tighter, opt-in design that assumes metro-api kernel#2317: a routed request targeting a deleted/gone browser returns HTTP 404 with a JSON body code "browser_gone". Fallback now fires IFF all hold: the request was actually routed to the VM (allowlisted subresource + cached route), method is GET, the routed path is in a fallback-eligible registry, and the response is 404 with body code "browser_gone". On fallback the cached route is evicted and the original control-plane request is replayed exactly once (restoring Authorization, dropping the jwt query param). Transient 5xx, connection errors, other 4xx, and non-browser_gone 404s propagate unchanged. The eligibility registry pre-registers only the prospective pull endpoint GET /browsers/{id}/telemetry/events. Adding future eligible endpoints is a one-line registry edit. The default routing subresource list is intentionally not modified by this PR. Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/browserrouting/route_cache.go | 135 ++++++++ lib/browserrouting/route_cache_test.go | 419 +++++++++++++++++++++++++ 2 files changed, 554 insertions(+) diff --git a/lib/browserrouting/route_cache.go b/lib/browserrouting/route_cache.go index 3631bd14..dcbc5617 100644 --- a/lib/browserrouting/route_cache.go +++ b/lib/browserrouting/route_cache.go @@ -31,6 +31,45 @@ type cacheLifecycle struct { evictSessionID string } +// fallbackEndpoint identifies a routed subresource path that is eligible for +// control-plane fallback, expressed against the parsed routed path +// (subresource + suffix). +type fallbackEndpoint struct { + subresource string + suffix string +} + +// fallbackEligibleEndpoints is the routing-layer registry of endpoints that +// may fall back to the control plane when the VM reports the browser is gone. +// Everything not listed here is fallback-OFF by default. Adding a future +// eligible endpoint is a one-line edit here. +var fallbackEligibleEndpoints = map[fallbackEndpoint]struct{}{ + // PROSPECTIVE: GET /browsers/{id}/telemetry/events. The telemetry pull + // method does not exist yet; this pre-wires the opt-in so fallback works + // the moment that method ships. + {subresource: "telemetry", suffix: "/events"}: {}, +} + +// isFallbackEligible reports whether the parsed routed path opts into +// control-plane fallback. +func isFallbackEligible(subresource, suffix string) bool { + _, ok := fallbackEligibleEndpoints[fallbackEndpoint{subresource: subresource, suffix: suffix}] + return ok +} + +// browserGoneCode is the JSON body code metro-api (kernel#2317) returns, with +// HTTP 404, when a routed request targets a deleted/gone browser. A live VM's +// own 404 does not carry this code, and transient/upstream failures stay 5xx. +const browserGoneCode = "browser_gone" + +// originalRequest captures the control-plane-bound request before the routing +// middleware mutates it, so it can be replayed verbatim on fallback. +type originalRequest struct { + url *url.URL + host string + authorization []string +} + // NewRouteCache returns an empty browser route cache. func NewRouteCache() *RouteCache { return &RouteCache{routes: map[string]Route{}} @@ -86,6 +125,14 @@ func DirectVMRoutingMiddleware(cache *RouteCache, subresources []string) option. if err != nil { return nil, err } + + var ( + routed bool + routedSess string + routedSub string + routedSuf string + snapshot originalRequest + ) sessionID, subresource, suffix, ok := parseDirectVMPath(req.URL.Path) if ok { if _, ok := allowed[subresource]; ok { @@ -95,6 +142,15 @@ func DirectVMRoutingMiddleware(cache *RouteCache, subresources []string) option. if err != nil { return nil, err } + + // Snapshot the original control-plane-bound request before + // mutating it, so fallback can replay it verbatim. + snapshot = snapshotRequest(req) + routed = true + routedSess = sessionID + routedSub = subresource + routedSuf = suffix + req.Header.Del("Authorization") if route.JWT != "" { q := req.URL.Query() @@ -117,10 +173,89 @@ func DirectVMRoutingMiddleware(cache *RouteCache, subresources []string) option. if err != nil { return res, err } + + if routed && shouldFallbackToControlPlane(req.Method, routedSub, routedSuf, res) { + return controlPlaneFallback(req, next, cache, routedSess, snapshot) + } + return finalizeResponse(res, cache, lifecycle) } } +// snapshotRequest captures the control-plane-bound request state (URL, Host, +// Authorization) before the routing middleware rewrites it. +func snapshotRequest(req *http.Request) originalRequest { + snap := originalRequest{host: req.Host} + if req.URL != nil { + urlCopy := *req.URL + snap.url = &urlCopy + } + if auth, ok := req.Header["Authorization"]; ok { + snap.authorization = append([]string(nil), auth...) + } + return snap +} + +// shouldFallbackToControlPlane reports whether a routed VM response warrants a +// control-plane fallback. It is the single point that decides fallback, and +// only inspects the body on a 404. On a 404 the body is buffered and restored +// so callers that do NOT fall back still receive it intact. +func shouldFallbackToControlPlane(method, subresource, suffix string, res *http.Response) bool { + if method != http.MethodGet { + return false + } + if !isFallbackEligible(subresource, suffix) { + return false + } + if res == nil || res.StatusCode != http.StatusNotFound { + return false + } + return responseHasBrowserGoneCode(res) +} + +// responseHasBrowserGoneCode buffers the response body, restores it for later +// callers, and reports whether its JSON body carries code == "browser_gone". +func responseHasBrowserGoneCode(res *http.Response) bool { + if res == nil || res.Body == nil { + return false + } + + body, err := io.ReadAll(res.Body) + if err != nil { + return false + } + _ = res.Body.Close() + res.Body = io.NopCloser(bytes.NewReader(body)) + res.ContentLength = int64(len(body)) + + var payload struct { + Code string `json:"code"` + } + if err := json.Unmarshal(body, &payload); err != nil { + return false + } + return payload.Code == browserGoneCode +} + +// controlPlaneFallback evicts the authoritatively-gone route and replays the +// original control-plane request exactly once, returning its response. It never +// loops back through VM routing. +func controlPlaneFallback(req *http.Request, next option.MiddlewareNext, cache *RouteCache, sessionID string, snapshot originalRequest) (*http.Response, error) { + cache.Delete(sessionID) + + if snapshot.url != nil { + urlCopy := *snapshot.url + req.URL = &urlCopy + } + req.Host = snapshot.host + req.Header.Del("Authorization") + for _, value := range snapshot.authorization { + req.Header.Add("Authorization", value) + } + + return next(req) +} + func parseCacheLifecycle(req *http.Request) (cacheLifecycle, error) { if req == nil || req.URL == nil { return cacheLifecycle{}, nil diff --git a/lib/browserrouting/route_cache_test.go b/lib/browserrouting/route_cache_test.go index b4899412..d36ffb0f 100644 --- a/lib/browserrouting/route_cache_test.go +++ b/lib/browserrouting/route_cache_test.go @@ -340,3 +340,422 @@ func TestDirectVMRoutingMiddlewareDeleteWinsOverJSONCacheSniff(t *testing.T) { t.Fatal("expected delete response to leave cached route evicted") } } + +// browserGoneResponse builds a routed-VM 404 with the browser_gone body code. +func browserGoneResponse() *http.Response { + body := `{"code":"browser_gone","message":"browser not found"}` + return &http.Response{ + StatusCode: http.StatusNotFound, + Header: http.Header{"Content-Type": []string{"application/json"}}, + Body: io.NopCloser(strings.NewReader(body)), + } +} + +// telemetryEventsRequest builds a GET /browsers/{id}/telemetry/events request +// carrying a control-plane Authorization header. +func telemetryEventsRequest(t *testing.T) *http.Request { + t.Helper() + reqURL, err := url.Parse("https://api.example/browsers/sess-1/telemetry/events") + if err != nil { + t.Fatal(err) + } + return &http.Request{ + Method: http.MethodGet, + URL: reqURL, + Host: "api.example", + Header: http.Header{"Authorization": []string{"Bearer sk_test"}}, + } +} + +func warmTelemetryCache(t *testing.T) *RouteCache { + t.Helper() + cache := NewRouteCache() + cache.Store(Route{ + SessionID: "sess-1", + BaseURL: "https://browser.example/browser/kernel", + JWT: "jwt-123", + }) + return cache +} + +func TestDirectVMRoutingMiddlewareFallsBackToControlPlaneOnBrowserGone(t *testing.T) { + cache := warmTelemetryCache(t) + // "telemetry" is enabled explicitly here; the default subresource list is + // intentionally not modified by this PR. + middleware := DirectVMRoutingMiddleware(cache, []string{"telemetry"}) + req := telemetryEventsRequest(t) + + var calls []*http.Request + res, err := middleware(req, func(next *http.Request) (*http.Response, error) { + // Snapshot the request state observed on each call. + clone := *next + cloneURL := *next.URL + clone.URL = &cloneURL + clone.Header = next.Header.Clone() + calls = append(calls, &clone) + if len(calls) == 1 { + return browserGoneResponse(), nil + } + return &http.Response{ + StatusCode: http.StatusOK, + Header: http.Header{"Content-Type": []string{"application/json"}}, + Body: io.NopCloser(strings.NewReader(`{"events":[]}`)), + }, nil + }) + if err != nil { + t.Fatal(err) + } + + if len(calls) != 2 { + t.Fatalf("expected exactly one control-plane re-issue (2 calls), got %d", len(calls)) + } + + // First call routed to the VM: Authorization stripped, jwt injected. + if calls[0].URL.Host != "browser.example" { + t.Fatalf("expected first call routed to VM host, got %q", calls[0].URL.Host) + } + if calls[0].Header.Get("Authorization") != "" { + t.Fatalf("expected VM call to drop Authorization, got %q", calls[0].Header.Get("Authorization")) + } + if calls[0].URL.Query().Get("jwt") != "jwt-123" { + t.Fatalf("expected VM call to carry jwt, got %q", calls[0].URL.Query().Get("jwt")) + } + + // Second call replays the original control-plane request. + if calls[1].URL.Host != "api.example" { + t.Fatalf("expected fallback to original CP host, got %q", calls[1].URL.Host) + } + if calls[1].URL.Path != "/browsers/sess-1/telemetry/events" { + t.Fatalf("expected fallback to original CP path, got %q", calls[1].URL.Path) + } + if calls[1].Header.Get("Authorization") != "Bearer sk_test" { + t.Fatalf("expected fallback to restore Authorization, got %q", calls[1].Header.Get("Authorization")) + } + if calls[1].URL.Query().Get("jwt") != "" { + t.Fatalf("expected fallback to drop jwt query param, got %q", calls[1].URL.Query().Get("jwt")) + } + + if _, ok := cache.Load("sess-1"); ok { + t.Fatal("expected fallback to evict the cached route") + } + + gotBody, err := io.ReadAll(res.Body) + if err != nil { + t.Fatal(err) + } + if string(gotBody) != `{"events":[]}` { + t.Fatalf("expected control-plane response body, got %q", gotBody) + } +} + +func TestDirectVMRoutingMiddlewareReturnsControlPlaneErrorWithoutLooping(t *testing.T) { + cache := warmTelemetryCache(t) + middleware := DirectVMRoutingMiddleware(cache, []string{"telemetry"}) + req := telemetryEventsRequest(t) + + callCount := 0 + res, err := middleware(req, func(next *http.Request) (*http.Response, error) { + callCount++ + if callCount == 1 { + return browserGoneResponse(), nil + } + // Control plane also reports the browser gone; must NOT loop. + return browserGoneResponse(), nil + }) + if err != nil { + t.Fatal(err) + } + if callCount != 2 { + t.Fatalf("expected exactly 2 calls (VM + one CP re-issue), got %d", callCount) + } + if res.StatusCode != http.StatusNotFound { + t.Fatalf("expected control-plane response returned as-is, got %d", res.StatusCode) + } +} + +func TestDirectVMRoutingMiddlewareNoFallbackForIneligiblePath(t *testing.T) { + cache := warmTelemetryCache(t) + // Route a different subresource that is NOT in the fallback registry. + middleware := DirectVMRoutingMiddleware(cache, []string{"process"}) + reqURL, err := url.Parse("https://api.example/browsers/sess-1/process/exec") + if err != nil { + t.Fatal(err) + } + req := &http.Request{ + Method: http.MethodGet, + URL: reqURL, + Host: "api.example", + Header: http.Header{"Authorization": []string{"Bearer sk_test"}}, + } + + callCount := 0 + res, err := middleware(req, func(next *http.Request) (*http.Response, error) { + callCount++ + return browserGoneResponse(), nil + }) + if err != nil { + t.Fatal(err) + } + if callCount != 1 { + t.Fatalf("expected no fallback for ineligible path, got %d calls", callCount) + } + if res.StatusCode != http.StatusNotFound { + t.Fatalf("expected VM 404 returned, got %d", res.StatusCode) + } + if _, ok := cache.Load("sess-1"); !ok { + t.Fatal("expected route to remain cached for ineligible path") + } + gotBody, err := io.ReadAll(res.Body) + if err != nil { + t.Fatal(err) + } + if !strings.Contains(string(gotBody), "browser_gone") { + t.Fatalf("expected VM 404 body returned intact, got %q", gotBody) + } +} + +func TestDirectVMRoutingMiddlewareNoFallbackOnTransientServerError(t *testing.T) { + for _, status := range []int{http.StatusBadGateway, http.StatusServiceUnavailable, http.StatusGatewayTimeout} { + cache := warmTelemetryCache(t) + middleware := DirectVMRoutingMiddleware(cache, []string{"telemetry"}) + req := telemetryEventsRequest(t) + + callCount := 0 + res, err := middleware(req, func(next *http.Request) (*http.Response, error) { + callCount++ + return &http.Response{ + StatusCode: status, + Header: http.Header{}, + Body: io.NopCloser(strings.NewReader("upstream error")), + }, nil + }) + if err != nil { + t.Fatal(err) + } + if callCount != 1 { + t.Fatalf("status %d: expected transient error to propagate without fallback, got %d calls", status, callCount) + } + if res.StatusCode != status { + t.Fatalf("status %d: expected propagated unchanged, got %d", status, res.StatusCode) + } + if _, ok := cache.Load("sess-1"); !ok { + t.Fatalf("status %d: expected route to remain cached", status) + } + } +} + +func TestDirectVMRoutingMiddlewareNoFallbackOnConnectionError(t *testing.T) { + cache := warmTelemetryCache(t) + middleware := DirectVMRoutingMiddleware(cache, []string{"telemetry"}) + req := telemetryEventsRequest(t) + + callCount := 0 + _, err := middleware(req, func(next *http.Request) (*http.Response, error) { + callCount++ + return nil, io.ErrUnexpectedEOF + }) + if err != io.ErrUnexpectedEOF { + t.Fatalf("expected connection error to propagate, got %v", err) + } + if callCount != 1 { + t.Fatalf("expected no fallback on connection error, got %d calls", callCount) + } + if _, ok := cache.Load("sess-1"); !ok { + t.Fatal("expected route to remain cached after connection error") + } +} + +func TestDirectVMRoutingMiddlewareNoFallbackOnSuccess(t *testing.T) { + cache := warmTelemetryCache(t) + middleware := DirectVMRoutingMiddleware(cache, []string{"telemetry"}) + req := telemetryEventsRequest(t) + + callCount := 0 + res, err := middleware(req, func(next *http.Request) (*http.Response, error) { + callCount++ + return &http.Response{ + StatusCode: http.StatusOK, + Header: http.Header{"Content-Type": []string{"application/json"}}, + Body: io.NopCloser(strings.NewReader(`{"events":[]}`)), + }, nil + }) + if err != nil { + t.Fatal(err) + } + if callCount != 1 { + t.Fatalf("expected no fallback on success, got %d calls", callCount) + } + if res.StatusCode != http.StatusOK { + t.Fatalf("expected 200 returned, got %d", res.StatusCode) + } + if _, ok := cache.Load("sess-1"); !ok { + t.Fatal("expected route to remain cached on success") + } +} + +func TestDirectVMRoutingMiddlewareNoFallbackOnNon404BrowserGone(t *testing.T) { + // A live VM's own 404 with a different code must NOT trigger fallback. + cache := warmTelemetryCache(t) + middleware := DirectVMRoutingMiddleware(cache, []string{"telemetry"}) + req := telemetryEventsRequest(t) + + callCount := 0 + res, err := middleware(req, func(next *http.Request) (*http.Response, error) { + callCount++ + return &http.Response{ + StatusCode: http.StatusNotFound, + Header: http.Header{"Content-Type": []string{"application/json"}}, + Body: io.NopCloser(strings.NewReader(`{"code":"not_found","message":"no such event"}`)), + }, nil + }) + if err != nil { + t.Fatal(err) + } + if callCount != 1 { + t.Fatalf("expected no fallback for non-browser_gone 404, got %d calls", callCount) + } + if res.StatusCode != http.StatusNotFound { + t.Fatalf("expected VM 404 returned, got %d", res.StatusCode) + } + gotBody, err := io.ReadAll(res.Body) + if err != nil { + t.Fatal(err) + } + if !strings.Contains(string(gotBody), "not_found") { + t.Fatalf("expected VM 404 body returned intact, got %q", gotBody) + } +} + +func TestDirectVMRoutingMiddlewareNoFallbackForEligibleSubresourceWrongSuffix(t *testing.T) { + // The "telemetry" subresource is routed and the body is browser_gone, but the + // suffix (/foo) is not the registered "/events" — registry keying is exact on + // {subresource, suffix}, so this must NOT fall back. + cache := warmTelemetryCache(t) + middleware := DirectVMRoutingMiddleware(cache, []string{"telemetry"}) + reqURL, err := url.Parse("https://api.example/browsers/sess-1/telemetry/foo") + if err != nil { + t.Fatal(err) + } + req := &http.Request{ + Method: http.MethodGet, + URL: reqURL, + Host: "api.example", + Header: http.Header{"Authorization": []string{"Bearer sk_test"}}, + } + + callCount := 0 + res, err := middleware(req, func(next *http.Request) (*http.Response, error) { + callCount++ + return browserGoneResponse(), nil + }) + if err != nil { + t.Fatal(err) + } + if callCount != 1 { + t.Fatalf("expected no fallback for eligible subresource with wrong suffix, got %d calls", callCount) + } + if res.StatusCode != http.StatusNotFound { + t.Fatalf("expected VM 404 returned, got %d", res.StatusCode) + } + if _, ok := cache.Load("sess-1"); !ok { + t.Fatal("expected route to remain cached for wrong-suffix path") + } +} + +func TestDirectVMRoutingMiddlewareNoFallbackOnOther4xx(t *testing.T) { + // A 403 on an eligible GET must NOT fall back — only a 404 with the + // browser_gone body code is authoritative for fallback. + cache := warmTelemetryCache(t) + middleware := DirectVMRoutingMiddleware(cache, []string{"telemetry"}) + req := telemetryEventsRequest(t) + + callCount := 0 + res, err := middleware(req, func(next *http.Request) (*http.Response, error) { + callCount++ + return &http.Response{ + StatusCode: http.StatusForbidden, + Header: http.Header{"Content-Type": []string{"application/json"}}, + Body: io.NopCloser(strings.NewReader(`{"code":"forbidden","message":"nope"}`)), + }, nil + }) + if err != nil { + t.Fatal(err) + } + if callCount != 1 { + t.Fatalf("expected no fallback on 403, got %d calls", callCount) + } + if res.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403 returned unchanged, got %d", res.StatusCode) + } + if _, ok := cache.Load("sess-1"); !ok { + t.Fatal("expected route to remain cached on 403") + } + gotBody, err := io.ReadAll(res.Body) + if err != nil { + t.Fatal(err) + } + if !strings.Contains(string(gotBody), "forbidden") { + t.Fatalf("expected 403 body returned intact, got %q", gotBody) + } +} + +func TestDirectVMRoutingMiddlewareNoFallbackForNonGetMethod(t *testing.T) { + cache := warmTelemetryCache(t) + middleware := DirectVMRoutingMiddleware(cache, []string{"telemetry"}) + reqURL, err := url.Parse("https://api.example/browsers/sess-1/telemetry/events") + if err != nil { + t.Fatal(err) + } + req := &http.Request{ + Method: http.MethodPost, + URL: reqURL, + Host: "api.example", + Header: http.Header{"Authorization": []string{"Bearer sk_test"}}, + } + + callCount := 0 + res, err := middleware(req, func(next *http.Request) (*http.Response, error) { + callCount++ + return browserGoneResponse(), nil + }) + if err != nil { + t.Fatal(err) + } + if callCount != 1 { + t.Fatalf("expected no fallback for POST, got %d calls", callCount) + } + if res.StatusCode != http.StatusNotFound { + t.Fatalf("expected VM 404 returned for POST, got %d", res.StatusCode) + } +} + +func TestDirectVMRoutingMiddlewareNonRoutedRequestUntouched(t *testing.T) { + // No cached route -> request is never routed, so fallback never applies even + // for an eligible path + browser_gone body. + cache := NewRouteCache() + middleware := DirectVMRoutingMiddleware(cache, []string{"telemetry"}) + req := telemetryEventsRequest(t) + + var got *http.Request + callCount := 0 + res, err := middleware(req, func(next *http.Request) (*http.Response, error) { + callCount++ + got = next + return browserGoneResponse(), nil + }) + if err != nil { + t.Fatal(err) + } + if callCount != 1 { + t.Fatalf("expected non-routed request to make exactly one call, got %d", callCount) + } + if got.URL.Host != "api.example" { + t.Fatalf("expected non-routed request untouched, got host %q", got.URL.Host) + } + if got.Header.Get("Authorization") != "Bearer sk_test" { + t.Fatalf("expected non-routed Authorization untouched, got %q", got.Header.Get("Authorization")) + } + if res.StatusCode != http.StatusNotFound { + t.Fatalf("expected response returned unchanged, got %d", res.StatusCode) + } +}