From e30bdc8bd9dc3714b5a2459542ad8195f2fc407e Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Wed, 17 Jun 2026 11:51:49 +0300 Subject: [PATCH 1/6] feat(telemetry): emit docker isolation flag, CLI source + docker failure codes (schema v5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the #696 (docker CLI not on PATH) and image/interpreter-mismatch failure modes visible in the telemetry dashboard instead of guessing. Feature flags (schema bumped 4 -> 5, additive / forward-compatible): - docker_isolation_enabled (bool) from cfg.DockerIsolation in BuildFeatureFlagSnapshot — distinguishes "isolation on, 0 matching servers" from "isolation off". - docker_cli_source (enum path|bundled|login_shell|absent) — the direct #696 fleet signal. New shellwrap.ResolveDockerSource reports which resolution branch found docker (shares the docker-path cache); Runtime.GetDockerCLISource mirrors IsDockerAvailable. Coarse enum only — never the path string. Diagnostics (flow into error_code_counts_24h, MCPX-prefix gated/PII-safe): - MCPX_DOCKER_CLI_NOT_FOUND — isolation requested but docker binary unresolved. - MCPX_DOCKER_EXEC_NOT_FOUND — in-container interpreter missing (e.g. uvx absent in python:3.11); previously misclassified as MCPX_STDIO_SPAWN_ENOENT. - MCPX_DOCKER_OCI_RUNTIME — OCI runtime / exec-format failures. ClassifierHints gains DockerIsolated; the supervisor threads it from a side-effect-free mirror of IsolationManager.ShouldIsolate so containerized ENOENT routes to DOCKER codes. Catalog entries added for all three. Privacy: all new fields are coarse enums/bools; docker_cli_source canary test asserts only the 4 enum values ever reach the wire and ScanForPII stays green. Docs/contracts updated (docs/features/telemetry.md + v2/v3 contract notes). DASHBOARD HAND-OFF (telemetry.mcpproxy.app, separate service): add panels for feature_flags.docker_isolation_enabled, the 4-way docker_cli_source breakdown, and the 3 new MCPX_DOCKER_* rows in the error_code_counts_24h panel. Related #696 Co-Authored-By: Paperclip --- docs/features/telemetry.md | 8 +- internal/diagnostics/classifier.go | 68 ++++++++++++++++ .../diagnostics/classifier_domains_test.go | 58 ++++++++++++++ internal/diagnostics/codes.go | 11 +++ internal/diagnostics/registry.go | 28 +++++++ internal/diagnostics/types.go | 6 ++ internal/runtime/runtime.go | 10 +++ internal/runtime/supervisor/supervisor.go | 40 ++++++++-- internal/shellwrap/shellwrap.go | 80 ++++++++++++++++--- internal/shellwrap/shellwrap_test.go | 58 ++++++++++++++ internal/telemetry/feature_flags.go | 20 +++++ internal/telemetry/feature_flags_test.go | 20 +++++ internal/telemetry/optout_test.go | 1 + internal/telemetry/payload_privacy_test.go | 49 +++++++++++- internal/telemetry/payload_v2_test.go | 6 +- internal/telemetry/telemetry.go | 18 ++++- internal/telemetry/telemetry_test.go | 44 +++++++--- .../contracts/heartbeat-v2.schema.json | 1 + .../contracts/heartbeat-v3.json | 19 ++++- 19 files changed, 511 insertions(+), 34 deletions(-) diff --git a/docs/features/telemetry.md b/docs/features/telemetry.md index f4351117e..1f7c2e462 100644 --- a/docs/features/telemetry.md +++ b/docs/features/telemetry.md @@ -4,7 +4,7 @@ MCPProxy collects anonymous usage statistics to help improve the product. This p ## What is collected -MCPProxy sends a **daily heartbeat** containing only aggregate, non-identifying information. The current schema is **version 3** (`schema_version: 3` in the JSON payload); the schema is forward-compatible so older consumers simply ignore fields they don't recognize. +MCPProxy sends a **daily heartbeat** containing only aggregate, non-identifying information. The current schema is **version 5** (`schema_version: 5` in the JSON payload); the schema is forward-compatible so older consumers simply ignore fields they don't recognize. | Field | Example | Purpose | |-------|---------|---------| @@ -22,9 +22,15 @@ MCPProxy sends a **daily heartbeat** containing only aggregate, non-identifying | `feature_flags.docker_available` | `true` | Fraction of installs with a reachable Docker daemon (schema v3) | | `server_protocol_counts` | `{"stdio":3,"http":2,"sse":0,"streamable_http":1,"auto":0}` | Ratio of remote-HTTP vs local-stdio upstreams (schema v3) | | `server_docker_isolated_count` | `2` | How many configured servers the runtime actually wraps in Docker isolation (schema v3) | +| `feature_flags.docker_isolation_enabled` | `true` | Whether global Docker isolation is turned on (schema v5). Lets us tell "isolation on, 0 matching servers" apart from "isolation off" | +| `feature_flags.docker_cli_source` | `bundled` | How the `docker` CLI was located — fixed enum `path` / `bundled` / `login_shell` / `absent` (schema v5). The direct signal for "Docker installed but not on the spawn PATH" (issue #696). **Never** the path string itself | The `server_protocol_counts` map uses a **fixed enum of keys** (`stdio`, `http`, `sse`, `streamable_http`, `auto`) — server names and URLs are never included. Unknown or misconfigured protocol values are bucketed into `auto`. +The `docker_cli_source` field is likewise a **fixed enum** (`path`, `bundled`, `login_shell`, `absent`); the resolved path is never transmitted. + +Docker isolation failures surface in `error_code_counts_24h` via three stable diagnostic codes (schema v5): `MCPX_DOCKER_CLI_NOT_FOUND` (isolation requested but the `docker` binary is unresolved — issue #696), `MCPX_DOCKER_EXEC_NOT_FOUND` (the image lacks the interpreter the server needs, e.g. `uvx` missing in `python:3.11`), and `MCPX_DOCKER_OCI_RUNTIME` (OCI runtime / architecture-mismatch failures). + ## What is NOT collected The following is **never** collected: diff --git a/internal/diagnostics/classifier.go b/internal/diagnostics/classifier.go index b0877db4b..7f688063c 100644 --- a/internal/diagnostics/classifier.go +++ b/internal/diagnostics/classifier.go @@ -119,6 +119,20 @@ func classifyDocker(err error, _ ClassifierHints) Code { strings.Contains(msg, "docker") && strings.Contains(msg, "image") && strings.Contains(msg, "pull") && strings.Contains(msg, "fail"), strings.Contains(msg, "manifest unknown"): return DockerImagePullFailed + // docker CLI unresolved (#696). These shapes are unambiguous about the + // docker BINARY being missing, so they classify even without the + // DockerIsolated hint (e.g. shellwrap's resolution-failure error). + case strings.Contains(msg, "docker not found in path"), + strings.Contains(msg, "docker not found in login shell"), + strings.Contains(msg, "docker: command not found"), + strings.Contains(msg, "docker: not found"), + strings.Contains(msg, `"docker": executable file not found`): + return DockerCLINotFound + // OCI runtime / architecture-mismatch failures from `docker run`. + case strings.Contains(msg, "oci runtime"), + strings.Contains(msg, "exec format error"), + strings.Contains(msg, "runc"): + return DockerOCIRuntime } return "" } @@ -155,8 +169,62 @@ func classifyQuarantine(err error, _ ClassifierHints) Code { return "" } +// classifyDockerIsolatedSpawn maps a spawn/exec failure on a Docker-isolated +// server to a specific DOCKER code. Returns "" when the error is not a +// recognised docker-isolation failure (caller falls through to generic stdio +// handling). +// +// Case order is load-bearing: +// 1. The docker BINARY itself is missing (#696) — must win even though its +// message also contains "command not found" / "executable file not found". +// 2. The in-container interpreter is missing — real docker output nests this +// inside an "OCI runtime create failed: … exec: \"x\": executable file not +// found" string, so it must be checked BEFORE the generic OCI case below. +// 3. Any other OCI runtime failure (exec format error / runc). +func classifyDockerIsolatedSpawn(err error) Code { + // Host couldn't even start the docker binary (direct exec path). + var execErr *exec.Error + if errors.As(err, &execErr) && errors.Is(execErr.Err, syscall.ENOENT) && + strings.Contains(strings.ToLower(execErr.Name), "docker") { + return DockerCLINotFound + } + + msg := strings.ToLower(err.Error()) + switch { + // (1) docker binary unresolved: shellwrap resolution failure, or the shell + // / Go exec layer reporting `docker` itself missing. + case strings.Contains(msg, `"docker": executable file not found`), + strings.Contains(msg, "docker: command not found"), + strings.Contains(msg, "docker: not found"), + strings.Contains(msg, "docker not found in path"), + strings.Contains(msg, "docker not found in login shell"): + return DockerCLINotFound + // (2) in-container interpreter missing (image lacks uvx/node/python/…). + case strings.Contains(msg, "executable file not found"), + strings.Contains(msg, "no such file or directory"), + strings.Contains(msg, "command not found"): + return DockerExecNotFound + // (3) other OCI runtime failures (arch mismatch, runc start failure). + case strings.Contains(msg, "oci runtime"), + strings.Contains(msg, "exec format error"), + strings.Contains(msg, "runc"): + return DockerOCIRuntime + } + return "" +} + // classifyStdio handles os/exec spawn errors and handshake failures. func classifyStdio(err error, hints ClassifierHints) Code { + // Docker-isolated servers run `docker run …` over the stdio transport, so + // ENOENT-class failures here are docker-specific (#696 CLI missing, or an + // image/interpreter mismatch) rather than a plain host-binary miss. Resolve + // those to DOCKER codes before the generic stdio matching below. + if hints.DockerIsolated { + if c := classifyDockerIsolatedSpawn(err); c != "" { + return c + } + } + var execErr *exec.Error if errors.As(err, &execErr) { // exec.Error wraps os.PathError which wraps syscall.Errno; ENOENT/EACCES diff --git a/internal/diagnostics/classifier_domains_test.go b/internal/diagnostics/classifier_domains_test.go index b16b85559..84a262283 100644 --- a/internal/diagnostics/classifier_domains_test.go +++ b/internal/diagnostics/classifier_domains_test.go @@ -78,6 +78,64 @@ func TestClassify_Docker_SnapAppArmor(t *testing.T) { } } +// TestClassify_Docker_IsolationSpawn exercises the #696 / image-mismatch +// routing: when the docker-isolation hint is set, ENOENT-class failures on the +// stdio transport must resolve to DOCKER codes rather than a plain +// MCPX_STDIO_SPAWN_ENOENT, so the telemetry dashboard sees the real cause. +func TestClassify_Docker_IsolationSpawn(t *testing.T) { + cases := []struct { + name string + err error + hint ClassifierHints + want Code + }{ + { + // #696: docker CLI not on the spawn PATH; the login-shell wrap + // reports `docker: command not found`. + name: "cli_not_found_shell", + err: errors.New("stdio transport (command=\"/bin/zsh\", docker_isolation=true): recent stderr: docker: command not found"), + hint: ClassifierHints{Transport: "stdio", DockerIsolated: true}, + want: DockerCLINotFound, + }, + { + // shellwrap resolution failure surfaces this even without the hint. + name: "cli_not_found_resolve", + err: errors.New("docker not found in PATH or well-known locations"), + hint: ClassifierHints{Transport: "stdio", DockerIsolated: true}, + want: DockerCLINotFound, + }, + { + // In-container interpreter missing (e.g. uvx absent in python:3.11). + name: "exec_not_found", + err: errors.New("docker: Error response from daemon: failed to create task: OCI runtime create failed: runc create failed: exec: \"uvx\": executable file not found in $PATH: unknown"), + hint: ClassifierHints{Transport: "stdio", DockerIsolated: true}, + want: DockerExecNotFound, + }, + { + // OCI runtime / arch mismatch with no interpreter-missing detail. + name: "oci_runtime", + err: errors.New("docker: Error response from daemon: failed to create shim task: OCI runtime create failed: exec format error: unknown"), + hint: ClassifierHints{Transport: "stdio", DockerIsolated: true}, + want: DockerOCIRuntime, + }, + { + // Same ENOENT string WITHOUT the isolation hint stays a plain stdio + // spawn failure — no false DOCKER attribution for host stdio servers. + name: "non_containerized_enoent", + err: errors.New("failed to spawn: executable file not found in $PATH"), + hint: ClassifierHints{Transport: "stdio"}, + want: STDIOSpawnENOENT, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := Classify(tc.err, tc.hint); got != tc.want { + t.Errorf("Classify(%q) = %q, want %q", tc.err, got, tc.want) + } + }) + } +} + // --- CONFIG ----------------------------------------------------------------- func TestClassify_Config_ParseError(t *testing.T) { diff --git a/internal/diagnostics/codes.go b/internal/diagnostics/codes.go index 157dc3b97..6c6b1a78b 100644 --- a/internal/diagnostics/codes.go +++ b/internal/diagnostics/codes.go @@ -51,6 +51,17 @@ const ( DockerImagePullFailed Code = "MCPX_DOCKER_IMAGE_PULL_FAILED" DockerNoPermission Code = "MCPX_DOCKER_NO_PERMISSION" DockerSnapAppArmor Code = "MCPX_DOCKER_SNAP_APPARMOR" + // DockerCLINotFound: isolation was requested but the `docker` binary could + // not be resolved on the spawn PATH (issue #696 — Docker Desktop installed + // without the admin-gated CLI shim, or a LaunchAgent's minimal PATH). + DockerCLINotFound Code = "MCPX_DOCKER_CLI_NOT_FOUND" + // DockerExecNotFound: the container started but its entrypoint interpreter + // is missing from the image (e.g. `uvx` absent in `python:3.11`). Distinct + // from a HOST stdio ENOENT, which is MCPX_STDIO_SPAWN_ENOENT. + DockerExecNotFound Code = "MCPX_DOCKER_EXEC_NOT_FOUND" + // DockerOCIRuntime: the OCI runtime (runc) failed to start the container — + // e.g. an `exec format error` (image/host architecture mismatch). + DockerOCIRuntime Code = "MCPX_DOCKER_OCI_RUNTIME" ) // CONFIG domain — configuration parsing and validation failures. diff --git a/internal/diagnostics/registry.go b/internal/diagnostics/registry.go index b5f5ea9b7..29efde178 100644 --- a/internal/diagnostics/registry.go +++ b/internal/diagnostics/registry.go @@ -292,6 +292,34 @@ func seedDOCKER() { }, DocsURL: docsURL(DockerSnapAppArmor), }) + register(CatalogEntry{ + Code: DockerCLINotFound, + Severity: SeverityError, + UserMessage: "Docker isolation is enabled but the `docker` command could not be found. Install Docker, or add its CLI to your PATH.", + FixSteps: []FixStep{ + {Type: FixStepCommand, Label: "Check docker is on PATH", Command: "docker --version"}, + {Type: FixStepLink, Label: "Install Docker / enable the CLI", URL: docsURL(DockerCLINotFound)}, + }, + DocsURL: docsURL(DockerCLINotFound), + }) + register(CatalogEntry{ + Code: DockerExecNotFound, + Severity: SeverityError, + UserMessage: "The Docker image is missing the interpreter this server needs (e.g. the image has no `uvx`/`node`). Pick an image that includes it.", + FixSteps: []FixStep{ + {Type: FixStepLink, Label: "Choosing a Docker isolation image", URL: docsURL(DockerExecNotFound)}, + }, + DocsURL: docsURL(DockerExecNotFound), + }) + register(CatalogEntry{ + Code: DockerOCIRuntime, + Severity: SeverityError, + UserMessage: "The Docker container failed to start (OCI runtime error). This is often an image/CPU architecture mismatch.", + FixSteps: []FixStep{ + {Type: FixStepLink, Label: "Troubleshooting OCI runtime errors", URL: docsURL(DockerOCIRuntime)}, + }, + DocsURL: docsURL(DockerOCIRuntime), + }) } // --- CONFIG -------------------------------------------------------------- diff --git a/internal/diagnostics/types.go b/internal/diagnostics/types.go index 26e2ab460..9090e5e73 100644 --- a/internal/diagnostics/types.go +++ b/internal/diagnostics/types.go @@ -67,6 +67,12 @@ type DiagnosticError struct { type ClassifierHints struct { Transport string // "stdio", "http", "sse", "docker", etc. ServerID string + // DockerIsolated is true when the failing server is launched through Docker + // isolation (`docker run …` over the stdio transport). It lets the + // classifier route ENOENT-class spawn failures to DOCKER codes (CLI missing + // per #696, in-container interpreter missing) instead of a generic + // MCPX_STDIO_SPAWN_ENOENT. See classifyDockerIsolatedSpawn. + DockerIsolated bool } // FixRequest is the input to a registered fixer. diff --git a/internal/runtime/runtime.go b/internal/runtime/runtime.go index 19ea928d8..f635b7be0 100644 --- a/internal/runtime/runtime.go +++ b/internal/runtime/runtime.go @@ -2596,6 +2596,16 @@ func (r *Runtime) IsDockerAvailable() bool { return r.dockerProbeResult } +// GetDockerCLISource returns the coarse, fixed-enum branch that resolved the +// docker CLI — "path" | "bundled" | "login_shell" | "absent" (implements +// telemetry.RuntimeStats, schema v5 / MCP-2745). This is the direct #696 fleet +// signal (docker installed but not on the spawn PATH). It delegates to +// shellwrap.ResolveDockerSource, which shares the process-wide docker-path +// cache, so this is cheap on the heartbeat path. NEVER returns the path itself. +func (r *Runtime) GetDockerCLISource() string { + return shellwrap.ResolveDockerSource(r.logger) +} + // GetDockerIsolatedServerCount returns how many currently-configured servers // the runtime actually wraps in a Docker container (implements // telemetry.RuntimeStats, schema v3). diff --git a/internal/runtime/supervisor/supervisor.go b/internal/runtime/supervisor/supervisor.go index acf10b547..e61ab817f 100644 --- a/internal/runtime/supervisor/supervisor.go +++ b/internal/runtime/supervisor/supervisor.go @@ -3,6 +3,7 @@ package supervisor import ( "context" "fmt" + "path/filepath" "sync" "sync/atomic" "time" @@ -22,14 +23,15 @@ import ( // classifyAndAttach converts a raw connection error into a DiagnosticError and // stores it on the server status. Called from the supervisor's reconcile and // event paths. Spec 044. -func classifyAndAttach(status *stateview.ServerStatus, err error, transport string) { +func classifyAndAttach(status *stateview.ServerStatus, err error, transport string, dockerIsolated bool) { if err == nil { status.Diagnostic = nil return } code := diagnostics.Classify(err, diagnostics.ClassifierHints{ - Transport: transport, - ServerID: status.Name, + Transport: transport, + ServerID: status.Name, + DockerIsolated: dockerIsolated, }) if code == "" { // Classify always returns at least UnknownUnclassified for non-nil err, @@ -52,6 +54,34 @@ func classifyAndAttach(status *stateview.ServerStatus, err error, transport stri } } +// usesDockerIsolation reports whether the given server would be launched +// through Docker isolation, so the classifier can attribute spawn/exec failures +// to DOCKER codes (#696 CLI missing, in-container interpreter missing) rather +// than a generic stdio ENOENT. This is a side-effect-free mirror of +// core.IsolationManager.ShouldIsolate (internal/upstream/core/isolation.go) — +// it is only a classifier hint, so faithfulness matters more than sharing the +// (logging) implementation. +func (s *Supervisor) usesDockerIsolation(srv *config.ServerConfig) bool { + if srv == nil || srv.Command == "" { + return false + } + snap := s.configSvc.Current() + if snap == nil || snap.Config == nil || snap.Config.DockerIsolation == nil || + !snap.Config.DockerIsolation.Enabled { + return false + } + // Per-server explicit opt-out wins over the global enable. + if srv.Isolation != nil && srv.Isolation.Enabled != nil && !*srv.Isolation.Enabled { + return false + } + // Servers already running docker themselves are not double-isolated. + cmdBase := filepath.Base(srv.Command) + if cmdBase == "docker" || strings.Contains(srv.Command, "docker") { + return false + } + return true +} + // Supervisor manages the desired vs actual state reconciliation for upstream servers. // It subscribes to config changes and emits events when server states change. type Supervisor struct { @@ -680,7 +710,7 @@ func (s *Supervisor) updateStateView(name string, state *ServerState) { if state.Config != nil { transport = transportpkg.DetermineTransportType(state.Config) } - classifyAndAttach(status, state.ConnectionInfo.LastError, transport) + classifyAndAttach(status, state.ConnectionInfo.LastError, transport, s.usesDockerIsolation(state.Config)) // Spec 044 Phase H: notify telemetry counter store. if status.Diagnostic != nil { s.callbackMu.RLock() @@ -978,7 +1008,7 @@ func (s *Supervisor) updateSnapshotFromEvent(event Event) { if status.Config != nil { transport = transportpkg.DetermineTransportType(status.Config) } - classifyAndAttach(status, connInfo.LastError, transport) + classifyAndAttach(status, connInfo.LastError, transport, s.usesDockerIsolation(status.Config)) // Spec 044 Phase H: notify telemetry counter store. if status.Diagnostic != nil { s.callbackMu.RLock() diff --git a/internal/shellwrap/shellwrap.go b/internal/shellwrap/shellwrap.go index ee4517d59..55a44be88 100644 --- a/internal/shellwrap/shellwrap.go +++ b/internal/shellwrap/shellwrap.go @@ -136,11 +136,23 @@ var dockerPathNegativeTTL = 60 * time.Second var ( dockerPathMu sync.Mutex dockerPath string + dockerPathSource string // which branch resolved docker (see DockerSource* enum) dockerPathErr error dockerPathHasResult bool dockerPathExpires time.Time // zero for cached success (never expires) ) +// DockerSource* are the coarse, fixed-enum labels describing HOW the docker +// CLI was resolved (or that it is absent). They are emitted in telemetry as the +// #696 fleet signal (docker-installed-but-not-on-PATH). They deliberately carry +// no path, host, or user information — only the resolution branch. +const ( + DockerSourcePath = "path" // found via exec.LookPath (ambient PATH) + DockerSourceBundled = "bundled" // found at a well-known install location (e.g. Docker Desktop bundle) + DockerSourceLoginShell = "login_shell" // recovered via the user's login-shell PATH + DockerSourceAbsent = "absent" // not resolvable anywhere (#696 worst case) +) + // wellKnownDockerPathsFn returns docker install locations to probe directly // when neither $PATH nor the user's login shell exposes a docker binary. // Exposed as a package variable so tests can stub the list. @@ -240,8 +252,9 @@ func ResolveDockerPath(logger *zap.Logger) (string, error) { return dockerPath, dockerPathErr } - path, err := resolveDockerPathUncached(logger) + path, source, err := resolveDockerPathUncached(logger) dockerPath = path + dockerPathSource = source dockerPathErr = err dockerPathHasResult = true if err != nil { @@ -252,13 +265,57 @@ func ResolveDockerPath(logger *zap.Logger) (string, error) { return path, err } -func resolveDockerPathUncached(logger *zap.Logger) (string, error) { +// ResolveDockerSource returns the coarse, fixed-enum label describing how the +// docker CLI was resolved (DockerSourcePath / DockerSourceBundled / +// DockerSourceLoginShell), or DockerSourceAbsent when docker cannot be found. +// It shares the same cache as ResolveDockerPath, so a prior successful or +// failed resolution is reused (honoring the negative TTL). Never returns the +// resolved path — only the branch — so callers (telemetry) cannot leak it. +func ResolveDockerSource(logger *zap.Logger) string { + dockerPathMu.Lock() + defer dockerPathMu.Unlock() + + if dockerPathHasResult { + if dockerPathErr == nil { + return sourceOrAbsent(dockerPathSource) + } + if !dockerPathExpires.IsZero() && time.Now().Before(dockerPathExpires) { + return DockerSourceAbsent + } + } + + path, source, err := resolveDockerPathUncached(logger) + dockerPath = path + dockerPathSource = source + dockerPathErr = err + dockerPathHasResult = true + if err != nil { + dockerPathExpires = time.Now().Add(dockerPathNegativeTTL) + return DockerSourceAbsent + } + dockerPathExpires = time.Time{} + return sourceOrAbsent(source) +} + +// sourceOrAbsent normalizes an empty source string to DockerSourceAbsent so the +// enum is never blank on the wire. +func sourceOrAbsent(source string) string { + if source == "" { + return DockerSourceAbsent + } + return source +} + +// resolveDockerPathUncached resolves docker and reports which branch found it. +// The returned source is one of DockerSourcePath / DockerSourceBundled / +// DockerSourceLoginShell on success, or DockerSourceAbsent on failure. +func resolveDockerPathUncached(logger *zap.Logger) (path, source string, err error) { // Fast path: ask Go's standard PATH lookup first. - if p, err := exec.LookPath("docker"); err == nil && p != "" { + if p, lookErr := exec.LookPath("docker"); lookErr == nil && p != "" { if logger != nil { logger.Debug("shellwrap: resolved docker via PATH", zap.String("path", p)) } - return p, nil + return p, DockerSourcePath, nil } // Well-known path probe: covers PKG-installer / launchd / sandboxed @@ -266,13 +323,13 @@ func resolveDockerPathUncached(logger *zap.Logger) (string, error) { // customizations are unreachable, but Docker Desktop is installed at // a standard location. Cheap (just os.Stat) and reliable. if p := probeWellKnownDocker(logger); p != "" { - return p, nil + return p, DockerSourceBundled, nil } // Slow path: shell out via the user's login shell. Only useful for // non-standard installs (Colima, custom prefixes); skipped on Windows. if runtime.GOOS == osWindows { - return "", fmt.Errorf("docker not found in PATH or well-known locations") + return "", DockerSourceAbsent, fmt.Errorf("docker not found in PATH or well-known locations") } ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) @@ -280,19 +337,19 @@ func resolveDockerPathUncached(logger *zap.Logger) (string, error) { shell, shellArgs := WrapWithUserShell(logger, "command", []string{"-v", "docker"}) cmd := exec.CommandContext(ctx, shell, shellArgs...) - out, err := cmd.Output() - if err != nil { - return "", fmt.Errorf("login-shell docker lookup failed: %w", err) + out, lookErr := cmd.Output() + if lookErr != nil { + return "", DockerSourceAbsent, fmt.Errorf("login-shell docker lookup failed: %w", lookErr) } resolved := strings.TrimSpace(string(out)) if resolved == "" { - return "", fmt.Errorf("docker not found in login shell PATH") + return "", DockerSourceAbsent, fmt.Errorf("docker not found in login shell PATH") } if logger != nil { logger.Debug("shellwrap: resolved docker via login shell", zap.String("path", resolved)) } - return resolved, nil + return resolved, DockerSourceLoginShell, nil } // resetDockerPathCacheForTest is used by tests to clear the cache between @@ -302,6 +359,7 @@ func resetDockerPathCacheForTest() { dockerPathMu.Lock() defer dockerPathMu.Unlock() dockerPath = "" + dockerPathSource = "" dockerPathErr = nil dockerPathHasResult = false dockerPathExpires = time.Time{} diff --git a/internal/shellwrap/shellwrap_test.go b/internal/shellwrap/shellwrap_test.go index 809b8243e..1d23400c0 100644 --- a/internal/shellwrap/shellwrap_test.go +++ b/internal/shellwrap/shellwrap_test.go @@ -294,6 +294,64 @@ func TestResolveDockerPath_WellKnownPathFallback(t *testing.T) { assert.Equal(t, want, got) } +// TestResolveDockerSource reports which resolution branch found docker so the +// telemetry layer can emit the coarse #696 fleet signal (path/bundled/ +// login_shell/absent) — never the path string itself. +func TestResolveDockerSource(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("unix-only fixtures") + } + + t.Run("path", func(t *testing.T) { + resetDockerPathCacheForTest() + t.Cleanup(resetDockerPathCacheForTest) + dir := t.TempDir() + writeFakeDocker(t, dir) + t.Setenv("PATH", dir) + t.Setenv("SHELL", "/nonexistent/shell-must-not-be-invoked") + assert.Equal(t, DockerSourcePath, ResolveDockerSource(nil)) + }) + + t.Run("bundled", func(t *testing.T) { + resetDockerPathCacheForTest() + t.Cleanup(resetDockerPathCacheForTest) + dockerDir := t.TempDir() + want := writeFakeDocker(t, dockerDir) + t.Setenv("PATH", t.TempDir()) // ambient excludes docker + prev := wellKnownDockerPathsFn + wellKnownDockerPathsFn = func() []string { return []string{want} } + t.Cleanup(func() { wellKnownDockerPathsFn = prev }) + t.Setenv("SHELL", "/nonexistent/shell-must-not-be-invoked") + assert.Equal(t, DockerSourceBundled, ResolveDockerSource(nil)) + }) + + t.Run("login_shell", func(t *testing.T) { + resetDockerPathCacheForTest() + t.Cleanup(resetDockerPathCacheForTest) + dockerDir := t.TempDir() + dockerPath := writeFakeDocker(t, dockerDir) + t.Setenv("PATH", t.TempDir()) + prev := wellKnownDockerPathsFn + wellKnownDockerPathsFn = func() []string { return nil } + t.Cleanup(func() { wellKnownDockerPathsFn = prev }) + shellDir := t.TempDir() + t.Setenv("SHELL", writeFakeShell(t, shellDir, dockerPath)) + assert.Equal(t, DockerSourceLoginShell, ResolveDockerSource(nil)) + }) + + t.Run("absent", func(t *testing.T) { + resetDockerPathCacheForTest() + t.Cleanup(resetDockerPathCacheForTest) + t.Setenv("PATH", t.TempDir()) + prev := wellKnownDockerPathsFn + wellKnownDockerPathsFn = func() []string { return nil } + t.Cleanup(func() { wellKnownDockerPathsFn = prev }) + shellDir := t.TempDir() + t.Setenv("SHELL", writeFakeShell(t, shellDir, "")) + assert.Equal(t, DockerSourceAbsent, ResolveDockerSource(nil)) + }) +} + func TestMinimalEnv_DropsSecrets(t *testing.T) { t.Setenv("AWS_ACCESS_KEY_ID", "AKIA_test_dummy_value_00000000") t.Setenv("GITHUB_TOKEN", "ghp_dummy_test_token_1234567890abcdef") diff --git a/internal/telemetry/feature_flags.go b/internal/telemetry/feature_flags.go index 5daed7178..0c973e151 100644 --- a/internal/telemetry/feature_flags.go +++ b/internal/telemetry/feature_flags.go @@ -26,6 +26,20 @@ type FeatureFlagSnapshot struct { // BuildFeatureFlagSnapshot) so the snapshot helper stays side-effect-free // and doesn't shell out to `docker info`. DockerAvailable bool `json:"docker_available"` + + // Schema v5 (MCP-2745): DockerIsolationEnabled is the global + // docker_isolation.enabled flag. Set by BuildFeatureFlagSnapshot (pure, + // config-only). Together with DockerCLISource it lets the dashboard tell + // "isolation on, 0 matching servers" apart from "isolation off". + DockerIsolationEnabled bool `json:"docker_isolation_enabled"` + + // Schema v5 (MCP-2745): DockerCLISource is the coarse, fixed-enum branch + // that resolved the `docker` CLI — one of "path" | "bundled" | + // "login_shell" | "absent". This is the direct #696 fleet signal (docker + // installed but not on the spawn PATH). NEVER the path string itself. + // Populated by the telemetry service at heartbeat time (the resolution is a + // runtime concern) — mirrors DockerAvailable. + DockerCLISource string `json:"docker_cli_source,omitempty"` } // protocolKeys is the canonical fixed-enum set of protocol labels emitted by @@ -122,6 +136,12 @@ func BuildFeatureFlagSnapshot(cfg *config.Config) *FeatureFlagSnapshot { snap.SensitiveDataDetectionEnabled = cfg.SensitiveDataDetection.IsEnabled() } + // Schema v5 (MCP-2745): global docker isolation toggle. Nil-guarded so a + // config without the block reports false rather than panicking. + if cfg.DockerIsolation != nil { + snap.DockerIsolationEnabled = cfg.DockerIsolation.Enabled + } + // Derive OAuth provider types from upstream server URLs. var providerTypes []string for _, srv := range cfg.Servers { diff --git a/internal/telemetry/feature_flags_test.go b/internal/telemetry/feature_flags_test.go index b26805794..95c975cfa 100644 --- a/internal/telemetry/feature_flags_test.go +++ b/internal/telemetry/feature_flags_test.go @@ -81,6 +81,26 @@ func TestBuildFeatureFlagSnapshotFromConfig(t *testing.T) { } } +func TestBuildFeatureFlagSnapshotDockerIsolationEnabled(t *testing.T) { + // Enabled global docker isolation surfaces as true. + cfg := &config.Config{DockerIsolation: &config.DockerIsolationConfig{Enabled: true}} + if snap := BuildFeatureFlagSnapshot(cfg); !snap.DockerIsolationEnabled { + t.Error("DockerIsolationEnabled should be true when cfg.DockerIsolation.Enabled is true") + } + + // Disabled global docker isolation surfaces as false. + cfgOff := &config.Config{DockerIsolation: &config.DockerIsolationConfig{Enabled: false}} + if snap := BuildFeatureFlagSnapshot(cfgOff); snap.DockerIsolationEnabled { + t.Error("DockerIsolationEnabled should be false when cfg.DockerIsolation.Enabled is false") + } + + // Nil DockerIsolation must not panic and reports false. + cfgNil := &config.Config{} + if snap := BuildFeatureFlagSnapshot(cfgNil); snap.DockerIsolationEnabled { + t.Error("DockerIsolationEnabled should be false when cfg.DockerIsolation is nil") + } +} + func TestBuildFeatureFlagSnapshotNilFeatures(t *testing.T) { // When cfg.Features is nil, EnableWebUI should fall back to false rather // than panic. Guards against a nil-pointer deref in the emitter. diff --git a/internal/telemetry/optout_test.go b/internal/telemetry/optout_test.go index 16c37e6d1..2ec5952ef 100644 --- a/internal/telemetry/optout_test.go +++ b/internal/telemetry/optout_test.go @@ -291,6 +291,7 @@ func (h *hookStats) GetRoutingMode() string { return "retrieve_tools" func (h *hookStats) IsQuarantineEnabled() bool { return false } func (h *hookStats) IsDockerAvailable() bool { return false } func (h *hookStats) GetDockerIsolatedServerCount() int { return 0 } +func (h *hookStats) GetDockerCLISource() string { return "absent" } // TestSendHeartbeat_RechecksOptOutBeforeTransmit covers Codex fix #2: a heartbeat // already in flight when the opt-out latch flips must NOT transmit a usage diff --git a/internal/telemetry/payload_privacy_test.go b/internal/telemetry/payload_privacy_test.go index 147aa59fd..494e77d30 100644 --- a/internal/telemetry/payload_privacy_test.go +++ b/internal/telemetry/payload_privacy_test.go @@ -144,7 +144,7 @@ func TestPayloadHasNoForbiddenSubstrings(t *testing.T) { // Sanity check: the payload should still contain the legitimate fields, // otherwise we've over-redacted. for _, required := range []string{ - `"schema_version":4`, + `"schema_version":5`, `"surface_requests"`, `"builtin_tool_calls"`, `"upstream_tool_call_count_bucket"`, @@ -383,3 +383,50 @@ func TestPayloadV4_OnboardingDoesNotLeakUserStrings(t *testing.T) { t.Errorf("expected fixed-enum client ID 'claude-code' to reach payload, got:\n%s", js) } } + +// TestPayloadV5_DockerCLISourceIsEnumOnly is the Spec MCP-2745 privacy canary +// for the #696 docker-CLI-resolution signal. docker_cli_source must ONLY ever +// carry one of the four fixed-enum branch labels — never the resolved path, +// host, or any user string. The source of truth (shellwrap.ResolveDockerSource) +// already constrains the value to these enums; this test pins the contract at +// the wire and fails loudly if a future change widens the field. +func TestPayloadV5_DockerCLISourceIsEnumOnly(t *testing.T) { + t.Setenv("DO_NOT_TRACK", "") + t.Setenv("CI", "") + t.Setenv("MCPPROXY_TELEMETRY", "") + + cfg := &config.Config{ + DockerIsolation: &config.DockerIsolationConfig{Enabled: true}, + Telemetry: &config.TelemetryConfig{ + AnonymousID: "550e8400-e29b-41d4-a716-446655440000", + AnonymousIDCreatedAt: "2026-04-10T12:00:00Z", + }, + } + + for _, src := range []string{"path", "bundled", "login_shell", "absent"} { + t.Run(src, func(t *testing.T) { + svc := New(cfg, "", "v1.2.3", "personal", zap.NewNop()) + svc.SetRuntimeStats(&mockRuntimeStats{dockerAvailable: true, dockerCLISource: src}) + + payload := svc.BuildPayload() + data, err := json.Marshal(payload) + if err != nil { + t.Fatalf("marshal: %v", err) + } + js := string(data) + + if !strings.Contains(js, `"docker_cli_source":"`+src+`"`) { + t.Errorf("expected docker_cli_source=%q on the wire, got:\n%s", src, js) + } + // A path or URL slipping into the enum would be a privacy breach. + for _, forbidden := range []string{"/Users/", "/home/", "/Applications/", `C:\\`, ".docker", "http://", "https://"} { + if strings.Contains(js, forbidden) { + t.Errorf("PRIVACY VIOLATION: docker_cli_source leaked %q\npayload:\n%s", forbidden, js) + } + } + if err := ScanForPII(data); err != nil { + t.Errorf("v5 payload with docker_cli_source=%q should pass ScanForPII: %v", src, err) + } + }) + } +} diff --git a/internal/telemetry/payload_v2_test.go b/internal/telemetry/payload_v2_test.go index 319b61b1b..3408286d8 100644 --- a/internal/telemetry/payload_v2_test.go +++ b/internal/telemetry/payload_v2_test.go @@ -88,8 +88,8 @@ func TestHeartbeatPayloadV2Marshal(t *testing.T) { payload := svc.BuildPayload() - if payload.SchemaVersion != 4 { - t.Errorf("schema_version = %d, want 4", payload.SchemaVersion) + if payload.SchemaVersion != 5 { + t.Errorf("schema_version = %d, want 5", payload.SchemaVersion) } if payload.AnonymousID != "fixed-id" { t.Errorf("anonymous_id = %q", payload.AnonymousID) @@ -135,7 +135,7 @@ func TestHeartbeatPayloadV2Marshal(t *testing.T) { } js := string(data) for _, key := range []string{ - `"schema_version":4`, + `"schema_version":5`, `"surface_requests"`, `"builtin_tool_calls"`, `"upstream_tool_call_count_bucket":"11-100"`, diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 937d6c5c8..6ac5b444f 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -30,7 +30,14 @@ import ( // connected_client_count, connected_client_ids, wizard_engaged, // wizard_connect_step, wizard_server_step. Forward-compatible: existing // v3 consumers ignore the new fields. -const SchemaVersion = 4 +// +// v5 (schema bump from 4): adds docker-isolation visibility per MCP-2745 — +// feature_flags.docker_isolation_enabled, feature_flags.docker_cli_source +// (4-way enum: path|bundled|login_shell|absent, the #696 fleet signal), and +// three new diagnostics codes surfaced via error_code_counts_24h +// (MCPX_DOCKER_CLI_NOT_FOUND / MCPX_DOCKER_EXEC_NOT_FOUND / +// MCPX_DOCKER_OCI_RUNTIME). Additive only — v3/v4 consumers ignore them. +const SchemaVersion = 5 // HeartbeatPayload is the anonymous telemetry payload sent periodically. // Spec 042 expanded the payload with Tier 2 fields; v1 fields are preserved. @@ -167,6 +174,11 @@ type RuntimeStats interface { // GetDockerIsolatedServerCount returns how many currently-configured // servers the runtime is actually wrapping in a Docker container. GetDockerIsolatedServerCount() int + // GetDockerCLISource returns the coarse, fixed-enum branch that resolved + // the docker CLI — "path" | "bundled" | "login_shell" | "absent" (schema + // v5, MCP-2745). Implementations should memoize the resolution (it shares + // the shellwrap docker-path cache) and NEVER return the path string. + GetDockerCLISource() string } // Service manages anonymous telemetry heartbeats and feedback submission. @@ -561,6 +573,10 @@ func (s *Service) buildHeartbeat() HeartbeatPayload { payload.FeatureFlags = BuildFeatureFlagSnapshot(s.config) if s.stats != nil && payload.FeatureFlags != nil { payload.FeatureFlags.DockerAvailable = s.stats.IsDockerAvailable() + // Schema v5 (MCP-2745): coarse docker-CLI resolution branch (the #696 + // fleet signal). Resolution is a runtime concern, so it is spliced in + // here rather than in the side-effect-free BuildFeatureFlagSnapshot. + payload.FeatureFlags.DockerCLISource = s.stats.GetDockerCLISource() } // Schema v3: fixed-key per-protocol counter over cfg.Servers. Logs diff --git a/internal/telemetry/telemetry_test.go b/internal/telemetry/telemetry_test.go index b8511ed0a..7f5864bda 100644 --- a/internal/telemetry/telemetry_test.go +++ b/internal/telemetry/telemetry_test.go @@ -23,6 +23,7 @@ type mockRuntimeStats struct { quarantine bool dockerAvailable bool dockerIsolatedServers int + dockerCLISource string } func (m *mockRuntimeStats) GetServerCount() int { return m.serverCount } @@ -34,6 +35,7 @@ func (m *mockRuntimeStats) IsDockerAvailable() bool { return m.dockerAvaila func (m *mockRuntimeStats) GetDockerIsolatedServerCount() int { return m.dockerIsolatedServers } +func (m *mockRuntimeStats) GetDockerCLISource() string { return m.dockerCLISource } func TestHeartbeatSend(t *testing.T) { // Clear env vars that would disable telemetry (GitHub Actions sets CI=true). @@ -300,20 +302,20 @@ func TestEnsureAnonymousID(t *testing.T) { } } -// TestSchemaVersionV4 verifies that HeartbeatPayload carries schema_version=4 -// once the Spec 046 onboarding fields ship. This is a tripwire against -// accidental downgrades. -func TestSchemaVersionV4(t *testing.T) { - if SchemaVersion != 4 { - t.Fatalf("SchemaVersion = %d, want 4", SchemaVersion) +// TestSchemaVersionV5 verifies that HeartbeatPayload carries schema_version=5 +// once the MCP-2745 docker-isolation visibility fields ship. This is a tripwire +// against accidental downgrades. +func TestSchemaVersionV5(t *testing.T) { + if SchemaVersion != 5 { + t.Fatalf("SchemaVersion = %d, want 5", SchemaVersion) } cfg := &config.Config{} svc := New(cfg, "", "v1.0.0", "personal", zap.NewNop()) svc.SetRuntimeStats(&mockRuntimeStats{}) payload := svc.BuildPayload() - if payload.SchemaVersion != 4 { - t.Errorf("payload.SchemaVersion = %d, want 4", payload.SchemaVersion) + if payload.SchemaVersion != 5 { + t.Errorf("payload.SchemaVersion = %d, want 5", payload.SchemaVersion) } } @@ -345,6 +347,26 @@ func TestV3PayloadDockerAvailable(t *testing.T) { } } +// TestV5PayloadDockerCLISource verifies the telemetry service forwards the +// runtime GetDockerCLISource() resolution branch into +// feature_flags.docker_cli_source (the #696 fleet signal). +func TestV5PayloadDockerCLISource(t *testing.T) { + cfg := &config.Config{DockerIsolation: &config.DockerIsolationConfig{Enabled: true}} + + svc := New(cfg, "", "v1.0.0", "personal", zap.NewNop()) + svc.SetRuntimeStats(&mockRuntimeStats{dockerAvailable: true, dockerCLISource: "bundled"}) + p := svc.BuildPayload() + if p.FeatureFlags == nil { + t.Fatal("FeatureFlags nil") + } + if p.FeatureFlags.DockerCLISource != "bundled" { + t.Errorf("DockerCLISource = %q, want %q", p.FeatureFlags.DockerCLISource, "bundled") + } + if !p.FeatureFlags.DockerIsolationEnabled { + t.Error("DockerIsolationEnabled should be true when cfg enables global isolation") + } +} + // TestV3PayloadServerProtocolCounts verifies that the payload carries // per-protocol counts for the configured servers. func TestV3PayloadServerProtocolCounts(t *testing.T) { @@ -453,8 +475,8 @@ func TestAnonymousIDStable_V2ToV3(t *testing.T) { if p1.AnonymousID != p2.AnonymousID { t.Errorf("anonymous_id drifted between builds: %q vs %q", p1.AnonymousID, p2.AnonymousID) } - // SchemaVersion is 4 after spec 046's onboarding-funnel additions. - if p1.SchemaVersion != 4 { - t.Errorf("schema_version = %d, want 4 (spec 046 onboarding funnel)", p1.SchemaVersion) + // SchemaVersion is 5 after MCP-2745's docker-isolation visibility additions. + if p1.SchemaVersion != 5 { + t.Errorf("schema_version = %d, want 5 (MCP-2745 docker telemetry)", p1.SchemaVersion) } } diff --git a/specs/042-telemetry-tier2/contracts/heartbeat-v2.schema.json b/specs/042-telemetry-tier2/contracts/heartbeat-v2.schema.json index f78fe2f70..ed9cad4ee 100644 --- a/specs/042-telemetry-tier2/contracts/heartbeat-v2.schema.json +++ b/specs/042-telemetry-tier2/contracts/heartbeat-v2.schema.json @@ -3,6 +3,7 @@ "$id": "https://mcpproxy.app/schemas/telemetry/heartbeat-v2.json", "title": "MCPProxy Telemetry Heartbeat v2", "description": "Anonymous telemetry payload sent once per 24 hours from mcpproxy installs. Purely additive over v1: v1 fields are preserved.", + "$comment": "Frozen v2 snapshot. Later schema versions extend feature_flags forward-compatibly without appearing here: v3 added docker_available/server_protocol_counts, v5 (MCP-2745) added feature_flags.docker_isolation_enabled + feature_flags.docker_cli_source (enum path|bundled|login_shell|absent). See specs/044-retention-telemetry-v3/contracts/heartbeat-v3.json for the current shape.", "type": "object", "required": [ "schema_version", diff --git a/specs/044-retention-telemetry-v3/contracts/heartbeat-v3.json b/specs/044-retention-telemetry-v3/contracts/heartbeat-v3.json index 92a03ffe1..8d0a52830 100644 --- a/specs/044-retention-telemetry-v3/contracts/heartbeat-v3.json +++ b/specs/044-retention-telemetry-v3/contracts/heartbeat-v3.json @@ -72,7 +72,24 @@ "required": ["has_ci_env", "has_cloud_ide_env", "is_container", "has_tty", "has_display"], "additionalProperties": false, "description": "Booleans only. The payload builder self-check MUST reject any non-boolean value here." + }, + "feature_flags": { + "type": "object", + "additionalProperties": true, + "description": "Passthrough of the v2/v3 feature_flags object. Only the schema-v5 (MCP-2745) docker-isolation additions are enumerated here; older fields pass through.", + "properties": { + "docker_isolation_enabled": { + "type": "boolean", + "description": "Schema v5 (MCP-2745): global docker_isolation.enabled flag. Distinguishes 'isolation on, 0 matching servers' from 'isolation off'." + }, + "docker_cli_source": { + "type": "string", + "enum": ["path", "bundled", "login_shell", "absent"], + "description": "Schema v5 (MCP-2745): how the docker CLI was resolved (the #696 fleet signal). Fixed enum — NEVER the path string." + } + } } }, - "additionalProperties": true + "additionalProperties": true, + "$comment": "Schema v5 (MCP-2745) extends this forward-compatibly: feature_flags.docker_isolation_enabled + feature_flags.docker_cli_source, and three new error_code_counts_24h rows MCPX_DOCKER_CLI_NOT_FOUND / MCPX_DOCKER_EXEC_NOT_FOUND / MCPX_DOCKER_OCI_RUNTIME. Existing v3/v4 consumers ignore them." } From 4c3fbc05f1fa86db686cbe58458b0ae81e38dfe6 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Wed, 17 Jun 2026 11:55:35 +0300 Subject: [PATCH 2/6] test(httpapi): add GetDockerCLISource to fakeRuntimeStats for schema v5 The RuntimeStats interface gained GetDockerCLISource() (MCP-2745); the httpapi telemetry payload test's fake stub must implement it or the package fails to typecheck under the full-repo CI lint. Related #696 Co-Authored-By: Paperclip --- internal/httpapi/telemetry_payload_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/httpapi/telemetry_payload_test.go b/internal/httpapi/telemetry_payload_test.go index c8b48a985..3258b2ee0 100644 --- a/internal/httpapi/telemetry_payload_test.go +++ b/internal/httpapi/telemetry_payload_test.go @@ -36,6 +36,7 @@ func (fakeRuntimeStats) GetRoutingMode() string { return "retrieve_to func (fakeRuntimeStats) IsQuarantineEnabled() bool { return true } func (fakeRuntimeStats) IsDockerAvailable() bool { return false } func (fakeRuntimeStats) GetDockerIsolatedServerCount() int { return 0 } +func (fakeRuntimeStats) GetDockerCLISource() string { return "absent" } func TestHandleGetTelemetryPayload_OK(t *testing.T) { logger := zap.NewNop().Sugar() From 6dcbba2f8bf86c993b0058a5b367d8b086e2b230 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Wed, 17 Jun 2026 12:20:35 +0300 Subject: [PATCH 3/6] fix(shellwrap): set docker source on stat-probe cache upgrade (Codex P2) ResolveDockerPath's MCP-2744 stat-probe override (upgrading a live cached negative to a permanent success when the well-known-path probe finds docker) updated dockerPath/dockerPathErr but not dockerPathSource. A stale 'absent' from the prior failed resolution then leaked into docker_cli_source telemetry (schema v5) even though docker WAS resolved via the bundled path. Set dockerPathSource = DockerSourceBundled in that branch. Regression test asserts a cached negative upgraded via the stat probe reports 'bundled', not 'absent' (fails without the one-line fix). Related #696 Co-Authored-By: Paperclip --- internal/shellwrap/shellwrap.go | 5 ++++ internal/shellwrap/shellwrap_test.go | 41 ++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/internal/shellwrap/shellwrap.go b/internal/shellwrap/shellwrap.go index 55a44be88..1eeb1d991 100644 --- a/internal/shellwrap/shellwrap.go +++ b/internal/shellwrap/shellwrap.go @@ -245,6 +245,11 @@ func ResolveDockerPath(logger *zap.Logger) (string, error) { !dockerPathExpires.IsZero() && time.Now().Before(dockerPathExpires) { if p := probeWellKnownDocker(logger); p != "" { dockerPath = p + // The override resolves via the well-known-path probe, so the + // source is "bundled". Must be set here too, else a stale "absent" + // from the prior failed resolution leaks into docker_cli_source + // telemetry on the next ResolveDockerSource call (schema v5). + dockerPathSource = DockerSourceBundled dockerPathErr = nil dockerPathExpires = time.Time{} return p, nil diff --git a/internal/shellwrap/shellwrap_test.go b/internal/shellwrap/shellwrap_test.go index 1d23400c0..ad5e49eee 100644 --- a/internal/shellwrap/shellwrap_test.go +++ b/internal/shellwrap/shellwrap_test.go @@ -352,6 +352,47 @@ func TestResolveDockerSource(t *testing.T) { }) } +// TestResolveDockerSource_StatProbeUpgradeReportsBundled is the regression +// guard for the MCP-2744 cache-upgrade path: when a live cached NEGATIVE is +// overridden by the well-known-path stat probe, the cached source must be +// upgraded to "bundled" too — otherwise the stale "absent" leaks into the +// schema-v5 docker_cli_source telemetry even though docker WAS resolved. +func TestResolveDockerSource_StatProbeUpgradeReportsBundled(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("unix-only fixture") + } + resetDockerPathCacheForTest() + t.Cleanup(resetDockerPathCacheForTest) + + // FIRST resolution fails everywhere → caches a live negative whose source + // is "absent". Keep the negative TTL at its default so it stays live. + prevPaths := wellKnownDockerPathsFn + wellKnownDockerPathsFn = func() []string { return nil } + t.Cleanup(func() { wellKnownDockerPathsFn = prevPaths }) + + t.Setenv("PATH", t.TempDir()) + shellDir := t.TempDir() + t.Setenv("SHELL", writeFakeShell(t, shellDir, "")) + + _, err := ResolveDockerPath(nil) + require.Error(t, err, "first call should cache a negative (docker absent everywhere)") + require.Equal(t, DockerSourceAbsent, ResolveDockerSource(nil), "cached negative reports absent") + + // Docker now appears at a well-known path. The stat-probe override must + // upgrade BOTH the path and the source on the next call. + dockerDir := t.TempDir() + want := writeFakeDocker(t, dockerDir) + wellKnownDockerPathsFn = func() []string { return []string{want} } + + got, err := ResolveDockerPath(nil) + require.NoError(t, err, "stat probe must override the live negative") + require.Equal(t, want, got) + // The bug: this returned "absent" before the fix because the upgrade path + // never updated dockerPathSource. + require.Equal(t, DockerSourceBundled, ResolveDockerSource(nil), + "source must upgrade to bundled when the stat probe overrides the cached negative") +} + func TestMinimalEnv_DropsSecrets(t *testing.T) { t.Setenv("AWS_ACCESS_KEY_ID", "AKIA_test_dummy_value_00000000") t.Setenv("GITHUB_TOKEN", "ghp_dummy_test_token_1234567890abcdef") From 71c1b4ec639a9232a6fc97da7f4beab765a736fa Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Wed, 17 Jun 2026 14:08:44 +0300 Subject: [PATCH 4/6] fix(diagnostics): classify zsh 'command not found: docker' as CLI-not-found (Codex P2 r3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit zsh's login-shell wording is reversed — 'zsh:1: command not found: docker' (name AFTER the colon), not bash/sh's 'docker: command not found'. The CLI-not- found case only matched the bash shape, so the common macOS #696 scenario fell through to the generic 'command not found' branch and recorded MCPX_DOCKER_EXEC_NOT_FOUND, conflating host-CLI-missing (#696) with in-container exec ENOENT. Match 'command not found: docker' in both the isolation-spawn helper and the unhinted classifyDocker fallback. Added a classifier table case for the zsh wording (verified red→green: routed to EXEC_NOT_FOUND without the fix, CLI_NOT_FOUND with it). Related #696 Co-Authored-By: Paperclip --- internal/diagnostics/classifier.go | 7 ++++++- internal/diagnostics/classifier_domains_test.go | 9 +++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/internal/diagnostics/classifier.go b/internal/diagnostics/classifier.go index 7f688063c..8f78ebc3e 100644 --- a/internal/diagnostics/classifier.go +++ b/internal/diagnostics/classifier.go @@ -125,6 +125,7 @@ func classifyDocker(err error, _ ClassifierHints) Code { case strings.Contains(msg, "docker not found in path"), strings.Contains(msg, "docker not found in login shell"), strings.Contains(msg, "docker: command not found"), + strings.Contains(msg, "command not found: docker"), // zsh: "zsh:1: command not found: docker" strings.Contains(msg, "docker: not found"), strings.Contains(msg, `"docker": executable file not found`): return DockerCLINotFound @@ -192,9 +193,13 @@ func classifyDockerIsolatedSpawn(err error) Code { msg := strings.ToLower(err.Error()) switch { // (1) docker binary unresolved: shellwrap resolution failure, or the shell - // / Go exec layer reporting `docker` itself missing. + // / Go exec layer reporting `docker` itself missing. Cover both shell + // wordings: bash/sh `docker: command not found` AND zsh's reversed + // `zsh:1: command not found: docker` (the common macOS login-shell shape) — + // the latter must beat the generic "command not found" → EXEC case below. case strings.Contains(msg, `"docker": executable file not found`), strings.Contains(msg, "docker: command not found"), + strings.Contains(msg, "command not found: docker"), strings.Contains(msg, "docker: not found"), strings.Contains(msg, "docker not found in path"), strings.Contains(msg, "docker not found in login shell"): diff --git a/internal/diagnostics/classifier_domains_test.go b/internal/diagnostics/classifier_domains_test.go index 84a262283..5af627cfc 100644 --- a/internal/diagnostics/classifier_domains_test.go +++ b/internal/diagnostics/classifier_domains_test.go @@ -97,6 +97,15 @@ func TestClassify_Docker_IsolationSpawn(t *testing.T) { hint: ClassifierHints{Transport: "stdio", DockerIsolated: true}, want: DockerCLINotFound, }, + { + // #696 via zsh login shell: the common macOS shape is the REVERSED + // wording `zsh:1: command not found: docker` (name after the colon), + // which must still classify as CLI-not-found, not in-container EXEC. + name: "cli_not_found_zsh_reversed", + err: errors.New("stdio transport (docker_isolation=true): recent stderr: zsh:1: command not found: docker"), + hint: ClassifierHints{Transport: "stdio", DockerIsolated: true}, + want: DockerCLINotFound, + }, { // shellwrap resolution failure surfaces this even without the hint. name: "cli_not_found_resolve", From 89cf2619fd467a59310b45f37266222a398c0c71 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Wed, 17 Jun 2026 14:27:04 +0300 Subject: [PATCH 5/6] fix(shellwrap): unify ResolveDockerSource with ResolveDockerPath cache path (Codex P2 r4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ResolveDockerSource honored a live cached negative by returning 'absent' WITHOUT running the well-known-path stat probe that ResolveDockerPath uses during the negative-TTL window — so docker_cli_source reported 'absent' while ResolveDockerPath returned the bundled path (schema-v5 telemetry mismatch). Root-cause the recurring divergence (this is the 2nd source-tracking drift between the two functions) by extracting a single shared resolveDockerPathLocked helper that is the only writer of the cache + dockerPathSource. ResolveDockerPath and ResolveDockerSource now drive the identical cache path, including the MCP-2744 override, so they cannot disagree on the same cache state. Regression test asserts ResolveDockerSource reports 'bundled' (not 'absent') when a well-known docker appears during the negative-TTL window — verified red->green against the shared-resolver refactor. Related #696 Co-Authored-By: Paperclip --- internal/shellwrap/shellwrap.go | 38 +++++++++++--------------- internal/shellwrap/shellwrap_test.go | 40 ++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/internal/shellwrap/shellwrap.go b/internal/shellwrap/shellwrap.go index 1eeb1d991..0598ae670 100644 --- a/internal/shellwrap/shellwrap.go +++ b/internal/shellwrap/shellwrap.go @@ -228,7 +228,16 @@ func probeWellKnownDocker(logger *zap.Logger) string { func ResolveDockerPath(logger *zap.Logger) (string, error) { dockerPathMu.Lock() defer dockerPathMu.Unlock() + return resolveDockerPathLocked(logger) +} +// resolveDockerPathLocked is the single cache-aware resolver. Callers MUST hold +// dockerPathMu. It is the one place the docker-path cache (path, err, expiry) +// AND the parallel dockerPathSource enum are written, so ResolveDockerPath and +// ResolveDockerSource can never diverge on the same cache state — including the +// MCP-2744 stat-probe override. (Earlier the two functions duplicated this +// logic and the source-tracking drifted between them.) +func resolveDockerPathLocked(logger *zap.Logger) (string, error) { // Honor cache: keep successful resolutions forever. if dockerPathHasResult && dockerPathErr == nil { return dockerPath, nil @@ -273,33 +282,18 @@ func ResolveDockerPath(logger *zap.Logger) (string, error) { // ResolveDockerSource returns the coarse, fixed-enum label describing how the // docker CLI was resolved (DockerSourcePath / DockerSourceBundled / // DockerSourceLoginShell), or DockerSourceAbsent when docker cannot be found. -// It shares the same cache as ResolveDockerPath, so a prior successful or -// failed resolution is reused (honoring the negative TTL). Never returns the -// resolved path — only the branch — so callers (telemetry) cannot leak it. +// It drives the SAME cache path as ResolveDockerPath (via resolveDockerPathLocked), +// so the reported source always matches the resolution ResolveDockerPath would +// give for the current cache state — including the MCP-2744 stat-probe override +// during the negative-TTL window. Never returns the resolved path — only the +// branch — so callers (telemetry) cannot leak it. func ResolveDockerSource(logger *zap.Logger) string { dockerPathMu.Lock() defer dockerPathMu.Unlock() - - if dockerPathHasResult { - if dockerPathErr == nil { - return sourceOrAbsent(dockerPathSource) - } - if !dockerPathExpires.IsZero() && time.Now().Before(dockerPathExpires) { - return DockerSourceAbsent - } - } - - path, source, err := resolveDockerPathUncached(logger) - dockerPath = path - dockerPathSource = source - dockerPathErr = err - dockerPathHasResult = true - if err != nil { - dockerPathExpires = time.Now().Add(dockerPathNegativeTTL) + if _, err := resolveDockerPathLocked(logger); err != nil { return DockerSourceAbsent } - dockerPathExpires = time.Time{} - return sourceOrAbsent(source) + return sourceOrAbsent(dockerPathSource) } // sourceOrAbsent normalizes an empty source string to DockerSourceAbsent so the diff --git a/internal/shellwrap/shellwrap_test.go b/internal/shellwrap/shellwrap_test.go index ad5e49eee..44b207c46 100644 --- a/internal/shellwrap/shellwrap_test.go +++ b/internal/shellwrap/shellwrap_test.go @@ -393,6 +393,46 @@ func TestResolveDockerSource_StatProbeUpgradeReportsBundled(t *testing.T) { "source must upgrade to bundled when the stat probe overrides the cached negative") } +// TestResolveDockerSource_NegativeTTLProbeOverride is the Codex round-4 +// regression guard: ResolveDockerSource must apply the same well-known-path +// stat-probe override that ResolveDockerPath does during the negative-TTL +// window. Before the shared-resolver refactor, ResolveDockerSource returned +// "absent" off the cached negative without ever probing, so docker_cli_source +// reported the wrong source while ResolveDockerPath returned the bundled path. +func TestResolveDockerSource_NegativeTTLProbeOverride(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("unix-only fixture") + } + resetDockerPathCacheForTest() + t.Cleanup(resetDockerPathCacheForTest) + + // FIRST resolution fails everywhere → caches a LIVE negative (default TTL). + prevPaths := wellKnownDockerPathsFn + wellKnownDockerPathsFn = func() []string { return nil } + t.Cleanup(func() { wellKnownDockerPathsFn = prevPaths }) + + t.Setenv("PATH", t.TempDir()) + shellDir := t.TempDir() + t.Setenv("SHELL", writeFakeShell(t, shellDir, "")) + + _, err := ResolveDockerPath(nil) + require.Error(t, err, "first call should cache a live negative") + + // Docker now appears at a well-known path. The very next ResolveDockerSource + // call must run the stat probe (not honor the cached "absent") and report + // bundled — without any TTL expiry. + dockerDir := t.TempDir() + want := writeFakeDocker(t, dockerDir) + wellKnownDockerPathsFn = func() []string { return []string{want} } + + require.Equal(t, DockerSourceBundled, ResolveDockerSource(nil), + "ResolveDockerSource must apply the stat-probe override during the negative-TTL window") + // And ResolveDockerPath agrees (cache was upgraded to a permanent success). + got, err := ResolveDockerPath(nil) + require.NoError(t, err) + require.Equal(t, want, got) +} + func TestMinimalEnv_DropsSecrets(t *testing.T) { t.Setenv("AWS_ACCESS_KEY_ID", "AKIA_test_dummy_value_00000000") t.Setenv("GITHUB_TOKEN", "ghp_dummy_test_token_1234567890abcdef") From 60dfe13e895d86da163624827d069609a0b00de1 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Wed, 17 Jun 2026 14:59:21 +0300 Subject: [PATCH 6/6] fix(diagnostics): don't label non-docker 'exec format error' as OCI (Codex P2 r5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The unhinted classifyDocker matched a BARE 'exec format error' as MCPX_DOCKER_OCI_RUNTIME, but a non-docker wrong-architecture host stdio binary emits the same string (ENOEXEC) — it would be mislabeled a Docker/OCI failure. - classifyDocker OCI case now requires real OCI/runc context ('oci runtime' / 'runc'); bare 'exec format error' no longer matches there. - classifyStdio routes a bare 'exec format error' (and typed syscall.ENOEXEC) to a new STDIO code MCPX_STDIO_SPAWN_EXEC_FORMAT, guarded against docker OCI context so a real containerized exec-format still falls through to OCI. - The docker-isolated path keeps classifying bare 'exec format error' as OCI via the hinted classifyDockerIsolatedSpawn. - New catalog entry for MCPX_STDIO_SPAWN_EXEC_FORMAT. Classifier table cases: bare exec-format WITH hint -> OCI; bare exec-format WITHOUT hint -> STDIO (verified red MCPX_DOCKER_OCI_RUNTIME -> green MCPX_STDIO_SPAWN_EXEC_FORMAT); 'oci runtime' context without hint -> OCI. Related #696 Co-Authored-By: Paperclip --- internal/diagnostics/classifier.go | 17 ++++++++++-- .../diagnostics/classifier_domains_test.go | 26 +++++++++++++++++++ internal/diagnostics/codes.go | 9 +++++-- internal/diagnostics/registry.go | 10 +++++++ 4 files changed, 58 insertions(+), 4 deletions(-) diff --git a/internal/diagnostics/classifier.go b/internal/diagnostics/classifier.go index 8f78ebc3e..2b0245846 100644 --- a/internal/diagnostics/classifier.go +++ b/internal/diagnostics/classifier.go @@ -129,9 +129,12 @@ func classifyDocker(err error, _ ClassifierHints) Code { strings.Contains(msg, "docker: not found"), strings.Contains(msg, `"docker": executable file not found`): return DockerCLINotFound - // OCI runtime / architecture-mismatch failures from `docker run`. + // OCI runtime failures from `docker run`. NOTE: a BARE "exec format error" + // is intentionally NOT matched here — a non-docker, wrong-architecture host + // stdio binary emits the same string and must stay STDIO-classified. The + // docker-isolated path routes bare "exec format error" via the hinted + // classifyDockerIsolatedSpawn; here we require real OCI/runc context. case strings.Contains(msg, "oci runtime"), - strings.Contains(msg, "exec format error"), strings.Contains(msg, "runc"): return DockerOCIRuntime } @@ -240,6 +243,9 @@ func classifyStdio(err error, hints ClassifierHints) Code { if errors.Is(execErr.Err, syscall.EACCES) { return STDIOSpawnEACCES } + if errors.Is(execErr.Err, syscall.ENOEXEC) { + return STDIOSpawnExecFormat + } } // exec.ExitError — process started but exited non-zero during handshake. @@ -266,6 +272,13 @@ func classifyStdio(err error, hints ClassifierHints) Code { msg := err.Error() lmsg := strings.ToLower(msg) switch { + // Wrong-arch / non-executable host binary (ENOEXEC). Guarded against + // docker OCI context ("oci runtime"/"runc") so a real containerized + // exec-format failure still falls through to classifyDocker → OCI; a + // BARE "exec format error" is a host stdio problem, not a Docker one. + case strings.Contains(lmsg, "exec format error") && + !strings.Contains(lmsg, "oci runtime") && !strings.Contains(lmsg, "runc"): + return STDIOSpawnExecFormat case strings.Contains(lmsg, "no such file or directory"), strings.Contains(lmsg, "executable file not found"), strings.Contains(lmsg, "command not found"): diff --git a/internal/diagnostics/classifier_domains_test.go b/internal/diagnostics/classifier_domains_test.go index 5af627cfc..eeded868e 100644 --- a/internal/diagnostics/classifier_domains_test.go +++ b/internal/diagnostics/classifier_domains_test.go @@ -127,6 +127,32 @@ func TestClassify_Docker_IsolationSpawn(t *testing.T) { hint: ClassifierHints{Transport: "stdio", DockerIsolated: true}, want: DockerOCIRuntime, }, + { + // Bare "exec format error" WITH the isolation hint → OCI (wrong-arch + // image under docker isolation). + name: "bare_exec_format_isolated", + err: errors.New("stdio transport (docker_isolation=true): recent stderr: exec format error"), + hint: ClassifierHints{Transport: "stdio", DockerIsolated: true}, + want: DockerOCIRuntime, + }, + { + // Bare "exec format error" WITHOUT the isolation hint must stay + // STDIO-classified (a non-docker wrong-arch host binary), NOT a + // Docker code. Codex round-5 regression. + name: "bare_exec_format_no_hint_stays_stdio", + err: errors.New("failed to spawn stdio server: recent stderr: exec format error"), + hint: ClassifierHints{Transport: "stdio"}, + want: STDIOSpawnExecFormat, + }, + { + // A real docker OCI error that lacks the hint but carries "oci + // runtime" context still classifies as OCI (not STDIO), via the + // classifyDocker fallback. + name: "oci_context_no_hint", + err: errors.New("oci runtime create failed: exec format error"), + hint: ClassifierHints{Transport: "stdio"}, + want: DockerOCIRuntime, + }, { // Same ENOENT string WITHOUT the isolation hint stays a plain stdio // spawn failure — no false DOCKER attribution for host stdio servers. diff --git a/internal/diagnostics/codes.go b/internal/diagnostics/codes.go index 6c6b1a78b..302bb1989 100644 --- a/internal/diagnostics/codes.go +++ b/internal/diagnostics/codes.go @@ -8,8 +8,13 @@ package diagnostics // STDIO domain — stdio-transport MCP server failures. const ( - STDIOSpawnENOENT Code = "MCPX_STDIO_SPAWN_ENOENT" - STDIOSpawnEACCES Code = "MCPX_STDIO_SPAWN_EACCES" + STDIOSpawnENOENT Code = "MCPX_STDIO_SPAWN_ENOENT" + STDIOSpawnEACCES Code = "MCPX_STDIO_SPAWN_EACCES" + // STDIOSpawnExecFormat: the stdio binary exists but is the wrong CPU + // architecture / not an executable format (ENOEXEC — "exec format error"). + // Distinct from a Docker/OCI exec-format failure, which is + // MCPX_DOCKER_OCI_RUNTIME under the docker-isolation hint. + STDIOSpawnExecFormat Code = "MCPX_STDIO_SPAWN_EXEC_FORMAT" STDIOExitNonzero Code = "MCPX_STDIO_EXIT_NONZERO" STDIOExitBeforeInitialize Code = "MCPX_STDIO_EXIT_BEFORE_INITIALIZE" STDIOHandshakeTimeout Code = "MCPX_STDIO_HANDSHAKE_TIMEOUT" diff --git a/internal/diagnostics/registry.go b/internal/diagnostics/registry.go index 29efde178..c3e6f7a2d 100644 --- a/internal/diagnostics/registry.go +++ b/internal/diagnostics/registry.go @@ -51,6 +51,16 @@ func seedSTDIO() { }, DocsURL: docsURL(STDIOSpawnEACCES), }) + register(CatalogEntry{ + Code: STDIOSpawnExecFormat, + Severity: SeverityError, + UserMessage: "The configured command is the wrong CPU architecture or not an executable (exec format error). Install a build that matches this machine.", + FixSteps: []FixStep{ + {Type: FixStepCommand, Label: "Check the binary's architecture", Command: "file "}, + {Type: FixStepLink, Label: "Install a matching build", URL: docsURL(STDIOSpawnExecFormat)}, + }, + DocsURL: docsURL(STDIOSpawnExecFormat), + }) register(CatalogEntry{ Code: STDIOExitNonzero, Severity: SeverityError,