From f3f6b6291b12de33cda22e9ad97657bd95911cc9 Mon Sep 17 00:00:00 2001 From: st-gr <38470677+st-gr@users.noreply.github.com> Date: Fri, 5 Jun 2026 03:19:57 -0700 Subject: [PATCH 1/4] feat(core): add Config.external_compute_driver_socket field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Carries the path to an out-of-tree compute driver's Unix domain socket. When set, the gateway will dispatch sandbox lifecycle to that driver instead of one of the in-tree backends, taking precedence over `compute_drivers` and auto-detection. The field lives on `Config` rather than as a `ComputeDriverKind` variant so the four in-tree variants (`Kubernetes`, `Vm`, `Docker`, `Podman`) keep their flat shape and the enum stays `Copy`. Plumbing only — the dispatch wiring lands in a follow-up commit. Signed-off-by: st-gr <38470677+st-gr@users.noreply.github.com> --- crates/openshell-core/src/config.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index 04d6928da..5645923d6 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -361,6 +361,12 @@ pub struct Config { /// configured driver. pub compute_drivers: Vec, + /// When set, the gateway dispatches sandbox lifecycle to an out-of-tree + /// compute driver process listening on this Unix domain socket and + /// speaking `compute_driver.proto`. Takes precedence over + /// `compute_drivers` and the auto-detection probe. + pub external_compute_driver_socket: Option, + /// TTL for SSH session tokens, in seconds. 0 disables expiry. 
pub ssh_session_ttl_secs: u64, @@ -546,6 +552,7 @@ impl Config { gateway_jwt: None, database_url: String::new(), compute_drivers: vec![], + external_compute_driver_socket: None, ssh_session_ttl_secs: default_ssh_session_ttl_secs(), service_routing: ServiceRoutingConfig::default(), } @@ -607,6 +614,13 @@ impl Config { self } + /// Pin an external compute driver by Unix domain socket path. + #[must_use] + pub fn with_external_compute_driver_socket(mut self, socket: Option) -> Self { + self.external_compute_driver_socket = socket; + self + } + /// Create a new configuration with the SSH session TTL. #[must_use] pub const fn with_ssh_session_ttl_secs(mut self, secs: u64) -> Self { From 078f36daef47a8279dc5aefe29e66ea1cb126b49 Mon Sep 17 00:00:00 2001 From: st-gr <38470677+st-gr@users.noreply.github.com> Date: Fri, 5 Jun 2026 03:37:23 -0700 Subject: [PATCH 2/4] feat(server): add --compute-driver-socket CLI flag for external driver Adds an optional `--compute-driver-socket=` flag (env `OPENSHELL_COMPUTE_DRIVER_SOCKET`) on `RunArgs`, plumbed through to `Config.external_compute_driver_socket`. The flag activates dispatch to an out-of-tree compute driver listening on a Unix domain socket and takes precedence over both the `--drivers` list and the auto-detection probe; `effective_single_driver` returns `None` when set so callers keyed off `ComputeDriverKind` skip the in-tree match. Tests cover flag parsing, env-var fallback, and the override of `--drivers`. The driver name advertised in `GetCapabilities` is logged for diagnostics in a follow-up commit that wires the channel. 
Signed-off-by: st-gr <38470677+st-gr@users.noreply.github.com> --- crates/openshell-server/src/cli.rs | 81 ++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index 748cec264..c98587760 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -111,6 +111,16 @@ struct RunArgs { )] drivers: Vec, + /// Path to a Unix domain socket served by an out-of-tree compute driver + /// implementing `compute_driver.proto`. + /// + /// When set, the gateway dispatches sandbox lifecycle to that driver + /// instead of one of the in-tree backends, skipping both the `--drivers` + /// list and the auto-detection probe. The driver name advertised in + /// `GetCapabilities` is logged for diagnostics. + #[arg(long, env = "OPENSHELL_COMPUTE_DRIVER_SOCKET")] + compute_driver_socket: Option, + /// Disable TLS entirely — listen on plaintext HTTP. /// Use this when the gateway sits behind a reverse proxy or tunnel /// (e.g. Cloudflare Tunnel) that terminates TLS at the edge. @@ -353,6 +363,7 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { config = config .with_database_url(db_url) .with_compute_drivers(args.drivers.clone()) + .with_external_compute_driver_socket(args.compute_driver_socket.clone()) .with_server_sans(args.server_sans.clone()) .with_loopback_service_http(args.enable_loopback_service_http); @@ -611,6 +622,12 @@ fn merge_file_into_args(args: &mut RunArgs, file: &GatewayFileSection, matches: } fn effective_single_driver(args: &RunArgs) -> Option { + // An external-driver socket pins dispatch to the out-of-tree path and + // bypasses both the `--drivers` list and auto-detection probe; callers + // that key off the in-tree `ComputeDriverKind` get `None` here. 
+ if args.compute_driver_socket.is_some() { + return None; + } match args.drivers.as_slice() { [] => openshell_core::config::detect_driver(), [driver] => Some(*driver), @@ -1440,6 +1457,70 @@ ssh_session_ttl_secs = 1234 assert!(!super::is_singleplayer_driver(&multi)); } + #[test] + fn compute_driver_socket_flag_populates_run_args() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::remove("OPENSHELL_COMPUTE_DRIVER_SOCKET"); + + let (args, _) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--compute-driver-socket", + "/run/openshell/external.sock", + ]); + assert_eq!( + args.compute_driver_socket.as_deref(), + Some(std::path::Path::new("/run/openshell/external.sock")) + ); + // External socket pins dispatch off the in-tree enum, so the + // single-driver helper must return None even when no --drivers given. + assert!(super::effective_single_driver(&args).is_none()); + } + + #[test] + fn compute_driver_socket_overrides_drivers_flag() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::remove("OPENSHELL_COMPUTE_DRIVER_SOCKET"); + + let (args, _) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--drivers", + "docker", + "--compute-driver-socket", + "/run/openshell/external.sock", + ]); + assert!( + super::effective_single_driver(&args).is_none(), + "external socket must short-circuit --drivers" + ); + } + + #[test] + fn compute_driver_socket_reads_from_env_var() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::set( + "OPENSHELL_COMPUTE_DRIVER_SOCKET", + "/var/run/openshell/external.sock", + ); + + let (args, _) = parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + assert_eq!( + args.compute_driver_socket.as_deref(), + Some(std::path::Path::new( + "/var/run/openshell/external.sock" + )) + ); + } 
+ #[test] fn file_populates_service_routing_fields() { let _lock = ENV_LOCK From fb553a4e871def9d4c332e71fd92d1067a83127d Mon Sep 17 00:00:00 2001 From: st-gr <38470677+st-gr@users.noreply.github.com> Date: Fri, 5 Jun 2026 03:52:17 -0700 Subject: [PATCH 3/4] feat(server): connect external compute driver via acquired UDS endpoint Splits ComputeRuntime construction so that the gateway can dispatch to an out-of-tree compute driver process listening on a Unix domain socket the operator already owns, without adding a fifth `ComputeDriverKind` variant. * `ComputeRuntime::from_driver` now takes `Option<ComputeDriverKind>`; the four in-tree constructors wrap their kind in `Some(...)`. Out-of-tree drivers pass `None` so callers keyed off the enum skip the in-tree match. * `connect_external_compute_driver` produces a tonic `Channel` over a pre-existing UDS path, mirroring the vm driver's connector. A `#[cfg(not(unix))]` stub returns the same error the vm path uses. * `ComputeRuntime::new_remote_external` consumes the channel via the existing `RemoteComputeDriver` proxy and skips both shutdown cleanup and managed-process supervision (the operator owns the lifecycle). * `from_driver` logs the `driver_name` advertised by `GetCapabilities` whenever `driver_kind` is `None`, so operators can confirm the gateway connected to the driver they expect. * `build_compute_runtime` short-circuits to the external path when `Config.external_compute_driver_socket` is set, before consulting `--drivers` / auto-detect. The four in-tree backends (Kubernetes, Vm, Docker, Podman) keep their existing dispatch arms unchanged. 
Signed-off-by: st-gr <38470677+st-gr@users.noreply.github.com> --- crates/openshell-server/src/compute/mod.rs | 107 ++++++++++++++++++--- crates/openshell-server/src/lib.rs | 20 ++++ 2 files changed, 115 insertions(+), 12 deletions(-) diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index d6c1966e7..214ba1d5a 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -35,15 +35,20 @@ use openshell_driver_kubernetes::{ use openshell_driver_podman::{ ComputeDriverService as PodmanDriverService, PodmanComputeConfig, PodmanComputeDriver, }; +use hyper_util::rt::TokioIo; use prost::Message; use std::fmt; use std::net::SocketAddr; +use std::path::{Path, PathBuf}; use std::pin::Pin; use std::sync::Arc; use std::time::Duration; use tokio::sync::Mutex; -use tonic::transport::Channel; +#[cfg(unix)] +use tokio::net::UnixStream; +use tonic::transport::{Channel, Endpoint}; use tonic::{Code, Request, Status}; +use tower::service_fn; use tracing::{info, warn}; type DriverWatchStream = Pin> + Send>>; @@ -101,11 +106,11 @@ pub use openshell_core::ComputeDriverError as ComputeError; #[derive(Debug)] pub struct ManagedDriverProcess { child: std::sync::Mutex>, - socket_path: std::path::PathBuf, + socket_path: PathBuf, } impl ManagedDriverProcess { - pub(crate) fn new(child: tokio::process::Child, socket_path: std::path::PathBuf) -> Self { + pub(crate) fn new(child: tokio::process::Child, socket_path: PathBuf) -> Self { Self { child: std::sync::Mutex::new(Some(child)), socket_path, @@ -243,7 +248,7 @@ impl fmt::Debug for ComputeRuntime { impl ComputeRuntime { #[allow(clippy::too_many_arguments)] async fn from_driver( - driver_kind: ComputeDriverKind, + driver_kind: Option, driver: SharedComputeDriver, shutdown_cleanup: Option>, startup_resume: Option>, @@ -256,15 +261,24 @@ impl ComputeRuntime { _allows_loopback_endpoints: bool, gateway_bind_addresses: Vec, ) -> Result { - let 
default_image = driver + let capabilities = driver .get_capabilities(Request::new(GetCapabilitiesRequest {})) .await .map_err(compute_error_from_status)? - .into_inner() - .default_image; + .into_inner(); + // For out-of-tree drivers (driver_kind = None), log the name the + // driver advertises in GetCapabilities so operators can confirm + // the gateway is talking to the driver they expect. + if driver_kind.is_none() { + info!( + driver_name = %capabilities.driver_name, + "External compute driver connected" + ); + } + let default_image = capabilities.default_image; Ok(Self { driver, - driver_kind: Some(driver_kind), + driver_kind, shutdown_cleanup, startup_resume, _driver_process: driver_process, @@ -308,7 +322,7 @@ impl ComputeRuntime { let startup_resume: Arc = driver.clone(); let driver: SharedComputeDriver = driver; Self::from_driver( - ComputeDriverKind::Docker, + Some(ComputeDriverKind::Docker), driver, Some(shutdown_cleanup), Some(startup_resume), @@ -337,7 +351,7 @@ impl ComputeRuntime { .map_err(|err| ComputeError::Message(err.to_string()))?; let driver: SharedComputeDriver = Arc::new(ComputeDriverService::new(driver)); Self::from_driver( - ComputeDriverKind::Kubernetes, + Some(ComputeDriverKind::Kubernetes), driver, None, None, @@ -364,7 +378,7 @@ impl ComputeRuntime { ) -> Result { let driver: SharedComputeDriver = Arc::new(RemoteComputeDriver::new(channel)); Self::from_driver( - ComputeDriverKind::Vm, + Some(ComputeDriverKind::Vm), driver, None, None, @@ -380,6 +394,39 @@ impl ComputeRuntime { .await } + /// Construct a runtime that proxies all sandbox lifecycle to an + /// out-of-tree compute driver listening on a pre-existing UDS endpoint. + /// + /// The driver process is operator-managed (not spawned by the gateway), + /// so no [`ManagedDriverProcess`] handle is attached. The advertised + /// `driver_name` from `GetCapabilities` is logged for diagnostics by + /// [`Self::from_driver`]. 
+ pub(crate) async fn new_remote_external( + channel: Channel, + store: Arc, + sandbox_index: SandboxIndex, + sandbox_watch_bus: SandboxWatchBus, + tracing_log_bus: TracingLogBus, + supervisor_sessions: Arc, + ) -> Result { + let driver: SharedComputeDriver = Arc::new(RemoteComputeDriver::new(channel)); + Self::from_driver( + None, + driver, + None, + None, + None, + store, + sandbox_index, + sandbox_watch_bus, + tracing_log_bus, + supervisor_sessions, + true, + Vec::new(), + ) + .await + } + pub async fn new_podman( config: PodmanComputeConfig, store: Arc, @@ -393,7 +440,7 @@ impl ComputeRuntime { .map_err(|err| ComputeError::Message(err.to_string()))?; let driver: SharedComputeDriver = Arc::new(PodmanDriverService::new(driver)); Self::from_driver( - ComputeDriverKind::Podman, + Some(ComputeDriverKind::Podman), driver, None, None, @@ -1249,6 +1296,42 @@ impl ComputeRuntime { } } +/// Connect to an out-of-tree compute driver that is already listening on +/// `socket_path` and return a tonic `Channel` speaking `compute_driver.proto`. +/// +/// The gateway does not spawn or own the driver process — the operator is +/// responsible for placing the driver alongside the gateway and granting the +/// gateway uid read/write on the socket. The host portion of the URL is +/// ignored because the connector resolves to the UDS rather than DNS. 
+#[cfg(unix)] +pub async fn connect_external_compute_driver( + socket_path: &Path, +) -> Result { + let socket_path: PathBuf = socket_path.to_path_buf(); + let display_path = socket_path.clone(); + Endpoint::from_static("http://[::]:50051") + .connect_with_connector(service_fn(move |_: tonic::transport::Uri| { + let socket_path = socket_path.clone(); + async move { UnixStream::connect(socket_path).await.map(TokioIo::new) } + })) + .await + .map_err(|e| { + ComputeError::Message(format!( + "failed to connect to external compute driver socket '{}': {e}", + display_path.display() + )) + }) +} + +#[cfg(not(unix))] +pub async fn connect_external_compute_driver( + _socket_path: &Path, +) -> Result { + Err(ComputeError::Message( + "the external compute driver requires unix domain socket support".to_string(), + )) +} + fn driver_sandbox_from_public(sandbox: &Sandbox) -> DriverSandbox { DriverSandbox { id: sandbox.object_id().to_string(), diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 676e23071..12911480b 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -703,6 +703,26 @@ async fn build_compute_runtime( tracing_log_bus: TracingLogBus, supervisor_sessions: Arc, ) -> Result { + if let Some(socket_path) = config.external_compute_driver_socket.as_deref() { + info!( + socket = %socket_path.display(), + "Using external compute driver" + ); + let channel = compute::connect_external_compute_driver(socket_path) + .await + .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}")))?; + return ComputeRuntime::new_remote_external( + channel, + store, + sandbox_index, + sandbox_watch_bus, + tracing_log_bus, + supervisor_sessions, + ) + .await + .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))); + } + let driver = configured_compute_driver(config)?; info!(driver = %driver, "Using compute driver"); warn_if_kubernetes_sandbox_jwt_expiry_disabled(config, 
driver); From b1ed3629d9f5f46ba472e76e6687ed6de1a5a54c Mon Sep 17 00:00:00 2001 From: st-gr <38470677+st-gr@users.noreply.github.com> Date: Fri, 5 Jun 2026 04:07:39 -0700 Subject: [PATCH 4/4] docs(compute): document the External runtime row and supervisor cell Adds the External row to the runtime summary and supervisor delivery tables: activated by --compute-driver-socket, GetCapabilities driver_name logged for diagnostics, operator owns process and socket lifecycle, trust boundary is the socket's filesystem permissions. Also picks up rustfmt's normalization of the new imports and helper signatures introduced in the previous two commits. Signed-off-by: st-gr <38470677+st-gr@users.noreply.github.com> --- architecture/compute-runtimes.md | 2 ++ crates/openshell-server/src/cli.rs | 4 +--- crates/openshell-server/src/compute/mod.rs | 12 ++++-------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/architecture/compute-runtimes.md b/architecture/compute-runtimes.md index b70a2fccc..a8114c0b5 100644 --- a/architecture/compute-runtimes.md +++ b/architecture/compute-runtimes.md @@ -34,6 +34,7 @@ when a sandbox create request asks for GPU resources. | Podman | Rootless or single-machine deployments. | Container plus nested sandbox namespace. | Uses the Podman REST API, OCI image volumes, and CDI GPU devices when available. | | Kubernetes | Cluster deployment through Helm. | Pod plus nested sandbox namespace. | Uses Kubernetes API objects, service accounts, secrets, PVC-backed workspace storage, and GPU resources. | | VM | Experimental microVM isolation. | Per-sandbox libkrun VM. | Gateway spawns `openshell-driver-vm` as a subprocess over a private, state-local Unix socket. The VM driver boots a cached bootstrap `rootfs.ext4`, prepares requested OCI images inside a bootstrap VM with `umoci`, attaches the prepared image disk read-only, and gives each sandbox a writable `overlay.ext4` for merged-root changes and runtime material. 
The driver persists each accepted launch request beside the overlay and restarts those VMs on driver startup without recreating the overlay. | +| External | Out-of-tree drivers operated alongside the gateway. | Whatever boundary the driver implements. | Activated by `--compute-driver-socket=<path>` (env `OPENSHELL_COMPUTE_DRIVER_SOCKET`). The gateway connects to a UDS the operator already provisioned, runs `GetCapabilities`, logs the advertised `driver_name`, and dispatches all sandbox lifecycle calls through the same `compute_driver.proto` surface as the in-tree drivers. The driver process and socket lifecycle are operator-owned; the gateway does not spawn, supervise, or remove the driver. The trust boundary is the socket's filesystem permissions — the operator must ensure only the gateway uid can read/write it. | Per-sandbox CPU and memory values currently enter the driver layer through template resource limits. Docker and Podman apply them as runtime limits. @@ -68,6 +69,7 @@ The supervisor must be available inside each sandbox workload: | Podman | Read-only OCI image volume containing the supervisor binary. | | Kubernetes | Sandbox pod image or pod template configuration. | | VM | Embedded in the guest rootfs bundle. | +| External | Defined by the out-of-tree driver. 
| Driver-controlled environment variables must override sandbox image or template values for sandbox ID, sandbox name, gateway endpoint, relay socket path, TLS diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index c98587760..f5ac53919 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -1515,9 +1515,7 @@ ssh_session_ttl_secs = 1234 let (args, _) = parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); assert_eq!( args.compute_driver_socket.as_deref(), - Some(std::path::Path::new( - "/var/run/openshell/external.sock" - )) + Some(std::path::Path::new("/var/run/openshell/external.sock")) ); } diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 214ba1d5a..f0e020aeb 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -15,6 +15,7 @@ use crate::sandbox_watch::SandboxWatchBus; use crate::supervisor_session::SupervisorSessionRegistry; use crate::tracing_bus::TracingLogBus; use futures::{Stream, StreamExt}; +use hyper_util::rt::TokioIo; use openshell_core::ComputeDriverKind; use openshell_core::proto::compute::v1::{ CreateSandboxRequest, DeleteSandboxRequest, DriverCondition, DriverPlatformEvent, @@ -35,7 +36,6 @@ use openshell_driver_kubernetes::{ use openshell_driver_podman::{ ComputeDriverService as PodmanDriverService, PodmanComputeConfig, PodmanComputeDriver, }; -use hyper_util::rt::TokioIo; use prost::Message; use std::fmt; use std::net::SocketAddr; @@ -43,9 +43,9 @@ use std::path::{Path, PathBuf}; use std::pin::Pin; use std::sync::Arc; use std::time::Duration; -use tokio::sync::Mutex; #[cfg(unix)] use tokio::net::UnixStream; +use tokio::sync::Mutex; use tonic::transport::{Channel, Endpoint}; use tonic::{Code, Request, Status}; use tower::service_fn; @@ -1304,9 +1304,7 @@ impl ComputeRuntime { /// gateway uid read/write on the socket. 
The host portion of the URL is /// ignored because the connector resolves to the UDS rather than DNS. #[cfg(unix)] -pub async fn connect_external_compute_driver( - socket_path: &Path, -) -> Result { +pub async fn connect_external_compute_driver(socket_path: &Path) -> Result { let socket_path: PathBuf = socket_path.to_path_buf(); let display_path = socket_path.clone(); Endpoint::from_static("http://[::]:50051") @@ -1324,9 +1322,7 @@ pub async fn connect_external_compute_driver( } #[cfg(not(unix))] -pub async fn connect_external_compute_driver( - _socket_path: &Path, -) -> Result { +pub async fn connect_external_compute_driver(_socket_path: &Path) -> Result { Err(ComputeError::Message( "the external compute driver requires unix domain socket support".to_string(), ))