diff --git a/architecture/compute-runtimes.md b/architecture/compute-runtimes.md index b70a2fccc..a8114c0b5 100644 --- a/architecture/compute-runtimes.md +++ b/architecture/compute-runtimes.md @@ -34,6 +34,7 @@ when a sandbox create request asks for GPU resources. | Podman | Rootless or single-machine deployments. | Container plus nested sandbox namespace. | Uses the Podman REST API, OCI image volumes, and CDI GPU devices when available. | | Kubernetes | Cluster deployment through Helm. | Pod plus nested sandbox namespace. | Uses Kubernetes API objects, service accounts, secrets, PVC-backed workspace storage, and GPU resources. | | VM | Experimental microVM isolation. | Per-sandbox libkrun VM. | Gateway spawns `openshell-driver-vm` as a subprocess over a private, state-local Unix socket. The VM driver boots a cached bootstrap `rootfs.ext4`, prepares requested OCI images inside a bootstrap VM with `umoci`, attaches the prepared image disk read-only, and gives each sandbox a writable `overlay.ext4` for merged-root changes and runtime material. The driver persists each accepted launch request beside the overlay and restarts those VMs on driver startup without recreating the overlay. | +| External | Out-of-tree drivers operated alongside the gateway. | Whatever boundary the driver implements. | Activated by `--compute-driver-socket=<path>` (env `OPENSHELL_COMPUTE_DRIVER_SOCKET`). The gateway connects to a Unix domain socket (UDS) that the operator has already provisioned, runs `GetCapabilities`, logs the advertised `driver_name`, and dispatches all sandbox lifecycle calls through the same `compute_driver.proto` surface as the in-tree drivers. The driver process and socket lifecycle are operator-owned; the gateway does not spawn, supervise, or remove the driver. The trust boundary is the socket's filesystem permissions — the operator must ensure only the gateway uid can read/write it. | Per-sandbox CPU and memory values currently enter the driver layer through template resource limits. 
Docker and Podman apply them as runtime limits. @@ -68,6 +69,7 @@ The supervisor must be available inside each sandbox workload: | Podman | Read-only OCI image volume containing the supervisor binary. | | Kubernetes | Sandbox pod image or pod template configuration. | | VM | Embedded in the guest rootfs bundle. | +| External | Defined by the out-of-tree driver. | Driver-controlled environment variables must override sandbox image or template values for sandbox ID, sandbox name, gateway endpoint, relay socket path, TLS diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index 04d6928da..5645923d6 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -361,6 +361,12 @@ pub struct Config { /// configured driver. pub compute_drivers: Vec, + /// When set, the gateway dispatches sandbox lifecycle to an out-of-tree + /// compute driver process listening on this Unix domain socket and + /// speaking `compute_driver.proto`. Takes precedence over + /// `compute_drivers` and the auto-detection probe. + pub external_compute_driver_socket: Option, + /// TTL for SSH session tokens, in seconds. 0 disables expiry. pub ssh_session_ttl_secs: u64, @@ -546,6 +552,7 @@ impl Config { gateway_jwt: None, database_url: String::new(), compute_drivers: vec![], + external_compute_driver_socket: None, ssh_session_ttl_secs: default_ssh_session_ttl_secs(), service_routing: ServiceRoutingConfig::default(), } @@ -607,6 +614,13 @@ impl Config { self } + /// Pin an external compute driver by Unix domain socket path. + #[must_use] + pub fn with_external_compute_driver_socket(mut self, socket: Option) -> Self { + self.external_compute_driver_socket = socket; + self + } + /// Create a new configuration with the SSH session TTL. 
#[must_use] pub const fn with_ssh_session_ttl_secs(mut self, secs: u64) -> Self { diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index 748cec264..f5ac53919 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -111,6 +111,16 @@ struct RunArgs { )] drivers: Vec, + /// Path to a Unix domain socket served by an out-of-tree compute driver + /// implementing `compute_driver.proto`. + /// + /// When set, the gateway dispatches sandbox lifecycle to that driver + /// instead of one of the in-tree backends, skipping both the `--drivers` + /// list and the auto-detection probe. The driver name advertised in + /// `GetCapabilities` is logged for diagnostics. + #[arg(long, env = "OPENSHELL_COMPUTE_DRIVER_SOCKET")] + compute_driver_socket: Option, + /// Disable TLS entirely — listen on plaintext HTTP. /// Use this when the gateway sits behind a reverse proxy or tunnel /// (e.g. Cloudflare Tunnel) that terminates TLS at the edge. @@ -353,6 +363,7 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { config = config .with_database_url(db_url) .with_compute_drivers(args.drivers.clone()) + .with_external_compute_driver_socket(args.compute_driver_socket.clone()) .with_server_sans(args.server_sans.clone()) .with_loopback_service_http(args.enable_loopback_service_http); @@ -611,6 +622,12 @@ fn merge_file_into_args(args: &mut RunArgs, file: &GatewayFileSection, matches: } fn effective_single_driver(args: &RunArgs) -> Option { + // An external-driver socket pins dispatch to the out-of-tree path and + // bypasses both the `--drivers` list and auto-detection probe; callers + // that key off the in-tree `ComputeDriverKind` get `None` here. 
+ if args.compute_driver_socket.is_some() { + return None; + } match args.drivers.as_slice() { [] => openshell_core::config::detect_driver(), [driver] => Some(*driver), @@ -1440,6 +1457,68 @@ ssh_session_ttl_secs = 1234 assert!(!super::is_singleplayer_driver(&multi)); } + #[test] + fn compute_driver_socket_flag_populates_run_args() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::remove("OPENSHELL_COMPUTE_DRIVER_SOCKET"); + + let (args, _) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--compute-driver-socket", + "/run/openshell/external.sock", + ]); + assert_eq!( + args.compute_driver_socket.as_deref(), + Some(std::path::Path::new("/run/openshell/external.sock")) + ); + // External socket pins dispatch off the in-tree enum, so the + // single-driver helper must return None even when no --drivers given. + assert!(super::effective_single_driver(&args).is_none()); + } + + #[test] + fn compute_driver_socket_overrides_drivers_flag() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::remove("OPENSHELL_COMPUTE_DRIVER_SOCKET"); + + let (args, _) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--drivers", + "docker", + "--compute-driver-socket", + "/run/openshell/external.sock", + ]); + assert!( + super::effective_single_driver(&args).is_none(), + "external socket must short-circuit --drivers" + ); + } + + #[test] + fn compute_driver_socket_reads_from_env_var() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::set( + "OPENSHELL_COMPUTE_DRIVER_SOCKET", + "/var/run/openshell/external.sock", + ); + + let (args, _) = parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + assert_eq!( + args.compute_driver_socket.as_deref(), + Some(std::path::Path::new("/var/run/openshell/external.sock")) + ); + } + 
#[test] fn file_populates_service_routing_fields() { let _lock = ENV_LOCK diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index d6c1966e7..f0e020aeb 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -15,6 +15,7 @@ use crate::sandbox_watch::SandboxWatchBus; use crate::supervisor_session::SupervisorSessionRegistry; use crate::tracing_bus::TracingLogBus; use futures::{Stream, StreamExt}; +use hyper_util::rt::TokioIo; use openshell_core::ComputeDriverKind; use openshell_core::proto::compute::v1::{ CreateSandboxRequest, DeleteSandboxRequest, DriverCondition, DriverPlatformEvent, @@ -38,12 +39,16 @@ use openshell_driver_podman::{ use prost::Message; use std::fmt; use std::net::SocketAddr; +use std::path::{Path, PathBuf}; use std::pin::Pin; use std::sync::Arc; use std::time::Duration; +#[cfg(unix)] +use tokio::net::UnixStream; use tokio::sync::Mutex; -use tonic::transport::Channel; +use tonic::transport::{Channel, Endpoint}; use tonic::{Code, Request, Status}; +use tower::service_fn; use tracing::{info, warn}; type DriverWatchStream = Pin> + Send>>; @@ -101,11 +106,11 @@ pub use openshell_core::ComputeDriverError as ComputeError; #[derive(Debug)] pub struct ManagedDriverProcess { child: std::sync::Mutex>, - socket_path: std::path::PathBuf, + socket_path: PathBuf, } impl ManagedDriverProcess { - pub(crate) fn new(child: tokio::process::Child, socket_path: std::path::PathBuf) -> Self { + pub(crate) fn new(child: tokio::process::Child, socket_path: PathBuf) -> Self { Self { child: std::sync::Mutex::new(Some(child)), socket_path, @@ -243,7 +248,7 @@ impl fmt::Debug for ComputeRuntime { impl ComputeRuntime { #[allow(clippy::too_many_arguments)] async fn from_driver( - driver_kind: ComputeDriverKind, + driver_kind: Option, driver: SharedComputeDriver, shutdown_cleanup: Option>, startup_resume: Option>, @@ -256,15 +261,24 @@ impl ComputeRuntime { 
_allows_loopback_endpoints: bool, gateway_bind_addresses: Vec, ) -> Result { - let default_image = driver + let capabilities = driver .get_capabilities(Request::new(GetCapabilitiesRequest {})) .await .map_err(compute_error_from_status)? - .into_inner() - .default_image; + .into_inner(); + // For out-of-tree drivers (driver_kind = None), log the name the + // driver advertises in GetCapabilities so operators can confirm + // the gateway is talking to the driver they expect. + if driver_kind.is_none() { + info!( + driver_name = %capabilities.driver_name, + "External compute driver connected" + ); + } + let default_image = capabilities.default_image; Ok(Self { driver, - driver_kind: Some(driver_kind), + driver_kind, shutdown_cleanup, startup_resume, _driver_process: driver_process, @@ -308,7 +322,7 @@ impl ComputeRuntime { let startup_resume: Arc = driver.clone(); let driver: SharedComputeDriver = driver; Self::from_driver( - ComputeDriverKind::Docker, + Some(ComputeDriverKind::Docker), driver, Some(shutdown_cleanup), Some(startup_resume), @@ -337,7 +351,7 @@ impl ComputeRuntime { .map_err(|err| ComputeError::Message(err.to_string()))?; let driver: SharedComputeDriver = Arc::new(ComputeDriverService::new(driver)); Self::from_driver( - ComputeDriverKind::Kubernetes, + Some(ComputeDriverKind::Kubernetes), driver, None, None, @@ -364,7 +378,7 @@ impl ComputeRuntime { ) -> Result { let driver: SharedComputeDriver = Arc::new(RemoteComputeDriver::new(channel)); Self::from_driver( - ComputeDriverKind::Vm, + Some(ComputeDriverKind::Vm), driver, None, None, @@ -380,6 +394,39 @@ impl ComputeRuntime { .await } + /// Construct a runtime that proxies all sandbox lifecycle to an + /// out-of-tree compute driver listening on a pre-existing UDS endpoint. + /// + /// The driver process is operator-managed (not spawned by the gateway), + /// so no [`ManagedDriverProcess`] handle is attached. 
The advertised + /// `driver_name` from `GetCapabilities` is logged for diagnostics by + /// [`Self::from_driver`]. + pub(crate) async fn new_remote_external( + channel: Channel, + store: Arc, + sandbox_index: SandboxIndex, + sandbox_watch_bus: SandboxWatchBus, + tracing_log_bus: TracingLogBus, + supervisor_sessions: Arc, + ) -> Result { + let driver: SharedComputeDriver = Arc::new(RemoteComputeDriver::new(channel)); + Self::from_driver( + None, + driver, + None, + None, + None, + store, + sandbox_index, + sandbox_watch_bus, + tracing_log_bus, + supervisor_sessions, + true, + Vec::new(), + ) + .await + } + pub async fn new_podman( config: PodmanComputeConfig, store: Arc, @@ -393,7 +440,7 @@ impl ComputeRuntime { .map_err(|err| ComputeError::Message(err.to_string()))?; let driver: SharedComputeDriver = Arc::new(PodmanDriverService::new(driver)); Self::from_driver( - ComputeDriverKind::Podman, + Some(ComputeDriverKind::Podman), driver, None, None, @@ -1249,6 +1296,38 @@ impl ComputeRuntime { } } +/// Connect to an out-of-tree compute driver that is already listening on +/// `socket_path` and return a tonic `Channel` speaking `compute_driver.proto`. +/// +/// The gateway does not spawn or own the driver process — the operator is +/// responsible for placing the driver alongside the gateway and granting the +/// gateway uid read/write on the socket. The host portion of the URL is +/// ignored because the connector resolves to the UDS rather than DNS. 
+#[cfg(unix)] +pub async fn connect_external_compute_driver(socket_path: &Path) -> Result { + let socket_path: PathBuf = socket_path.to_path_buf(); + let display_path = socket_path.clone(); + Endpoint::from_static("http://[::]:50051") + .connect_with_connector(service_fn(move |_: tonic::transport::Uri| { + let socket_path = socket_path.clone(); + async move { UnixStream::connect(socket_path).await.map(TokioIo::new) } + })) + .await + .map_err(|e| { + ComputeError::Message(format!( + "failed to connect to external compute driver socket '{}': {e}", + display_path.display() + )) + }) +} + +#[cfg(not(unix))] +pub async fn connect_external_compute_driver(_socket_path: &Path) -> Result { + Err(ComputeError::Message( + "the external compute driver requires unix domain socket support".to_string(), + )) +} + fn driver_sandbox_from_public(sandbox: &Sandbox) -> DriverSandbox { DriverSandbox { id: sandbox.object_id().to_string(), diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 676e23071..12911480b 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -703,6 +703,26 @@ async fn build_compute_runtime( tracing_log_bus: TracingLogBus, supervisor_sessions: Arc, ) -> Result { + if let Some(socket_path) = config.external_compute_driver_socket.as_deref() { + info!( + socket = %socket_path.display(), + "Using external compute driver" + ); + let channel = compute::connect_external_compute_driver(socket_path) + .await + .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}")))?; + return ComputeRuntime::new_remote_external( + channel, + store, + sandbox_index, + sandbox_watch_bus, + tracing_log_bus, + supervisor_sessions, + ) + .await + .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))); + } + let driver = configured_compute_driver(config)?; info!(driver = %driver, "Using compute driver"); warn_if_kubernetes_sandbox_jwt_expiry_disabled(config, driver);