diff --git a/architecture/gateway.md b/architecture/gateway.md index 533ce04a5..e1e5265ac 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -60,8 +60,11 @@ checks the returned pod binding against the live pod UID, and verifies the pod's controlling `Sandbox` ownerReference against the live Sandbox CR UID and sandbox-id label before minting the gateway JWT. Supervisors renew gateway JWTs in memory before expiry only while the sandbox record still exists. Older tokens -are not server-revoked; deployments bound replay exposure with short -`gateway_jwt.ttl_secs` lifetimes. +are not server-revoked; shared deployments bound replay exposure with short +`gateway_jwt.ttl_secs` lifetimes. The config default is +`gateway_jwt.ttl_secs = 0`, intended for local single-player Docker, Podman, +and VM gateways; tokens minted with it carry `exp = 0` and do not expire. +Kubernetes and other shared deployments should set a positive TTL. Gateway JWT signing-key rotation is currently an offline operator action. The runtime loads one active signing key and one matching public verification key diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index a517ab3eb..04d6928da 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -504,7 +504,8 @@ pub struct GatewayJwtConfig { /// hostname-or-`openshell` placeholder if unset. #[serde(default = "default_gateway_id")] pub gateway_id: String, - /// Token lifetime in seconds. Defaults to 1 hour. + /// Token lifetime in seconds. Defaults to 0, which disables expiration and + /// is intended only for local single-player deployments. 
#[serde(default = "default_sandbox_token_ttl_secs")] pub ttl_secs: u64, } @@ -514,7 +515,7 @@ fn default_gateway_id() -> String { } const fn default_sandbox_token_ttl_secs() -> u64 { - 3_600 + 0 } fn default_roles_claim() -> String { @@ -726,7 +727,7 @@ mod tests { #[cfg(unix)] use super::is_reachable_unix_socket; use super::{ - ComputeDriverKind, Config, DEFAULT_SERVICE_ROUTING_DOMAIN, detect_driver, + ComputeDriverKind, Config, DEFAULT_SERVICE_ROUTING_DOMAIN, GatewayJwtConfig, detect_driver, docker_host_unix_socket_path, is_unix_socket, podman_socket_candidates_from_env, podman_socket_responds, }; @@ -781,6 +782,18 @@ mod tests { assert!(!cfg.auth.allow_unauthenticated_users); } + #[test] + fn gateway_jwt_ttl_defaults_to_non_expiring() { + let cfg: GatewayJwtConfig = serde_json::from_value(serde_json::json!({ + "signing_key_path": "/tmp/signing.pem", + "public_key_path": "/tmp/public.pem", + "kid_path": "/tmp/kid" + })) + .expect("gateway JWT config should deserialize with default ttl"); + + assert_eq!(cfg.ttl_secs, 0); + } + #[test] fn service_routing_allows_loopback_plaintext_http_by_default() { let cfg = Config::new(None); diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-sandbox/src/grpc_client.rs index d03eb1281..ffa22f951 100644 --- a/crates/openshell-sandbox/src/grpc_client.rs +++ b/crates/openshell-sandbox/src/grpc_client.rs @@ -389,7 +389,7 @@ async fn refresh_token_loop( /// Compute the next refresh delay: 80 % of the time remaining until the /// current token's `exp`, plus up to 10 % jitter, with a small lower bound /// for already-expired tokens and capped at 12 h. If the token can't be parsed -/// (legacy/non-JWT bearer) +/// (legacy/non-JWT bearer) or carries the `exp = 0` non-expiring sentinel, /// default to 6 h. 
fn compute_refresh_delay(slot: &TokenSlot) -> Duration { let token = slot @@ -404,11 +404,16 @@ fn compute_refresh_delay(slot: &TokenSlot) -> Duration { .map_or(0, |d| d.as_millis()), ) .unwrap_or(i64::MAX); - let remaining_ms = parse_jwt_exp_ms(bearer).map_or(21_600_000, |exp| exp - now_ms); // 6 h fallback - let mut delay_ms = if remaining_ms <= 0 { - 1_000 - } else { - (remaining_ms * 8 / 10).clamp(1_000, 43_200_000) + let mut delay_ms = match parse_jwt_exp_ms(bearer) { + Some(0) | None => 21_600_000, + Some(exp) => { + let remaining_ms = exp - now_ms; + if remaining_ms <= 0 { + 1_000 + } else { + (remaining_ms * 8 / 10).clamp(1_000, 43_200_000) + } + } }; // Up to 10 % jitter, derived deterministically from token bytes so // unit tests are reproducible without injecting an RNG. @@ -494,6 +499,20 @@ mod auth_tests { assert!((1..60).contains(&delay.as_secs())); } + #[test] + fn compute_refresh_delay_treats_exp_zero_as_non_expiring() { + use base64::Engine as _; + let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(r#"{"exp":0}"#); + let token = format!("h.{payload}.s"); + let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")).unwrap(); + let slot: TokenSlot = Arc::new(RwLock::new(bearer)); + let delay = compute_refresh_delay(&slot); + assert!( + (6 * 60 * 60..=7 * 60 * 60).contains(&delay.as_secs()), + "non-expiring tokens should use the fallback refresh delay, got {delay:?}" + ); + } + #[test] fn compute_refresh_delay_supports_short_token_ttl() { use base64::Engine as _; diff --git a/crates/openshell-server/src/auth/sandbox_jwt.rs b/crates/openshell-server/src/auth/sandbox_jwt.rs index 2ec890249..39f5982ca 100644 --- a/crates/openshell-server/src/auth/sandbox_jwt.rs +++ b/crates/openshell-server/src/auth/sandbox_jwt.rs @@ -31,6 +31,7 @@ use tracing::{debug, warn}; /// reuse the same subject namespace without breaking handler equality /// checks. 
const SPIFFE_SUBJECT_PREFIX: &str = "spiffe://openshell/sandbox/"; +const SANDBOX_JWT_EXP_LEEWAY_SECS: i64 = 60; /// JWT claim set serialized in every gateway-minted sandbox token. #[derive(Debug, Serialize, Deserialize)] @@ -100,7 +101,11 @@ impl SandboxJwtIssuer { #[allow(clippy::result_large_err)] // `tonic::Status` is the natural error here pub fn mint(&self, sandbox_id: &str) -> Result { let now = now_secs(); - let exp = now + i64::try_from(self.ttl.as_secs()).unwrap_or(3_600); + let exp = if self.ttl.is_zero() { + 0 + } else { + now.saturating_add(i64::try_from(self.ttl.as_secs()).unwrap_or(3_600)) + }; let claims = SandboxJwtClaims { sub: format!("{SPIFFE_SUBJECT_PREFIX}{sandbox_id}"), iss: self.issuer.clone(), @@ -178,6 +183,7 @@ impl SandboxJwtAuthenticator { validation.set_issuer(&[&self.issuer]); validation.set_audience(&[&self.audience]); validation.set_required_spec_claims(&["iss", "aud", "exp", "sub"]); + validation.validate_exp = false; let data = decode::(token, &self.decoding_key, &validation).map_err(|e| { @@ -186,6 +192,7 @@ impl SandboxJwtAuthenticator { })?; let claims = data.claims; + validate_exp(claims.exp)?; Ok(Some(Principal::Sandbox(SandboxPrincipal { sandbox_id: claims.sandbox_id, source: SandboxIdentitySource::BootstrapJwt { issuer: claims.iss }, @@ -212,6 +219,20 @@ impl Authenticator for SandboxJwtAuthenticator { } } +#[allow(clippy::result_large_err)] +fn validate_exp(exp: i64) -> Result<(), Status> { + if exp == 0 { + return Ok(()); + } + + if exp < now_secs().saturating_sub(SANDBOX_JWT_EXP_LEEWAY_SECS) { + debug!("sandbox JWT expired"); + return Err(Status::unauthenticated("invalid token: ExpiredSignature")); + } + + Ok(()) +} + fn now_secs() -> i64 { i64::try_from( SystemTime::now() @@ -236,12 +257,16 @@ mod tests { } fn pair() -> (SandboxJwtIssuer, SandboxJwtAuthenticator) { + pair_with_ttl(Duration::from_secs(3600)) + } + + fn pair_with_ttl(ttl: Duration) -> (SandboxJwtIssuer, SandboxJwtAuthenticator) { let mat = 
generate_jwt_key().expect("jwt key"); let issuer = SandboxJwtIssuer::from_pem( mat.signing_key_pem.as_bytes(), mat.kid.clone(), "test-gateway", - Duration::from_secs(3600), + ttl, ) .unwrap(); let auth = SandboxJwtAuthenticator::from_pem( @@ -276,6 +301,30 @@ mod tests { } } + #[tokio::test] + async fn ttl_zero_mints_non_expiring_token() { + let (issuer, auth) = pair_with_ttl(Duration::ZERO); + let minted = issuer.mint("sandbox-never").unwrap(); + assert_eq!(minted.expires_at_ms, 0); + + let principal = auth + .authenticate(&header_map_with_bearer(&minted.token), "/anything") + .await + .unwrap() + .expect("exp=0 token should authenticate"); + assert!(matches!(principal, Principal::Sandbox(_))); + + let mut validation = Validation::new(Algorithm::EdDSA); + validation.algorithms = vec![Algorithm::EdDSA]; + validation.set_issuer(&["openshell-gateway:test-gateway"]); + validation.set_audience(&["openshell-gateway:test-gateway"]); + validation.set_required_spec_claims(&["iss", "aud", "exp", "sub"]); + validation.validate_exp = false; + let decoded = decode::(&minted.token, &auth.decoding_key, &validation) + .expect("token should decode"); + assert_eq!(decoded.claims.exp, 0); + } + #[tokio::test] async fn token_signed_by_other_key_is_rejected() { let (_, auth_a) = pair(); diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index b8d345f9e..3f313f18b 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -618,6 +618,13 @@ fn effective_single_driver(args: &RunArgs) -> Option { } } +fn is_singleplayer_driver(args: &RunArgs) -> bool { + matches!( + effective_single_driver(args), + Some(ComputeDriverKind::Docker | ComputeDriverKind::Podman | ComputeDriverKind::Vm) + ) +} + fn resolve_mtls_auth_enabled( args: &RunArgs, matches: &ArgMatches, @@ -634,10 +641,7 @@ fn resolve_mtls_auth_enabled( return false; } - matches!( - effective_single_driver(args), - Some(ComputeDriverKind::Docker | 
ComputeDriverKind::Podman | ComputeDriverKind::Vm) - ) + is_singleplayer_driver(args) } /// Build [`VmComputeConfig`] from the `[openshell.drivers.vm]` table @@ -1376,6 +1380,41 @@ ssh_session_ttl_secs = 1234 assert_eq!(file.openshell.gateway.ssh_session_ttl_secs, Some(1234)); } + #[test] + fn singleplayer_driver_matches_only_one_local_driver() { + for driver in ["docker", "podman", "vm"] { + let (args, _) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--drivers", + driver, + ]); + assert!( + super::is_singleplayer_driver(&args), + "{driver} should be singleplayer" + ); + } + + let (k8s, _) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--drivers", + "kubernetes", + ]); + assert!(!super::is_singleplayer_driver(&k8s)); + + let (multi, _) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--drivers", + "docker,podman", + ]); + assert!(!super::is_singleplayer_driver(&multi)); + } + #[test] fn file_populates_service_routing_fields() { let _lock = ENV_LOCK diff --git a/crates/openshell-server/src/defaults.rs b/crates/openshell-server/src/defaults.rs index 25179bbd3..b5a5a5e92 100644 --- a/crates/openshell-server/src/defaults.rs +++ b/crates/openshell-server/src/defaults.rs @@ -104,7 +104,7 @@ pub fn complete_local_jwt_config() -> Result> { public_key_path: paths.public_key, kid_path: paths.kid, gateway_id: "openshell".to_string(), - ttl_secs: 3_600, + ttl_secs: 0, })), _ => Err(miette::miette!( "partial local sandbox JWT state in {}: expected jwt/signing.pem, jwt/public.pem, and jwt/kid", @@ -237,6 +237,6 @@ mod tests { assert_eq!(config.public_key_path, tmp.path().join("jwt/public.pem")); assert_eq!(config.kid_path, tmp.path().join("jwt/kid")); assert_eq!(config.gateway_id, "openshell"); - assert_eq!(config.ttl_secs, 3_600); + assert_eq!(config.ttl_secs, 0); } } diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 
c25ba1cfd..676e23071 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -705,6 +705,7 @@ async fn build_compute_runtime( ) -> Result { let driver = configured_compute_driver(config)?; info!(driver = %driver, "Using compute driver"); + warn_if_kubernetes_sandbox_jwt_expiry_disabled(config, driver); match driver { ComputeDriverKind::Kubernetes => { @@ -878,13 +879,30 @@ fn configured_compute_driver(config: &Config) -> Result { } } +fn kubernetes_sandbox_jwt_expiry_disabled(config: &Config, driver: ComputeDriverKind) -> bool { + matches!(driver, ComputeDriverKind::Kubernetes) + && config + .gateway_jwt + .as_ref() + .is_some_and(|jwt| jwt.ttl_secs == 0) +} + +fn warn_if_kubernetes_sandbox_jwt_expiry_disabled(config: &Config, driver: ComputeDriverKind) { + if kubernetes_sandbox_jwt_expiry_disabled(config, driver) { + warn!( + "Kubernetes gateway configured with non-expiring sandbox JWTs (gateway_jwt.ttl_secs = 0); set ttl_secs > 0 for shared Kubernetes deployments" + ); + } +} + #[cfg(test)] mod tests { use super::{ ConnectionProtocol, MultiplexService, ServerState, TlsAcceptor, allow_plaintext_service_http, classify_initial_bytes, configured_compute_driver, gateway_listener_addresses, is_benign_tls_handshake_failure, - kubernetes_config_for_k8s_sa_bootstrap, serve_gateway_listener, + kubernetes_config_for_k8s_sa_bootstrap, kubernetes_sandbox_jwt_expiry_disabled, + serve_gateway_listener, }; use openshell_core::{ ComputeDriverKind, Config, @@ -1288,6 +1306,38 @@ mod tests { ); } + #[test] + fn kubernetes_sandbox_jwt_expiry_disabled_warns_only_for_kubernetes_zero_ttl() { + fn config_with_jwt_ttl(ttl_secs: u64) -> Config { + let mut config = Config::new(None); + config.gateway_jwt = Some(openshell_core::GatewayJwtConfig { + signing_key_path: "/tmp/signing.pem".into(), + public_key_path: "/tmp/public.pem".into(), + kid_path: "/tmp/kid".into(), + gateway_id: "openshell".to_string(), + ttl_secs, + }); + config + } + + 
assert!(kubernetes_sandbox_jwt_expiry_disabled( + &config_with_jwt_ttl(0), + ComputeDriverKind::Kubernetes + )); + assert!(!kubernetes_sandbox_jwt_expiry_disabled( + &config_with_jwt_ttl(3600), + ComputeDriverKind::Kubernetes + )); + assert!(!kubernetes_sandbox_jwt_expiry_disabled( + &config_with_jwt_ttl(0), + ComputeDriverKind::Docker + )); + assert!(!kubernetes_sandbox_jwt_expiry_disabled( + &Config::new(None), + ComputeDriverKind::Kubernetes + )); + } + #[test] fn k8s_sa_bootstrap_rejects_missing_kubernetes_driver_config() { let err = kubernetes_config_for_k8s_sa_bootstrap(None).unwrap_err(); diff --git a/docs/reference/gateway-config.mdx b/docs/reference/gateway-config.mdx index a4606cdb2..0d49a6096 100644 --- a/docs/reference/gateway-config.mdx +++ b/docs/reference/gateway-config.mdx @@ -110,6 +110,7 @@ signing_key_path = "/etc/openshell/jwt/signing.pem" public_key_path = "/etc/openshell/jwt/public.pem" kid_path = "/etc/openshell/jwt/kid" gateway_id = "openshell" +# Omit or set to 0 only for local single-player Docker, Podman, or VM gateways. ttl_secs = 3600 [openshell.gateway.auth] @@ -130,6 +131,8 @@ scopes_claim = "" Local Docker, Podman, and VM gateways can also set `[openshell.gateway.mtls_auth] enabled = true` to authenticate CLI callers from verified client certificates. Kubernetes deployments must leave this unset and use OIDC or a trusted access proxy; the Helm chart does not render this table. +`[openshell.gateway.gateway_jwt] ttl_secs` controls gateway-minted sandbox JWT lifetime. When omitted, it defaults to `0`: the token `exp` claim and `expires_at_ms` response field become `0`, and the sandbox JWT does not expire. Use that default only for local single-player Docker, Podman, or VM gateways. Kubernetes and other shared deployments should set a positive TTL; Helm renders `3600` seconds by default, and the gateway logs a warning when a Kubernetes gateway uses `0`. 
+ `[openshell.gateway.auth] allow_unauthenticated_users = true` is an unsafe local-development and trusted-proxy escape hatch. It accepts user-facing CLI/API calls without OIDC or mTLS credentials while sandbox supervisors still authenticate with gateway-minted sandbox JWTs. Leave it false for shared and production gateways. `image_pull_policy` is intentionally not a shared gateway key. Kubernetes and Docker use `Always`, `IfNotPresent`, or `Never`. Podman uses `always`, `missing`, `never`, or `newer`. Set it inside the relevant driver table. diff --git a/e2e/rust/Cargo.lock b/e2e/rust/Cargo.lock index 990aa5c46..aceacf682 100644 --- a/e2e/rust/Cargo.lock +++ b/e2e/rust/Cargo.lock @@ -76,6 +76,12 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "either" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" + [[package]] name = "equivalent" version = "1.0.2" @@ -272,6 +278,15 @@ dependencies = [ "serde_core", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.17" @@ -344,6 +359,7 @@ dependencies = [ "http-body-util", "hyper", "hyper-util", + "prost", "rand", "serde_json", "sha1", @@ -409,6 +425,29 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prost" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-derive" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" +dependencies = [ + "anyhow", + "itertools", + 
"proc-macro2", + "quote", + "syn", +] + [[package]] name = "quote" version = "1.0.45" diff --git a/e2e/rust/Cargo.toml b/e2e/rust/Cargo.toml index 26957baab..31c6a3347 100644 --- a/e2e/rust/Cargo.toml +++ b/e2e/rust/Cargo.toml @@ -46,6 +46,11 @@ name = "gateway_resume" path = "tests/gateway_resume.rs" required-features = ["e2e-docker"] +[[test]] +name = "local_driver_token_restart" +path = "tests/local_driver_token_restart.rs" +required-features = ["e2e"] + [[test]] name = "podman_gateway_resume" path = "tests/podman_gateway_resume.rs" @@ -97,6 +102,7 @@ bytes = "1" http-body-util = "0.1" hyper = { version = "1", features = ["client", "http1"] } hyper-util = { version = "0.1", features = ["tokio"] } +prost = "0.13" tokio = { version = "1.43", features = ["full"] } tempfile = "3" sha1 = "0.10" diff --git a/e2e/rust/e2e-vm.sh b/e2e/rust/e2e-vm.sh index 69cb03309..926821e53 100755 --- a/e2e/rust/e2e-vm.sh +++ b/e2e/rust/e2e-vm.sh @@ -219,7 +219,9 @@ signing_key_path = "${JWT_DIR}/signing.pem" public_key_path = "${JWT_DIR}/public.pem" kid_path = "${JWT_DIR}/kid" gateway_id = "${GATEWAY_NAME}" -ttl_secs = 3600 +# Local VM e2e gateways exercise the single-player default: sandbox JWTs +# identify the supervisor and do not expire. +ttl_secs = 0 [openshell.drivers.vm] grpc_endpoint = "https://host.openshell.internal:${HOST_PORT}" @@ -282,6 +284,7 @@ e2e_register_mtls_gateway \ export OPENSHELL_GATEWAY_ENDPOINT="${CLI_GATEWAY_ENDPOINT}" export OPENSHELL_E2E_EXPECT_VM_OVERLAY=1 export OPENSHELL_E2E_DRIVER="vm" +export OPENSHELL_E2E_VM_STATE_DIR="${RUN_STATE_DIR}" e2e_export_gateway_restart_metadata \ "${GATEWAY_BIN}" \ "${GATEWAY_ARGS_FILE}" \ diff --git a/e2e/rust/tests/local_driver_token_restart.rs b/e2e/rust/tests/local_driver_token_restart.rs new file mode 100644 index 000000000..2a7a54603 --- /dev/null +++ b/e2e/rust/tests/local_driver_token_restart.rs @@ -0,0 +1,389 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e")] + +//! Local-driver E2E regression for sandbox supervisor restart from bootstrap +//! JWT material. Docker and Podman supervisors reload their mounted token file +//! after a container restart. VM sandboxes reboot from persisted driver state +//! after the VM driver restarts. Local single-player gateway configs should +//! mint that token with `exp = 0` so reconnect does not depend on token refresh. + +use std::fs; +use std::path::PathBuf; +use std::process::Stdio; +use std::time::{Duration, Instant}; + +use base64::Engine as _; +use openshell_e2e::harness::cli::{wait_for_healthy, wait_for_sandbox_exec_contains}; +use openshell_e2e::harness::container::{ContainerEngine, e2e_driver}; +use openshell_e2e::harness::gateway::ManagedGateway; +use openshell_e2e::harness::sandbox::SandboxGuard; +use prost::Message; +use tokio::time::sleep; + +const READY_MARKER: &str = "local-driver-token-restart-ready"; +const RESTART_FILE: &str = "/sandbox/local-driver-token-restart-state"; +const CONTAINER_TOKEN_MOUNT_PATH: &str = "/etc/openshell/auth/sandbox.jwt"; +const VM_STATE_DIR_ENV: &str = "OPENSHELL_E2E_VM_STATE_DIR"; + +#[derive(Clone, PartialEq, Message)] +struct PersistedDriverSandbox { + #[prost(string, tag = "2")] + name: String, + #[prost(message, optional, tag = "4")] + spec: Option, +} + +#[derive(Clone, PartialEq, Message)] +struct PersistedDriverSandboxSpec { + #[prost(string, tag = "11")] + sandbox_token: String, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum LocalDriver { + Docker, + Podman, + Vm, +} + +impl LocalDriver { + fn from_env() -> Option { + match e2e_driver().as_deref() { + Some("docker") => Some(Self::Docker), + Some("podman") => Some(Self::Podman), + Some("vm") => Some(Self::Vm), + _ => None, + } + } + + fn is_container(self) -> bool { + matches!(self, Self::Docker | Self::Podman) + } + + fn container_filters(self, namespace: &str, sandbox_name: &str) -> 
Vec { + match self { + Self::Docker => vec![ + "label=openshell.ai/managed-by=openshell".to_string(), + format!("label=openshell.ai/sandbox-namespace={namespace}"), + format!("label=openshell.ai/sandbox-name={sandbox_name}"), + ], + Self::Podman => vec![ + "label=openshell.managed=true".to_string(), + format!("label=openshell.sandbox-name={sandbox_name}"), + ], + Self::Vm => Vec::new(), + } + } +} + +fn run_engine(engine: &ContainerEngine, args: &[String]) -> Result { + let output = engine + .command() + .args(args) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .map_err(|err| format!("failed to run {} {}: {err}", engine.name(), args.join(" ")))?; + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + let combined = format!("{stdout}{stderr}"); + if !output.status.success() { + return Err(format!( + "{} {} failed (exit {:?}):\n{combined}", + engine.name(), + args.join(" "), + output.status.code() + )); + } + Ok(stdout.trim().to_string()) +} + +fn sandbox_container_id( + engine: &ContainerEngine, + driver: LocalDriver, + namespace: &str, + sandbox_name: &str, +) -> Result { + let mut args = vec!["ps".to_string(), "-aq".to_string()]; + for filter in driver.container_filters(namespace, sandbox_name) { + args.push("--filter".to_string()); + args.push(filter); + } + + let stdout = run_engine(engine, &args)?; + let ids = stdout + .lines() + .map(str::trim) + .filter(|line| !line.is_empty()) + .collect::>(); + match ids.as_slice() { + [id] => Ok((*id).to_string()), + [] => Err(format!( + "no {driver:?} container found for sandbox '{sandbox_name}' in namespace '{namespace}'" + )), + _ => Err(format!( + "multiple {driver:?} containers found for sandbox '{sandbox_name}' in namespace '{namespace}': {ids:?}" + )), + } +} + +fn container_running(engine: &ContainerEngine, container_id: &str) -> Result { + let output = run_engine( + engine, + &[ + "inspect".to_string(), + "-f".to_string(), + 
"{{.State.Running}}".to_string(), + container_id.to_string(), + ], + )?; + match output.trim() { + "true" => Ok(true), + "false" => Ok(false), + other => Err(format!( + "unexpected running state for container {container_id}: {other}" + )), + } +} + +async fn wait_for_container_running( + engine: &ContainerEngine, + container_id: &str, + expected: bool, + timeout: Duration, +) -> Result<(), String> { + let start = Instant::now(); + + loop { + let last_state = match container_running(engine, container_id) { + Ok(running) if running == expected => return Ok(()), + Ok(running) => format!("running={running}"), + Err(err) => err, + }; + + if start.elapsed() > timeout { + return Err(format!( + "container {container_id} did not reach running={expected} within {}s. Last state: {last_state}", + timeout.as_secs() + )); + } + sleep(Duration::from_secs(1)).await; + } +} + +fn read_bootstrap_token(engine: &ContainerEngine, container_id: &str) -> Result { + run_engine( + engine, + &[ + "exec".to_string(), + container_id.to_string(), + "cat".to_string(), + CONTAINER_TOKEN_MOUNT_PATH.to_string(), + ], + ) +} + +fn read_vm_bootstrap_token(sandbox_name: &str) -> Result { + let state_dir = std::env::var_os(VM_STATE_DIR_ENV) + .map(PathBuf::from) + .ok_or_else(|| format!("{VM_STATE_DIR_ENV} must be set for VM restart coverage"))?; + let sandboxes_dir = state_dir.join("sandboxes"); + let entries = fs::read_dir(&sandboxes_dir) + .map_err(|err| format!("read VM sandboxes dir '{}': {err}", sandboxes_dir.display()))?; + + let mut decoded_names = Vec::new(); + for entry in entries { + let entry = entry.map_err(|err| { + format!( + "read VM sandbox dir entry under '{}': {err}", + sandboxes_dir.display() + ) + })?; + let request_path = entry.path().join("sandbox.pb"); + let bytes = match fs::read(&request_path) { + Ok(bytes) => bytes, + Err(err) if err.kind() == std::io::ErrorKind::NotFound => continue, + Err(err) => { + return Err(format!( + "read VM sandbox request '{}': {err}", + 
request_path.display() + )); + } + }; + + let sandbox = PersistedDriverSandbox::decode(bytes.as_slice()).map_err(|err| { + format!( + "decode VM sandbox request '{}': {err}", + request_path.display() + ) + })?; + decoded_names.push(sandbox.name.clone()); + if sandbox.name != sandbox_name { + continue; + } + + let spec = sandbox + .spec + .ok_or_else(|| format!("VM sandbox '{sandbox_name}' is missing driver spec"))?; + if spec.sandbox_token.trim().is_empty() { + return Err(format!( + "VM sandbox '{sandbox_name}' persisted driver spec has no sandbox token" + )); + } + return Ok(spec.sandbox_token); + } + + Err(format!( + "no VM sandbox request found for '{sandbox_name}' under '{}'. Decoded sandbox names: {decoded_names:?}", + sandboxes_dir.display() + )) +} + +fn token_exp_claim(token: &str) -> Result { + let payload_b64 = token + .trim() + .split('.') + .nth(1) + .ok_or_else(|| "sandbox JWT has no payload segment".to_string())?; + let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD + .decode(payload_b64) + .map_err(|err| format!("failed to decode sandbox JWT payload: {err}"))?; + let claims: serde_json::Value = serde_json::from_slice(&payload) + .map_err(|err| format!("failed to parse sandbox JWT claims: {err}"))?; + claims + .get("exp") + .and_then(serde_json::Value::as_i64) + .ok_or_else(|| format!("sandbox JWT missing integer exp claim: {claims}")) +} + +fn require_non_expiring_token(token: &str, context: &str) -> Result<(), String> { + let exp = token_exp_claim(token)?; + if exp != 0 { + return Err(format!("{context} should use exp=0, got exp={exp}")); + } + Ok(()) +} + +async fn restart_container_sandbox( + engine: &ContainerEngine, + driver: LocalDriver, + namespace: &str, + sandbox_name: &str, +) -> Result<(), String> { + let container_id = sandbox_container_id(engine, driver, namespace, sandbox_name)?; + let token = read_bootstrap_token(engine, &container_id)?; + require_non_expiring_token(&token, "local-driver bootstrap JWT")?; + + 
run_engine(engine, &["stop".to_string(), container_id.clone()])?; + wait_for_container_running(engine, &container_id, false, Duration::from_secs(60)).await?; + + run_engine(engine, &["start".to_string(), container_id.clone()])?; + wait_for_container_running(engine, &container_id, true, Duration::from_secs(60)).await +} + +async fn restart_vm_sandbox(gateway: &ManagedGateway, sandbox_name: &str) -> Result<(), String> { + let token = read_vm_bootstrap_token(sandbox_name)?; + require_non_expiring_token(&token, "VM bootstrap JWT")?; + + gateway.stop()?; + gateway.start()?; + wait_for_healthy(Duration::from_secs(120)).await +} + +async fn wait_for_driver_reconnect( + driver: LocalDriver, + sandbox_name: &str, +) -> Result<(), String> { + match driver { + LocalDriver::Docker | LocalDriver::Podman => { + wait_for_sandbox_exec_contains( + sandbox_name, + &["cat", RESTART_FILE], + "before-local-driver-restart", + Duration::from_secs(240), + ) + .await + } + LocalDriver::Vm => { + wait_for_sandbox_exec_contains( + sandbox_name, + &["echo", "vm-reconnect-ok"], + "vm-reconnect-ok", + Duration::from_secs(240), + ) + .await + } + } +} + +#[tokio::test] +async fn local_driver_sandbox_restarts_with_non_expiring_bootstrap_jwt() { + let Some(driver) = LocalDriver::from_env() else { + eprintln!("Skipping local-driver token restart test: e2e driver is not Docker, Podman, or VM"); + return; + }; + let namespace = if driver.is_container() { + let Some(namespace) = std::env::var("OPENSHELL_E2E_SANDBOX_NAMESPACE") + .ok() + .filter(|value| !value.trim().is_empty()) + else { + eprintln!( + "Skipping local-driver token restart test: OPENSHELL_E2E_SANDBOX_NAMESPACE is unavailable" + ); + return; + }; + Some(namespace) + } else { + None + }; + let engine = driver.is_container().then(ContainerEngine::from_env); + let gateway = if driver == LocalDriver::Vm { + let Some(gateway) = ManagedGateway::from_env().expect("load managed e2e gateway metadata") + else { + eprintln!( + "Skipping 
local-driver token restart test: VM e2e gateway is not managed by this test run" + ); + return; + }; + Some(gateway) + } else { + None + }; + + wait_for_healthy(Duration::from_secs(30)) + .await + .expect("gateway should start healthy"); + + let script = format!( + "echo before-local-driver-restart > {RESTART_FILE}; echo {READY_MARKER}; while true; do sleep 1; done" + ); + let mut sandbox = SandboxGuard::create_keep(&["sh", "-lc", &script], READY_MARKER) + .await + .expect("create long-running local-driver sandbox"); + + match driver { + LocalDriver::Docker | LocalDriver::Podman => { + let engine = engine.as_ref().expect("container engine should be set"); + let namespace = namespace + .as_deref() + .expect("container namespace should be set"); + restart_container_sandbox(engine, driver, namespace, &sandbox.name) + .await + .expect("restart sandbox container"); + } + LocalDriver::Vm => { + let gateway = gateway.as_ref().expect("managed VM gateway should be set"); + restart_vm_sandbox(gateway, &sandbox.name) + .await + .expect("restart e2e VM gateway"); + } + } + + wait_for_driver_reconnect(driver, &sandbox.name) + .await + .expect("sandbox supervisor should reconnect after local-driver restart"); + + sandbox.cleanup().await; +} diff --git a/e2e/support/gateway-common.sh b/e2e/support/gateway-common.sh index 9766126d5..8da3d0706 100644 --- a/e2e/support/gateway-common.sh +++ b/e2e/support/gateway-common.sh @@ -65,6 +65,10 @@ e2e_preserve_mise_dirs() { *) export MISE_CACHE_DIR="${XDG_CACHE_HOME:-${HOME}/.cache}/mise" ;; esac fi + + if [ -z "${MISE_STATE_DIR:-}" ]; then + export MISE_STATE_DIR="${XDG_STATE_HOME:-${HOME}/.local/state}/mise" + fi } e2e_align_docker_host_with_cli_context() { @@ -153,7 +157,9 @@ e2e_write_gateway_jwt_config() { printf 'public_key_path = %s\n' "$(e2e_toml_string "${jwt_dir}/public.pem")" printf 'kid_path = %s\n' "$(e2e_toml_string "${jwt_dir}/kid")" printf 'gateway_id = %s\n' "$(e2e_toml_string "${gateway_id}")" - printf 'ttl_secs = 
3600\n\n' + # Local Docker/Podman e2e gateways exercise the single-player default: + # sandbox JWTs identify the supervisor and do not expire. + printf 'ttl_secs = 0\n\n' } e2e_write_gateway_mtls_auth_config() { diff --git a/e2e/with-podman-gateway.sh b/e2e/with-podman-gateway.sh index 3fea3e53a..dc4f4ede1 100755 --- a/e2e/with-podman-gateway.sh +++ b/e2e/with-podman-gateway.sh @@ -75,11 +75,18 @@ cleanup() { e2e_stop_gateway "${GATEWAY_PID}" "${GATEWAY_PID_FILE}" local sandbox_ids="" - if [ -n "${E2E_NAMESPACE}" ] && command -v podman >/dev/null 2>&1; then - sandbox_ids="$(podman_cmd ps -aq \ - --filter "label=openshell.managed=true" \ - --filter "label=openshell.sandbox-namespace=${E2E_NAMESPACE}" \ - 2>/dev/null || true)" + if command -v podman >/dev/null 2>&1; then + if [ -n "${PODMAN_NETWORK_NAME}" ]; then + sandbox_ids="$(podman_cmd ps -aq \ + --filter "label=openshell.managed=true" \ + --filter "network=${PODMAN_NETWORK_NAME}" \ + 2>/dev/null || true)" + elif [ -n "${E2E_NAMESPACE}" ]; then + sandbox_ids="$(podman_cmd ps -aq \ + --filter "label=openshell.managed=true" \ + --filter "label=openshell.sandbox-namespace=${E2E_NAMESPACE}" \ + 2>/dev/null || true)" + fi fi if [ "${exit_code}" -ne 0 ] && [ -n "${sandbox_ids}" ]; then diff --git a/proto/openshell.proto b/proto/openshell.proto index f9b64618b..a8ead0d31 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -253,7 +253,8 @@ message IssueSandboxTokenRequest {} message IssueSandboxTokenResponse { // Gateway-minted JWT bound to the calling sandbox's UUID. string token = 1; - // Absolute expiry of the issued token, milliseconds since the epoch. + // Absolute expiry of the issued token, milliseconds since the epoch. 0 means + // the token is non-expiring. int64 expires_at_ms = 2; } @@ -267,7 +268,8 @@ message RefreshSandboxTokenRequest {} message RefreshSandboxTokenResponse { // Fresh gateway-minted JWT bound to the same sandbox UUID. 
string token = 1; - // Absolute expiry of the new token, milliseconds since the epoch. + // Absolute expiry of the new token, milliseconds since the epoch. 0 means + // the token is non-expiring. int64 expires_at_ms = 2; }