From 36406eff28ee3b268a27d65b497a0c48a4b48ebf Mon Sep 17 00:00:00 2001 From: st-gr <38470677+st-gr@users.noreply.github.com> Date: Wed, 17 Jun 2026 14:27:48 +0200 Subject: [PATCH 1/2] feat(server): support remote compute driver endpoints Add named remote compute driver endpoint support to the gateway. Remote drivers are selected by a non-reserved compute driver name and either a CLI/env socket endpoint or [openshell.drivers.].socket_path. The VM driver now enters ComputeRuntime through the same acquired remote endpoint path, while Docker, Podman, and Kubernetes retain their in-process drivers. Require --drivers/OPENSHELL_DRIVERS when pairing an ad-hoc socket endpoint so the socket does not imply a magic driver name, and keep reserved in-tree names unavailable for unmanaged socket endpoints. Co-authored-by: Evan Lezar Signed-off-by: st-gr <38470677+st-gr@users.noreply.github.com> Signed-off-by: Evan Lezar --- architecture/compute-runtimes.md | 4 +- crates/openshell-core/src/config.rs | 70 ++++- crates/openshell-server/src/cli.rs | 188 ++++++++++++- crates/openshell-server/src/compute/mod.rs | 168 +++++++++--- crates/openshell-server/src/compute/vm.rs | 13 +- crates/openshell-server/src/config_file.rs | 4 +- crates/openshell-server/src/lib.rs | 290 +++++++++++++++++---- docs/reference/gateway-config.mdx | 21 ++ docs/reference/sandbox-compute-drivers.mdx | 31 ++- 9 files changed, 672 insertions(+), 117 deletions(-) diff --git a/architecture/compute-runtimes.md b/architecture/compute-runtimes.md index 092f2c049..5024ec515 100644 --- a/architecture/compute-runtimes.md +++ b/architecture/compute-runtimes.md @@ -33,7 +33,8 @@ when a sandbox create request asks for GPU resources. | Docker | Local development with Docker available. | Container plus nested sandbox namespace. | Uses host networking so loopback gateway endpoints work from the supervisor. | | Podman | Rootless or single-machine deployments. | Container plus nested sandbox namespace. | Uses the Podman REST API, OCI image volumes, and CDI GPU devices when available. | | Kubernetes | Cluster deployment through Helm. | Pod plus nested sandbox namespace. | Uses Kubernetes API objects, service accounts, secrets, PVC-backed workspace storage, and GPU resources. | -| VM | Experimental microVM isolation. | Per-sandbox libkrun VM. | Gateway spawns `openshell-driver-vm` as a subprocess over a private, state-local Unix socket. The VM driver boots a cached bootstrap `rootfs.ext4`, prepares requested OCI images inside a bootstrap VM with `umoci`, attaches the prepared image disk read-only, and gives each sandbox a writable `overlay.ext4` for merged-root changes and runtime material. The driver persists each accepted launch request beside the overlay and restarts those VMs on driver startup without recreating the overlay. | +| VM | Experimental microVM isolation. | Per-sandbox libkrun VM. | Managed endpoint-backed driver. The gateway spawns `openshell-driver-vm`, waits for its Unix socket, and then consumes it through the same remote `compute_driver.proto` path used by unmanaged endpoint drivers. The VM driver boots a cached bootstrap `rootfs.ext4`, prepares requested OCI images inside a bootstrap VM with `umoci`, attaches the prepared image disk read-only, and gives each sandbox a writable `overlay.ext4` for merged-root changes and runtime material. The driver persists each accepted launch request beside the overlay and restarts those VMs on driver startup without recreating the overlay. | +| Extension | Out-of-tree drivers operated alongside the gateway. | Whatever boundary the driver implements. | Selected by a non-reserved custom `compute_drivers = [""]` entry with `[openshell.drivers.].socket_path`, or at launch time by pairing `--drivers ` with `--compute-driver-socket=`. Reserved built-in names such as `vm`, `docker`, `podman`, and `kubernetes` cannot be used as unmanaged socket endpoints. The gateway connects to a UDS the operator already provisioned, runs `GetCapabilities`, logs the advertised `driver_name`, and dispatches all sandbox lifecycle calls through `compute_driver.proto`. The driver process and socket lifecycle are operator-owned; the gateway does not spawn, supervise, or remove unmanaged extension drivers. The trust boundary is the socket's filesystem permissions: the operator must ensure only the gateway uid can read/write it. | Per-sandbox CPU and memory values currently enter the driver layer through template resource limits. Docker and Podman apply them as runtime limits. @@ -84,6 +85,7 @@ The supervisor must be available inside each sandbox workload: | Podman | Read-only OCI image volume containing the supervisor binary. | | Kubernetes | Sandbox pod image or pod template configuration. | | VM | Embedded in the guest rootfs bundle. | +| Extension | Defined by the out-of-tree driver. | Driver-controlled environment variables must override sandbox image or template values for sandbox ID, sandbox name, gateway endpoint, relay socket path, TLS diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index eaaf1e4a0..c66d32610 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -4,6 +4,7 @@ //! Configuration management for `OpenShell` components. use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; use std::fmt; #[cfg(unix)] use std::io::{Read, Write}; @@ -69,6 +70,27 @@ impl ComputeDriverKind { } } +/// Normalize a configured compute driver name. +/// +/// Built-in driver names and custom remote driver names share the same +/// selection namespace. The normalized value is lowercase ASCII and may contain +/// letters, digits, `-`, and `_`. +pub fn normalize_compute_driver_name(value: &str) -> Result { + let value = value.trim(); + if value.is_empty() { + return Err("compute driver name cannot be empty".to_string()); + } + if !value + .bytes() + .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_')) + { + return Err(format!( + "invalid compute driver name '{value}'. use ASCII letters, digits, '-' or '_'" + )); + } + Ok(value.to_ascii_lowercase()) +} + impl fmt::Display for ComputeDriverKind { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(self.as_str()) @@ -358,7 +380,14 @@ pub struct Config { /// The config shape allows multiple drivers so the gateway can evolve /// toward multi-backend routing. Current releases require exactly one /// configured driver. - pub compute_drivers: Vec, + pub compute_drivers: Vec, + + /// Operator-provided endpoints for named remote compute drivers. + /// + /// This is populated by CLI/env inputs such as `--compute-driver-socket`. + /// TOML-authored endpoints live under `[openshell.drivers.]` and are + /// resolved by the gateway config loader. + pub compute_driver_endpoints: BTreeMap, /// TTL for SSH session tokens, in seconds. 0 disables expiry. pub ssh_session_ttl_secs: u64, @@ -559,6 +588,7 @@ impl Config { gateway_jwt: None, database_url: String::new(), compute_drivers: vec![], + compute_driver_endpoints: BTreeMap::new(), ssh_session_ttl_secs: default_ssh_session_ttl_secs(), grpc_rate_limit_requests: None, grpc_rate_limit_window_secs: None, @@ -614,11 +644,27 @@ impl Config { /// Create a new configuration with the configured compute drivers. #[must_use] - pub fn with_compute_drivers(mut self, drivers: I) -> Self + pub fn with_compute_drivers(mut self, drivers: I) -> Self where - I: IntoIterator, + I: IntoIterator, + D: ToString, { - self.compute_drivers = drivers.into_iter().collect(); + self.compute_drivers = drivers + .into_iter() + .map(|driver| driver.to_string()) + .collect(); + self + } + + /// Register a Unix domain socket endpoint for a named remote driver. + #[must_use] + pub fn with_compute_driver_endpoint( + mut self, + name: impl Into, + socket: impl Into, + ) -> Self { + self.compute_driver_endpoints + .insert(name.into(), socket.into()); self } @@ -766,8 +812,8 @@ mod tests { use super::is_reachable_unix_socket; use super::{ ComputeDriverKind, Config, DEFAULT_SERVICE_ROUTING_DOMAIN, GatewayJwtConfig, detect_driver, - docker_host_unix_socket_path, is_unix_socket, podman_socket_candidates_from_env, - podman_socket_responds, + docker_host_unix_socket_path, is_unix_socket, normalize_compute_driver_name, + podman_socket_candidates_from_env, podman_socket_responds, }; #[cfg(unix)] use std::io::{Read as _, Write as _}; @@ -803,6 +849,18 @@ mod tests { assert!(err.contains("unsupported compute driver 'firecracker'")); } + #[test] + fn compute_driver_name_normalization_accepts_builtin_and_custom_names() { + assert_eq!(normalize_compute_driver_name(" VM ").unwrap(), "vm"); + assert_eq!( + normalize_compute_driver_name("Kyma_GPU-1").unwrap(), + "kyma_gpu-1" + ); + + let err = normalize_compute_driver_name("kyma/gpu").unwrap_err(); + assert!(err.contains("invalid compute driver name")); + } + #[test] fn config_defaults_to_loopback_bind_address() { let expected: SocketAddr = "127.0.0.1:17670".parse().expect("valid address"); diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index ce7734262..ef43dd405 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -109,7 +109,17 @@ struct RunArgs { value_delimiter = ',', value_parser = parse_compute_driver )] - drivers: Vec, + drivers: Vec, + + /// Path to a Unix domain socket served by a remote compute driver + /// implementing `compute_driver.proto`. + /// + /// When set, the socket is associated with the single driver name supplied + /// by `--drivers` or `OPENSHELL_DRIVERS`. Reserved built-in driver names + /// such as Docker, Podman, Kubernetes, and VM do not accept socket + /// endpoints. + #[arg(long, env = "OPENSHELL_COMPUTE_DRIVER_SOCKET")] + compute_driver_socket: Option, /// Disable TLS entirely — listen on plaintext HTTP. /// Use this when the gateway sits behind a reverse proxy or tunnel @@ -235,6 +245,7 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { if let Some(file) = file.as_ref() { merge_file_into_args(&mut args, &file.openshell.gateway, &matches); } + normalize_compute_driver_socket_args(&mut args, &matches)?; let local_tls = apply_runtime_defaults(&mut args)?; let local_jwt = defaults::complete_local_jwt_config()?; @@ -371,6 +382,13 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { args.grpc_rate_limit_requests, args.grpc_rate_limit_window_seconds, )?; + if let Some(socket) = args.compute_driver_socket.clone() { + let driver = args + .drivers + .first() + .expect("normalize_compute_driver_socket_args sets a driver for socket endpoints"); + config = config.with_compute_driver_endpoint(driver.clone(), socket); + } if let Some(ttl) = file .as_ref() @@ -457,8 +475,8 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { .into_diagnostic() } -fn parse_compute_driver(value: &str) -> std::result::Result { - value.parse() +fn parse_compute_driver(value: &str) -> std::result::Result { + openshell_core::config::normalize_compute_driver_name(value) } fn resolve_config_path(args: &RunArgs) -> Result> { @@ -657,10 +675,52 @@ fn validate_grpc_rate_limit_args(requests: Option, window_seconds: Option Result<()> { + let Some(socket) = args.compute_driver_socket.as_ref() else { + return Ok(()); + }; + if socket.as_os_str().is_empty() { + return Err(miette::miette!( + "--compute-driver-socket must not be an empty path" + )); + } + if arg_defaulted(matches, "drivers") { + return Err(miette::miette!( + "--compute-driver-socket requires --drivers or OPENSHELL_DRIVERS= to select a non-reserved compute driver name" + )); + } + + match args.drivers.as_slice() { + [driver] => { + let driver = openshell_core::config::normalize_compute_driver_name(driver) + .map_err(|err| miette::miette!("{err}"))?; + if matches!( + driver.parse::().ok(), + Some( + ComputeDriverKind::Docker + | ComputeDriverKind::Podman + | ComputeDriverKind::Kubernetes + | ComputeDriverKind::Vm + ) + ) { + return Err(miette::miette!( + "--compute-driver-socket cannot be combined with reserved built-in compute driver '{driver}'" + )); + } + args.drivers[0] = driver; + Ok(()) + } + drivers => Err(miette::miette!( + "--compute-driver-socket requires exactly one compute driver name, got: {}", + drivers.join(",") + )), + } +} + fn effective_single_driver(args: &RunArgs) -> Option { match args.drivers.as_slice() { [] => openshell_core::config::detect_driver(), - [driver] => Some(*driver), + [driver] => driver.parse().ok(), _ => None, } } @@ -1561,6 +1621,126 @@ ssh_session_ttl_secs = 1234 assert!(!super::is_singleplayer_driver(&multi)); } + #[test] + fn compute_driver_socket_flag_uses_explicit_driver_name() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_COMPUTE_DRIVER_SOCKET"); + let _g2 = EnvVarGuard::remove("OPENSHELL_DRIVERS"); + + let (mut args, matches) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--drivers", + "Kyma", + "--compute-driver-socket", + "/run/openshell/kyma.sock", + ]); + super::normalize_compute_driver_socket_args(&mut args, &matches).unwrap(); + assert_eq!( + args.compute_driver_socket.as_deref(), + Some(std::path::Path::new("/run/openshell/kyma.sock")) + ); + assert_eq!(args.drivers, ["kyma"]); + assert!(super::effective_single_driver(&args).is_none()); + } + + #[test] + fn compute_driver_socket_requires_explicit_driver_name() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_COMPUTE_DRIVER_SOCKET"); + let _g2 = EnvVarGuard::remove("OPENSHELL_DRIVERS"); + + let (mut args, matches) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--compute-driver-socket", + "/run/openshell/kyma.sock", + ]); + let err = super::normalize_compute_driver_socket_args(&mut args, &matches).unwrap_err(); + + assert!( + err.to_string().contains("requires --drivers "), + "unexpected error: {err}" + ); + } + + #[test] + fn compute_driver_socket_rejects_reserved_builtin_drivers() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_COMPUTE_DRIVER_SOCKET"); + let _g2 = EnvVarGuard::remove("OPENSHELL_DRIVERS"); + + let (mut args, matches) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--drivers", + "docker", + "--compute-driver-socket", + "/run/openshell/extension.sock", + ]); + let err = super::normalize_compute_driver_socket_args(&mut args, &matches).unwrap_err(); + assert!( + err.to_string() + .contains("cannot be combined with reserved built-in compute driver 'docker'"), + "unexpected error: {err}" + ); + } + + #[test] + fn compute_driver_socket_rejects_vm_endpoint() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_COMPUTE_DRIVER_SOCKET"); + let _g2 = EnvVarGuard::remove("OPENSHELL_DRIVERS"); + + let (mut args, matches) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--drivers", + "vm", + "--compute-driver-socket", + "/run/openshell/vm.sock", + ]); + let err = super::normalize_compute_driver_socket_args(&mut args, &matches).unwrap_err(); + assert!( + err.to_string() + .contains("cannot be combined with reserved built-in compute driver 'vm'"), + "unexpected error: {err}" + ); + } + + #[test] + fn compute_driver_socket_reads_from_env_var() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::set( + "OPENSHELL_COMPUTE_DRIVER_SOCKET", + "/var/run/openshell/kyma.sock", + ); + let _g2 = EnvVarGuard::set("OPENSHELL_DRIVERS", "kyma"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + super::normalize_compute_driver_socket_args(&mut args, &matches).unwrap(); + assert_eq!( + args.compute_driver_socket.as_deref(), + Some(std::path::Path::new("/var/run/openshell/kyma.sock")) + ); + assert_eq!(args.drivers, ["kyma"]); + } + #[test] fn file_populates_service_routing_fields() { let _lock = ENV_LOCK diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 272c3907f..19396f523 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -16,6 +16,7 @@ use crate::sandbox_watch::SandboxWatchBus; use crate::supervisor_session::SupervisorSessionRegistry; use crate::tracing_bus::TracingLogBus; use futures::{Stream, StreamExt}; +use hyper_util::rt::TokioIo; use openshell_core::ComputeDriverKind; use openshell_core::proto::compute::v1::{ CreateSandboxRequest, DeleteSandboxRequest, DriverCondition, DriverPlatformEvent, @@ -39,12 +40,16 @@ use openshell_driver_podman::{ use prost::Message; use std::fmt; use std::net::SocketAddr; +use std::path::{Path, PathBuf}; use std::pin::Pin; use std::sync::Arc; use std::time::Duration; +#[cfg(unix)] +use tokio::net::UnixStream; use tokio::sync::{Mutex, watch}; -use tonic::transport::Channel; +use tonic::transport::{Channel, Endpoint}; use tonic::{Code, Request, Status}; +use tower::service_fn; use tracing::{debug, info, warn}; type DriverWatchStream = Pin> + Send>>; @@ -104,11 +109,11 @@ pub use openshell_core::ComputeDriverError as ComputeError; #[derive(Debug)] pub struct ManagedDriverProcess { child: std::sync::Mutex>, - socket_path: std::path::PathBuf, + socket_path: PathBuf, } impl ManagedDriverProcess { - pub(crate) fn new(child: tokio::process::Child, socket_path: std::path::PathBuf) -> Self { + pub(crate) fn new(child: tokio::process::Child, socket_path: PathBuf) -> Self { Self { child: std::sync::Mutex::new(Some(child)), socket_path, @@ -125,6 +130,35 @@ impl Drop for ManagedDriverProcess { } } +#[derive(Debug)] +pub struct AcquiredRemoteDriverEndpoint { + pub(crate) name: String, + pub(crate) channel: Channel, + pub(crate) driver_process: Option>, +} + +impl AcquiredRemoteDriverEndpoint { + pub(crate) fn managed_builtin( + driver_kind: ComputeDriverKind, + channel: Channel, + driver_process: Arc, + ) -> Self { + Self { + name: driver_kind.as_str().to_string(), + channel, + driver_process: Some(driver_process), + } + } + + pub(crate) fn unmanaged(name: impl Into, channel: Channel) -> Self { + Self { + name: name.into(), + channel, + driver_process: None, + } + } +} + #[derive(Debug, Clone)] struct RemoteComputeDriver { channel: Channel, @@ -223,7 +257,7 @@ impl ComputeDriver for RemoteComputeDriver { #[derive(Clone)] pub struct ComputeRuntime { driver: SharedComputeDriver, - driver_kind: Option, + driver_name: String, shutdown_cleanup: Option>, startup_resume: Option>, _driver_process: Option>, @@ -247,7 +281,7 @@ impl fmt::Debug for ComputeRuntime { impl ComputeRuntime { #[allow(clippy::too_many_arguments)] async fn from_driver( - driver_kind: ComputeDriverKind, + driver_name: String, driver: SharedComputeDriver, shutdown_cleanup: Option>, startup_resume: Option>, @@ -260,15 +294,22 @@ impl ComputeRuntime { _allows_loopback_endpoints: bool, gateway_bind_addresses: Vec, ) -> Result { - let default_image = driver + let capabilities = driver .get_capabilities(Request::new(GetCapabilitiesRequest {})) .await .map_err(compute_error_from_status)? - .into_inner() - .default_image; + .into_inner(); + let driver_kind = driver_name.parse::().ok(); + info!( + configured_driver = %driver_name, + advertised_driver = %capabilities.driver_name, + in_tree = driver_kind.is_some(), + "Compute driver connected" + ); + let default_image = capabilities.default_image; Ok(Self { driver, - driver_kind: Some(driver_kind), + driver_name, shutdown_cleanup, startup_resume, _driver_process: driver_process, @@ -313,7 +354,7 @@ impl ComputeRuntime { let startup_resume: Arc = driver.clone(); let driver: SharedComputeDriver = driver; Self::from_driver( - ComputeDriverKind::Docker, + ComputeDriverKind::Docker.as_str().to_string(), driver, Some(shutdown_cleanup), Some(startup_resume), @@ -342,7 +383,7 @@ impl ComputeRuntime { .map_err(|err| ComputeError::Message(err.to_string()))?; let driver: SharedComputeDriver = Arc::new(ComputeDriverService::new(driver)); Self::from_driver( - ComputeDriverKind::Kubernetes, + ComputeDriverKind::Kubernetes.as_str().to_string(), driver, None, None, @@ -358,22 +399,21 @@ impl ComputeRuntime { .await } - pub(crate) async fn new_remote_vm( - channel: Channel, - driver_process: Option>, + pub(crate) async fn new_remote_driver( + endpoint: AcquiredRemoteDriverEndpoint, store: Arc, sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, supervisor_sessions: Arc, ) -> Result { - let driver: SharedComputeDriver = Arc::new(RemoteComputeDriver::new(channel)); + let driver: SharedComputeDriver = Arc::new(RemoteComputeDriver::new(endpoint.channel)); Self::from_driver( - ComputeDriverKind::Vm, + endpoint.name, driver, None, None, - driver_process, + endpoint.driver_process, store, sandbox_index, sandbox_watch_bus, @@ -398,7 +438,7 @@ impl ComputeRuntime { .map_err(|err| ComputeError::Message(err.to_string()))?; let driver: SharedComputeDriver = Arc::new(PodmanDriverService::new(driver)); Self::from_driver( - ComputeDriverKind::Podman, + ComputeDriverKind::Podman.as_str().to_string(), driver, None, None, @@ -421,7 +461,7 @@ impl ComputeRuntime { #[must_use] pub fn driver_kind(&self) -> Option { - self.driver_kind + self.driver_name.parse().ok() } #[must_use] @@ -431,7 +471,7 @@ impl ComputeRuntime { pub async fn validate_sandbox_create(&self, sandbox: &Sandbox) -> Result<(), Status> { let driver_sandbox = - driver_sandbox_from_public(sandbox, self.driver_kind).map_err(|status| *status)?; + driver_sandbox_from_public(sandbox, &self.driver_name).map_err(|status| *status)?; self.driver .validate_sandbox_create(Request::new(ValidateSandboxCreateRequest { sandbox: Some(driver_sandbox), @@ -447,7 +487,7 @@ impl ComputeRuntime { ) -> Result { let sandbox_id = sandbox.object_id().to_string(); let mut driver_sandbox = - driver_sandbox_from_public(&sandbox, self.driver_kind).map_err(|status| *status)?; + driver_sandbox_from_public(&sandbox, &self.driver_name).map_err(|status| *status)?; // Create with MustCreate condition to prevent duplicate creation race self.sandbox_index.update_from_sandbox(&sandbox); @@ -1471,9 +1511,48 @@ impl ComputeRuntime { } } +/// Connect to an unmanaged remote compute driver that is already listening on +/// `socket_path` and return the acquired endpoint. +/// +/// The gateway does not spawn or own the driver process — the operator is +/// responsible for placing the driver alongside the gateway and granting the +/// gateway uid read/write on the socket. The host portion of the URL is +/// ignored because the connector resolves to the UDS rather than DNS. +#[cfg(unix)] +pub async fn connect_remote_compute_driver( + name: impl Into, + socket_path: &Path, +) -> Result { + let socket_path: PathBuf = socket_path.to_path_buf(); + let display_path = socket_path.clone(); + let channel = Endpoint::from_static("http://[::]:50051") + .connect_with_connector(service_fn(move |_: tonic::transport::Uri| { + let socket_path = socket_path.clone(); + async move { UnixStream::connect(socket_path).await.map(TokioIo::new) } + })) + .await + .map_err(|e| { + ComputeError::Message(format!( + "failed to connect to remote compute driver socket '{}': {e}", + display_path.display() + )) + })?; + Ok(AcquiredRemoteDriverEndpoint::unmanaged(name, channel)) +} + +#[cfg(not(unix))] +pub async fn connect_remote_compute_driver( + _name: impl Into, + _socket_path: &Path, +) -> Result { + Err(ComputeError::Message( + "remote compute driver endpoints require unix domain socket support".to_string(), + )) +} + fn driver_sandbox_from_public( sandbox: &Sandbox, - driver_kind: Option, + driver_name: &str, ) -> Result> { Ok(DriverSandbox { id: sandbox.object_id().to_string(), @@ -1482,7 +1561,7 @@ fn driver_sandbox_from_public( spec: sandbox .spec .as_ref() - .map(|spec| driver_sandbox_spec_from_public(spec, driver_kind)) + .map(|spec| driver_sandbox_spec_from_public(spec, driver_name)) .transpose()?, status: sandbox.status.as_ref().map(driver_status_from_public), }) @@ -1490,7 +1569,7 @@ fn driver_sandbox_from_public( fn driver_sandbox_spec_from_public( spec: &SandboxSpec, - driver_kind: Option, + driver_name: &str, ) -> Result> { Ok(DriverSandboxSpec { log_level: spec.log_level.clone(), @@ -1498,7 +1577,7 @@ fn driver_sandbox_spec_from_public( template: spec .template .as_ref() - .map(|template| driver_sandbox_template_from_public(template, driver_kind)) + .map(|template| driver_sandbox_template_from_public(template, driver_name)) .transpose()?, gpu: spec.gpu, sandbox_token: String::new(), @@ -1507,7 +1586,7 @@ fn driver_sandbox_spec_from_public( fn driver_sandbox_template_from_public( template: &SandboxTemplate, - driver_kind: Option, + driver_name: &str, ) -> Result> { Ok(DriverSandboxTemplate { image: template.image.clone(), @@ -1516,21 +1595,17 @@ fn driver_sandbox_template_from_public( environment: template.environment.clone(), resources: extract_typed_resources(&template.resources), platform_config: build_platform_config(template), - driver_config: select_driver_config(&template.driver_config, driver_kind)?, + driver_config: select_driver_config(&template.driver_config, driver_name)?, }) } fn select_driver_config( config: &Option, - driver_kind: Option, + driver_name: &str, ) -> Result, Box> { let Some(config) = config else { return Ok(None); }; - let Some(driver_kind) = driver_kind else { - return Ok(None); - }; - let driver_name = driver_kind.as_str(); let Some(value) = config.fields.get(driver_name) else { return Ok(None); }; @@ -2025,7 +2100,7 @@ impl ComputeDriver for NoopTestDriver { pub async fn new_test_runtime(store: Arc) -> ComputeRuntime { ComputeRuntime { driver: Arc::new(NoopTestDriver), - driver_kind: None, + driver_name: "test".to_string(), shutdown_cleanup: None, startup_resume: None, _driver_process: None, @@ -2095,8 +2170,7 @@ mod tests { .collect(), }; - let selected = - select_driver_config(&Some(config), Some(ComputeDriverKind::Kubernetes)).unwrap(); + let selected = select_driver_config(&Some(config), "kubernetes").unwrap(); let selected = selected.expect("kubernetes config should be selected"); assert!(selected.fields.contains_key("node")); @@ -2113,12 +2187,27 @@ mod tests { .collect(), }; - let selected = - select_driver_config(&Some(config), Some(ComputeDriverKind::Kubernetes)).unwrap(); + let selected = select_driver_config(&Some(config), "kubernetes").unwrap(); assert!(selected.is_none()); } + #[test] + fn select_driver_config_forwards_named_remote_driver_block() { + let config = prost_types::Struct { + fields: std::iter::once(( + "kyma".to_string(), + struct_value([("pool", string_value("gpu"))]), + )) + .collect(), + }; + + let selected = select_driver_config(&Some(config), "kyma").unwrap(); + let selected = selected.expect("named remote config should be selected"); + + assert!(selected.fields.contains_key("pool")); + } + #[test] fn select_driver_config_rejects_non_object_matching_driver_block() { let config = prost_types::Struct { @@ -2126,8 +2215,7 @@ mod tests { .collect(), }; - let err = - select_driver_config(&Some(config), Some(ComputeDriverKind::Kubernetes)).unwrap_err(); + let err = select_driver_config(&Some(config), "kubernetes").unwrap_err(); assert_eq!(err.code(), Code::InvalidArgument); assert!(err.message().contains("template.driver_config.kubernetes")); @@ -2247,7 +2335,7 @@ mod tests { let store = Arc::new(Store::connect("sqlite::memory:").await.unwrap()); ComputeRuntime { driver, - driver_kind: None, + driver_name: "test-driver".to_string(), shutdown_cleanup: None, startup_resume, _driver_process: None, diff --git a/crates/openshell-server/src/compute/vm.rs b/crates/openshell-server/src/compute/vm.rs index efdc9daab..be88047f3 100644 --- a/crates/openshell-server/src/compute/vm.rs +++ b/crates/openshell-server/src/compute/vm.rs @@ -29,6 +29,7 @@ //! trait implementation registering the VM driver against the generic //! interface. +use super::AcquiredRemoteDriverEndpoint; #[cfg(unix)] use super::ManagedDriverProcess; #[cfg(unix)] @@ -37,7 +38,7 @@ use hyper_util::rt::TokioIo; use openshell_core::proto::compute::v1::{ GetCapabilitiesRequest, compute_driver_client::ComputeDriverClient, }; -use openshell_core::{Config, Error, Result}; +use openshell_core::{ComputeDriverKind, Config, Error, Result}; #[cfg(unix)] use std::os::unix::fs::{FileTypeExt, MetadataExt, PermissionsExt}; #[cfg(unix)] @@ -451,7 +452,7 @@ pub fn compute_driver_guest_tls_paths( pub async fn spawn( config: &Config, vm_config: &VmComputeConfig, -) -> Result<(Channel, Arc)> { +) -> Result { if vm_config.grpc_endpoint.trim().is_empty() { return Err(Error::config( "grpc_endpoint is required when using the vm compute driver", @@ -507,14 +508,18 @@ pub async fn spawn( })?; let channel = wait_for_compute_driver(&socket_path, &mut child).await?; let process = Arc::new(ManagedDriverProcess::new(child, socket_path)); - Ok((channel, process)) + Ok(AcquiredRemoteDriverEndpoint::managed_builtin( + ComputeDriverKind::Vm, + channel, + process, + )) } #[cfg(not(unix))] pub async fn spawn( _config: &Config, _vm_config: &VmComputeConfig, -) -> Result<(Channel, std::sync::Arc)> { +) -> Result { Err(Error::config( "the vm compute driver requires unix domain socket support", )) diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs index 39cf02bba..3875756dc 100644 --- a/crates/openshell-server/src/config_file.rs +++ b/crates/openshell-server/src/config_file.rs @@ -87,7 +87,7 @@ pub struct GatewayFileSection { // ── Drivers ────────────────────────────────────────────────────────── #[serde(default)] - pub compute_drivers: Option>, + pub compute_drivers: Option>, // ── Sandbox / SSH ──────────────────────────────────────────────────── #[serde(default)] @@ -631,7 +631,7 @@ version = 2 .expect("compute_drivers must be explicitly set in the RPM default config"); assert_eq!( drivers, - &[ComputeDriverKind::Podman], + &["podman".to_string()], "RPM default must pin compute_drivers to [podman] to prevent unexpected \ driver selection when Docker is also installed" ); diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index dda8708e0..3b32c06ed 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -10,14 +10,18 @@ //! - mTLS support //! //! TODO(driver-abstraction): `build_compute_runtime` still switches on -//! [`ComputeDriverKind`] and calls driver-specific constructors -//! ([`ComputeRuntime::new_kubernetes`], [`compute::vm::spawn`] + -//! [`ComputeRuntime::new_remote_vm`]). Once we have a generalized compute -//! driver interface, the per-arm wiring here should collapse to a single -//! driver-agnostic path that asks each registered driver to produce a -//! [`Channel`](tonic::transport::Channel) and hands the rest of the gateway a -//! uniform [`ComputeRuntime`]. The remaining VM plumbing now lives in -//! [`compute::vm`]; keep this file driver-agnostic going forward. +//! built-in driver names and calls driver-specific constructors +//! ([`ComputeRuntime::new_kubernetes`], [`ComputeRuntime::new_docker`], +//! [`compute::vm::spawn`] + [`ComputeRuntime::new_remote_driver`], +//! [`ComputeRuntime::new_podman`]). Endpoint-backed drivers now share the +//! remote `compute_driver.proto` path, so new remote drivers should enter +//! through named endpoint acquisition rather than gateway-wide socket side +//! channels. Once we have a generalized compute-driver registry, the remaining +//! per-arm wiring here should collapse to driver construction records that +//! produce either an in-process `SharedComputeDriver` or an acquired remote +//! endpoint, then hand the rest of the gateway a uniform [`ComputeRuntime`]. +//! The VM launch plumbing now lives in [`compute::vm`]; keep this file limited +//! to selecting and acquiring drivers. mod auth; pub mod certgen; @@ -47,6 +51,7 @@ mod ws_tunnel; use metrics_exporter_prometheus::PrometheusBuilder; use openshell_core::{ComputeDriverKind, Config, Error, Result}; +use serde::Deserialize; use std::collections::HashMap; use std::io::ErrorKind; use std::net::SocketAddr; @@ -721,12 +726,14 @@ async fn build_compute_runtime( tracing_log_bus: TracingLogBus, supervisor_sessions: Arc, ) -> Result { - let driver = configured_compute_driver(config)?; - info!(driver = %driver, "Using compute driver"); - warn_if_kubernetes_sandbox_jwt_expiry_disabled(config, driver); + let driver = configured_compute_driver(config, file)?; + info!(driver = %driver.name(), "Using compute driver"); + if let ConfiguredComputeDriver::Builtin(kind) = &driver { + warn_if_kubernetes_sandbox_jwt_expiry_disabled(config, *kind); + } match driver { - ComputeDriverKind::Kubernetes => { + ConfiguredComputeDriver::Builtin(ComputeDriverKind::Kubernetes) => { let mut k8s = kubernetes_config_from_file(file)?; if let Ok(size) = std::env::var("OPENSHELL_K8S_WORKSPACE_DEFAULT_STORAGE_SIZE") { k8s.workspace_default_storage_size = size; @@ -742,7 +749,7 @@ async fn build_compute_runtime( .await .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))) } - ComputeDriverKind::Docker => ComputeRuntime::new_docker( + ConfiguredComputeDriver::Builtin(ComputeDriverKind::Docker) => ComputeRuntime::new_docker( config.clone(), docker_config.clone(), store, @@ -753,21 +760,7 @@ async fn build_compute_runtime( ) .await .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))), - ComputeDriverKind::Vm => { - let (channel, driver_process) = compute::vm::spawn(config, vm_config).await?; - ComputeRuntime::new_remote_vm( - channel, - Some(driver_process), - store, - sandbox_index, - sandbox_watch_bus, - tracing_log_bus, - supervisor_sessions, - ) - .await - .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))) - } - ComputeDriverKind::Podman => { + ConfiguredComputeDriver::Builtin(ComputeDriverKind::Podman) => { let mut podman = podman_config_from_file(file)?; podman.gateway_port = config.bind_address.port(); if let Ok(p) = std::env::var("OPENSHELL_PODMAN_SOCKET") { @@ -789,6 +782,40 @@ async fn build_compute_runtime( .await .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))) } + ConfiguredComputeDriver::Builtin(ComputeDriverKind::Vm) => { + let endpoint = compute::vm::spawn(config, vm_config).await?; + ComputeRuntime::new_remote_driver( + endpoint, + store, + sandbox_index, + sandbox_watch_bus, + tracing_log_bus, + supervisor_sessions, + ) + .await + .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))) + } + ConfiguredComputeDriver::Remote(remote) => { + let RemoteComputeDriverSelection { name, socket_path } = remote; + info!( + driver = %name, + socket = %socket_path.display(), + "Using remote compute driver endpoint" + ); + let endpoint = compute::connect_remote_compute_driver(name, &socket_path) + .await + .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}")))?; + ComputeRuntime::new_remote_driver( + endpoint, + store, + sandbox_index, + sandbox_watch_bus, + tracing_log_bus, + supervisor_sessions, + ) + .await + .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))) + } } } @@ -868,35 +895,117 @@ fn apply_podman_local_tls_defaults( Ok(()) } -fn configured_compute_driver(config: &Config) -> Result { +#[derive(Debug, Clone)] +enum ConfiguredComputeDriver { + Builtin(ComputeDriverKind), + Remote(RemoteComputeDriverSelection), +} + +impl ConfiguredComputeDriver { + fn name(&self) -> &str { + match self { + Self::Builtin(kind) => kind.as_str(), + Self::Remote(remote) => &remote.name, + } + } +} + +#[derive(Debug, Clone)] +struct RemoteComputeDriverSelection { + name: String, + socket_path: PathBuf, +} + +#[derive(Debug, Deserialize)] +#[serde(deny_unknown_fields)] +struct RemoteComputeDriverConfig { + socket_path: PathBuf, +} + +fn configured_compute_driver( + config: &Config, + file: Option<&config_file::ConfigFile>, +) -> Result { match config.compute_drivers.as_slice() { [] => match openshell_core::config::detect_driver() { Some(ComputeDriverKind::Vm) => Err(Error::config( "vm compute driver is opt-in only; set --drivers vm or OPENSHELL_DRIVERS=vm", )), - Some(driver) => Ok(driver), + Some(driver) => Ok(ConfiguredComputeDriver::Builtin(driver)), None => Err(Error::config( "no compute driver configured and auto-detection found no suitable driver; \ set --drivers or OPENSHELL_DRIVERS to kubernetes, podman, docker, or vm", )), }, - [ - driver @ (ComputeDriverKind::Kubernetes - | ComputeDriverKind::Vm - | ComputeDriverKind::Docker - | ComputeDriverKind::Podman), - ] => Ok(*driver), + [driver] => resolve_configured_compute_driver(driver, config, file), drivers => Err(Error::config(format!( "multiple compute drivers are not supported yet; configured drivers: {}", - drivers - .iter() - .map(ToString::to_string) - .collect::>() - .join(",") + drivers.join(",") ))), } } +fn resolve_configured_compute_driver( + driver_name: &str, + config: &Config, + file: Option<&config_file::ConfigFile>, +) -> Result { + let name = openshell_core::config::normalize_compute_driver_name(driver_name) + .map_err(Error::config)?; + let driver_kind = builtin_compute_driver(&name); + if let Some(socket_path) = config.compute_driver_endpoints.get(&name) { + if driver_kind.is_some() { + return Err(Error::config(format!( + "compute driver '{name}' is a reserved built-in driver and cannot be selected with a socket endpoint" + ))); + } + return Ok(ConfiguredComputeDriver::Remote( + RemoteComputeDriverSelection { + name, + socket_path: socket_path.clone(), + }, + )); + } + + if let Some(kind) = driver_kind { + return Ok(ConfiguredComputeDriver::Builtin(kind)); + } + + let socket_path = remote_driver_socket_from_file(&name, file)?; + Ok(ConfiguredComputeDriver::Remote( + RemoteComputeDriverSelection { name, socket_path }, + )) +} + +fn builtin_compute_driver(name: &str) -> Option { + name.parse().ok() +} + +fn remote_driver_socket_from_file( + name: &str, + file: Option<&config_file::ConfigFile>, +) -> Result { + let Some(file) = file else { + return Err(Error::config(format!( + "compute driver '{name}' is not a built-in driver; configure [openshell.drivers.{name}].socket_path or pass --drivers {name} with --compute-driver-socket" + ))); + }; + let Some(raw) = file.openshell.drivers.get(name) else { + return Err(Error::config(format!( + "compute driver '{name}' is not a built-in driver; configure [openshell.drivers.{name}].socket_path" + ))); + }; + let config = raw + .clone() + .try_into::() + .map_err(|err| { + Error::config(format!( + "invalid [openshell.drivers.{name}] table for remote compute driver: {err}" + )) + })?; + Ok(config.socket_path) +} + fn kubernetes_sandbox_jwt_expiry_disabled(config: &Config, driver: ComputeDriverKind) -> bool { matches!(driver, ComputeDriverKind::Kubernetes) && config @@ -916,7 +1025,7 @@ fn warn_if_kubernetes_sandbox_jwt_expiry_disabled(config: &Config, driver: Compu #[cfg(test)] mod tests { use super::{ - ConnectionProtocol, MultiplexService, ServerState, TlsAcceptor, + ConfiguredComputeDriver, ConnectionProtocol, MultiplexService, ServerState, TlsAcceptor, allow_plaintext_service_http, classify_initial_bytes, configured_compute_driver, gateway_listener_addresses, is_benign_tls_handshake_failure, kubernetes_config_for_k8s_sa_bootstrap, kubernetes_sandbox_jwt_expiry_disabled, @@ -928,6 +1037,7 @@ mod tests { }; use std::io::{Error, ErrorKind}; use std::net::SocketAddr; + use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; use tempfile::{TempDir, tempdir}; @@ -1228,14 +1338,14 @@ mod tests { #[test] fn configured_compute_driver_triggers_auto_detection_when_empty() { - let config = Config::new(None).with_compute_drivers([]); + let config = Config::new(None).with_compute_drivers(std::iter::empty::()); // Empty drivers triggers auto-detection, which may return Some or None // depending on the environment. This test verifies the auto-detection path // is taken rather than immediately returning an error. - let result = configured_compute_driver(&config); + let result = configured_compute_driver(&config, None); // Either we get a detected driver or an error about none being detected. match result { - Ok(driver) => { + Ok(ConfiguredComputeDriver::Builtin(driver)) => { assert!( matches!( driver, @@ -1246,6 +1356,9 @@ mod tests { "auto-detected unexpected driver: {driver:?}" ); } + Ok(ConfiguredComputeDriver::Remote(remote)) => { + panic!("auto-detection returned remote driver: {remote:?}"); + } Err(e) => { assert!( e.to_string() @@ -1260,7 +1373,7 @@ mod tests { fn configured_compute_driver_rejects_multiple_entries() { let config = Config::new(None) .with_compute_drivers([ComputeDriverKind::Kubernetes, ComputeDriverKind::Podman]); - let err = configured_compute_driver(&config).unwrap_err(); + let err = configured_compute_driver(&config, None).unwrap_err(); assert!( err.to_string() .contains("multiple compute drivers are not supported yet") @@ -1271,27 +1384,90 @@ mod tests { #[test] fn configured_compute_driver_accepts_podman() { let config = Config::new(None).with_compute_drivers([ComputeDriverKind::Podman]); - assert_eq!( - configured_compute_driver(&config).unwrap(), - ComputeDriverKind::Podman - ); + let driver = configured_compute_driver(&config, None).unwrap(); + assert!(matches!( + driver, + ConfiguredComputeDriver::Builtin(ComputeDriverKind::Podman) + )); } #[test] fn configured_compute_driver_accepts_vm() { let config = Config::new(None).with_compute_drivers([ComputeDriverKind::Vm]); - assert_eq!( - configured_compute_driver(&config).unwrap(), - ComputeDriverKind::Vm - ); + let driver = configured_compute_driver(&config, None).unwrap(); + assert!(matches!( + driver, + ConfiguredComputeDriver::Builtin(ComputeDriverKind::Vm) + )); } #[test] fn configured_compute_driver_accepts_docker() { let config = Config::new(None).with_compute_drivers([ComputeDriverKind::Docker]); - assert_eq!( - configured_compute_driver(&config).unwrap(), - ComputeDriverKind::Docker + let driver = configured_compute_driver(&config, None).unwrap(); + assert!(matches!( + driver, + ConfiguredComputeDriver::Builtin(ComputeDriverKind::Docker) + )); + } + + #[test] + fn configured_compute_driver_resolves_named_remote_from_file() { + let file: super::config_file::ConfigFile = toml::from_str( + r#" +[openshell.gateway] +compute_drivers = ["kyma"] + +[openshell.drivers.kyma] +socket_path = "/run/openshell/kyma.sock" +"#, + ) + .unwrap(); + let config = Config::new(None).with_compute_drivers(["kyma"]); + + let driver = configured_compute_driver(&config, Some(&file)).unwrap(); + + match driver { + ConfiguredComputeDriver::Remote(remote) => { + assert_eq!(remote.name, "kyma"); + assert_eq!( + remote.socket_path, + PathBuf::from("/run/openshell/kyma.sock") + ); + } + ConfiguredComputeDriver::Builtin(other) => { + panic!("expected remote driver, got builtin driver {other:?}") + } + } + } + + #[test] + fn configured_compute_driver_rejects_vm_endpoint_from_config() { + let config = Config::new(None) + .with_compute_drivers([ComputeDriverKind::Vm]) + .with_compute_driver_endpoint("vm", "/run/openshell/vm.sock"); + + let err = configured_compute_driver(&config, None).unwrap_err(); + + assert!( + err.to_string() + .contains("reserved built-in driver and cannot be selected with a socket endpoint"), + "unexpected error: {err}" + ); + } + + #[test] + fn configured_compute_driver_rejects_builtin_endpoint() { + let config = Config::new(None) + .with_compute_drivers([ComputeDriverKind::Docker]) + .with_compute_driver_endpoint("docker", "/run/openshell/docker.sock"); + + let err = configured_compute_driver(&config, None).unwrap_err(); + + assert!( + err.to_string() + .contains("cannot be selected with a socket endpoint"), + "unexpected error: {err}" ); } diff --git a/docs/reference/gateway-config.mdx b/docs/reference/gateway-config.mdx index ff4542136..8b6e695d5 100644 --- a/docs/reference/gateway-config.mdx +++ b/docs/reference/gateway-config.mdx @@ -306,3 +306,24 @@ guest_tls_ca = "/var/lib/openshell/guest-tls/ca.pem" guest_tls_cert = "/var/lib/openshell/guest-tls/client.pem" guest_tls_key = "/var/lib/openshell/guest-tls/client-key.pem" ``` + +### Extension Driver + +Extension drivers run outside the gateway and expose the +`compute_driver.proto` gRPC service on a Unix socket. Use a non-reserved driver +name; built-in names such as `vm`, `docker`, `podman`, and `kubernetes` cannot +be selected through unmanaged socket endpoints. The selected driver name is the +key used for driver-owned sandbox config such as `template.driver_config.`. + +```toml +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:17670" +log_level = "info" +compute_drivers = ["kyma"] + +[openshell.drivers.kyma] +socket_path = "/run/openshell/kyma-compute-driver.sock" +``` diff --git a/docs/reference/sandbox-compute-drivers.mdx b/docs/reference/sandbox-compute-drivers.mdx index 95a319c37..9e71cc638 100644 --- a/docs/reference/sandbox-compute-drivers.mdx +++ b/docs/reference/sandbox-compute-drivers.mdx @@ -21,7 +21,9 @@ Configure the compute driver on the gateway. Current releases accept one driver compute_drivers = ["docker"] ``` -Supported values are `docker`, `podman`, `kubernetes`, and `vm`. +Reserved built-in values are `docker`, `podman`, `kubernetes`, and `vm`. +Non-reserved names select an extension driver and require a +`socket_path` in `[openshell.drivers.]`. When `compute_drivers` is unset, the gateway auto-detects Kubernetes, then Podman, then Docker by CLI availability or a local Unix socket. The VM driver is never auto-detected; configure it explicitly with `compute_drivers = ["vm"]` or set `OPENSHELL_DRIVERS=vm` in the launch environment. @@ -29,10 +31,33 @@ Common gateway options: | Gateway TOML option | Description | |---|---| -| `compute_drivers = [""]` | Select the compute driver. Supported values are `docker`, `podman`, `kubernetes`, and `vm`. | +| `compute_drivers = [""]` | Select the compute driver. Built-in values are `docker`, `podman`, `kubernetes`, and `vm`; custom names require `[openshell.drivers.].socket_path`. | Set driver-specific values such as sandbox images, callback endpoints, network names, TLS material, and VM sizing in the gateway TOML file. See the [Gateway Configuration File](./gateway-config) reference for the full `[openshell.drivers.]` schema. +Extension drivers use the same `compute_driver.proto` gRPC surface as the +managed VM driver. For an out-of-tree driver, choose a driver name and point +the gateway at the Unix socket the operator has already provisioned: + +```toml +[openshell.gateway] +compute_drivers = ["kyma"] + +[openshell.drivers.kyma] +socket_path = "/run/openshell/kyma.sock" +``` + +For a launch-time socket override, pass the same non-reserved driver name with +the socket path: + +```shell +openshell-gateway --drivers kyma --compute-driver-socket /run/openshell/kyma.sock +``` + +The gateway does not spawn, supervise, or delete extension drivers. The +operator must protect the socket so only the gateway uid can access it. +Reserved built-in names cannot be selected through unmanaged socket endpoints. + Sandbox create supports `--cpu` and `--memory` for per-sandbox compute sizing. Docker and Podman apply them as runtime limits. Kubernetes applies them as both container requests and limits. The VM driver accepts the fields but currently @@ -225,7 +250,7 @@ compute_drivers = ["vm"] For a launch-time override, set `OPENSHELL_DRIVERS=vm` in the gateway environment and restart the service. -Configure VM driver values such as `grpc_endpoint`, `driver_dir`, `state_dir`, `default_image`, `bootstrap_image`, `vcpus`, `mem_mib`, `overlay_disk_mib`, `krun_log_level`, and `guest_tls_*` in `[openshell.drivers.vm]`. The VM `state_dir` stores overlay disks, console logs, runtime state, image-rootfs cache, and the private `run/compute-driver.sock` socket. +Configure VM driver values such as `grpc_endpoint`, `driver_dir`, `state_dir`, `default_image`, `bootstrap_image`, `vcpus`, `mem_mib`, `overlay_disk_mib`, `krun_log_level`, and `guest_tls_*` in `[openshell.drivers.vm]`. The VM `state_dir` stores overlay disks, console logs, runtime state, image-rootfs cache, and the private `run/compute-driver.sock` socket. The VM socket path is managed by the gateway and is not configurable through remote endpoint settings. The gateway starts `openshell-driver-vm` over a private Unix socket and passes its process ID so the driver can reject unexpected local clients. The driver's standalone TCP listener is disabled unless `--allow-unauthenticated-tcp` is set for local development. From 02d9b3191f129c99809d1e166fbdf48a512da26c Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 17 Jun 2026 14:58:57 +0200 Subject: [PATCH 2/2] test(server): cover remote compute driver UDS lifecycle Signed-off-by: Evan Lezar --- crates/openshell-server/src/compute/mod.rs | 97 ++++++ crates/openshell-server/src/lib.rs | 2 + crates/openshell-server/src/test_support.rs | 311 ++++++++++++++++++++ 3 files changed, 410 insertions(+) create mode 100644 crates/openshell-server/src/test_support.rs diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 19396f523..b7ca8f21c 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -3421,6 +3421,103 @@ mod tests { ); } + #[tokio::test] + #[cfg(unix)] + async fn remote_compute_driver_forwards_lifecycle_calls_over_uds() { + use crate::test_support::{FakeComputeDriver, FakeComputeDriverCall}; + + let dir = tempfile::tempdir().unwrap(); + let socket_path = dir.path().join("compute-driver.sock"); + let driver = FakeComputeDriver::new() + .with_driver_name("fake-remote-driver") + .with_default_image("openshell/sandbox:remote"); + let _server = driver.serve_uds(&socket_path).unwrap(); + + let endpoint = connect_remote_compute_driver("external-test", &socket_path) + .await + .unwrap(); + let store = Arc::new(Store::connect("sqlite::memory:").await.unwrap()); + let runtime = ComputeRuntime::new_remote_driver( + endpoint, + store, + SandboxIndex::new(), + SandboxWatchBus::new(), + TracingLogBus::new(), + Arc::new(SupervisorSessionRegistry::new()), + ) + .await + .unwrap(); + + let mut sandbox = sandbox_record("sb-uds", "uds-sandbox", SandboxPhase::Provisioning); + sandbox.spec = Some(SandboxSpec { + log_level: "debug".to_string(), + template: Some(SandboxTemplate { + image: "ghcr.io/nvidia/openshell/sandbox:test".to_string(), + driver_config: Some(prost_types::Struct { + fields: [ + ( + "external-test".to_string(), + struct_value([("pool", string_value("ci"))]), + ), + ( + "docker".to_string(), + struct_value([("network_mode", string_value("bridge"))]), + ), + ] + .into_iter() + .collect(), + }), + ..Default::default() + }), + ..Default::default() + }); + + runtime.validate_sandbox_create(&sandbox).await.unwrap(); + runtime.create_sandbox(sandbox, None).await.unwrap(); + assert!(runtime.delete_sandbox("uds-sandbox").await.unwrap()); + + let calls = driver.calls(); + assert_eq!(calls.len(), 4, "unexpected calls: {calls:?}"); + assert!(matches!(calls[0], FakeComputeDriverCall::GetCapabilities)); + + let validated = match &calls[1] { + FakeComputeDriverCall::ValidateSandboxCreate { + sandbox: Some(sandbox), + } => sandbox, + other => panic!("expected ValidateSandboxCreate call, got {other:?}"), + }; + assert_eq!(validated.id, "sb-uds"); + assert_eq!(validated.name, "uds-sandbox"); + let driver_config = validated + .spec + .as_ref() + .and_then(|spec| spec.template.as_ref()) + .and_then(|template| template.driver_config.as_ref()) + .expect("selected driver_config should be forwarded"); + assert!(driver_config.fields.contains_key("pool")); + assert!(!driver_config.fields.contains_key("network_mode")); + + let created = match &calls[2] { + FakeComputeDriverCall::CreateSandbox { + sandbox: Some(sandbox), + } => sandbox, + other => panic!("expected CreateSandbox call, got {other:?}"), + }; + assert_eq!(created.id, "sb-uds"); + assert_eq!(created.name, "uds-sandbox"); + + match &calls[3] { + FakeComputeDriverCall::DeleteSandbox { + sandbox_id, + sandbox_name, + } => { + assert_eq!(sandbox_id, "sb-uds"); + assert_eq!(sandbox_name, "uds-sandbox"); + } + other => panic!("expected DeleteSandbox call, got {other:?}"), + } + } + #[tokio::test] async fn create_sandbox_returns_resource_version_one() { let runtime = test_runtime(Arc::new(TestDriver::default())).await; diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 3b32c06ed..d0dbb1681 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -43,6 +43,8 @@ mod service_routing; mod ssh_sessions; pub mod supervisor_session; mod telemetry; +#[cfg(any(test, feature = "test-support"))] +pub mod test_support; mod tls; #[cfg(test)] pub(crate) mod tls_test_utils; diff --git a/crates/openshell-server/src/test_support.rs b/crates/openshell-server/src/test_support.rs new file mode 100644 index 000000000..9cd80d6ed --- /dev/null +++ b/crates/openshell-server/src/test_support.rs @@ -0,0 +1,311 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Test fixtures for exercising gateway integration points. + +use futures::{Stream, stream}; +#[cfg(unix)] +use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer; +use openshell_core::proto::compute::v1::{ + CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse, + DriverSandbox, GetCapabilitiesRequest, GetCapabilitiesResponse, GetSandboxRequest, + GetSandboxResponse, ListSandboxesRequest, ListSandboxesResponse, StopSandboxRequest, + StopSandboxResponse, ValidateSandboxCreateRequest, ValidateSandboxCreateResponse, + WatchSandboxesEvent, WatchSandboxesRequest, compute_driver_server::ComputeDriver, +}; +use std::collections::HashMap; +#[cfg(unix)] +use std::io; +#[cfg(unix)] +use std::path::{Path, PathBuf}; +use std::pin::Pin; +use std::sync::{Arc, Mutex}; +#[cfg(unix)] +use std::task::{Context, Poll}; +#[cfg(unix)] +use tokio::net::{UnixListener, UnixStream}; +#[cfg(unix)] +use tokio::task::JoinHandle; +use tonic::{Request, Response, Status}; + +type WatchStream = Pin> + Send>>; + +#[derive(Debug, Clone, PartialEq)] +pub enum FakeComputeDriverCall { + GetCapabilities, + ValidateSandboxCreate { + sandbox: Option, + }, + GetSandbox { + sandbox_id: String, + sandbox_name: String, + }, + ListSandboxes, + CreateSandbox { + sandbox: Option, + }, + StopSandbox { + sandbox_id: String, + sandbox_name: String, + }, + DeleteSandbox { + sandbox_id: String, + sandbox_name: String, + }, + WatchSandboxes, +} + +#[derive(Debug, Clone)] +pub struct FakeComputeDriver { + state: Arc>, +} + +#[derive(Debug)] +struct FakeComputeDriverState { + driver_name: String, + driver_version: String, + default_image: String, + sandboxes: HashMap, + calls: Vec, +} + +impl Default for FakeComputeDriver { + fn default() -> Self { + Self::new() + } +} + +impl FakeComputeDriver { + #[must_use] + pub fn new() -> Self { + Self { + state: Arc::new(Mutex::new(FakeComputeDriverState { + driver_name: "fake-compute-driver".to_string(), + driver_version: "test".to_string(), + default_image: "openshell/sandbox:test".to_string(), + sandboxes: HashMap::new(), + calls: Vec::new(), + })), + } + } + + #[must_use] + pub fn with_driver_name(self, driver_name: impl Into) -> Self { + self.with_state(|state| state.driver_name = driver_name.into()); + self + } + + #[must_use] + pub fn with_driver_version(self, driver_version: impl Into) -> Self { + self.with_state(|state| state.driver_version = driver_version.into()); + self + } + + #[must_use] + pub fn with_default_image(self, default_image: impl Into) -> Self { + self.with_state(|state| state.default_image = default_image.into()); + self + } + + #[must_use] + pub fn calls(&self) -> Vec { + self.with_state(|state| state.calls.clone()) + } + + pub fn clear_calls(&self) { + self.with_state(|state| state.calls.clear()); + } + + #[cfg(unix)] + pub fn serve_uds( + &self, + socket_path: impl AsRef, + ) -> io::Result { + let socket_path = socket_path.as_ref().to_path_buf(); + let listener = UnixListener::bind(&socket_path)?; + let driver = self.clone(); + let task = tokio::spawn(async move { + tonic::transport::Server::builder() + .add_service(ComputeDriverServer::new(driver)) + .serve_with_incoming(UnixIncoming { listener }) + .await + }); + Ok(FakeComputeDriverServerHandle { socket_path, task }) + } + + fn with_state(&self, f: impl FnOnce(&mut FakeComputeDriverState) -> R) -> R { + let mut state = self + .state + .lock() + .expect("fake compute driver state poisoned"); + f(&mut state) + } +} + +#[cfg(unix)] +#[derive(Debug)] +pub struct FakeComputeDriverServerHandle { + socket_path: PathBuf, + task: JoinHandle>, +} + +#[cfg(unix)] +impl Drop for FakeComputeDriverServerHandle { + fn drop(&mut self) { + self.task.abort(); + let _ = std::fs::remove_file(&self.socket_path); + } +} + +#[cfg(unix)] +struct UnixIncoming { + listener: UnixListener, +} + +#[cfg(unix)] +impl Stream for UnixIncoming { + type Item = io::Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match self.get_mut().listener.poll_accept(cx) { + Poll::Ready(Ok((stream, _addr))) => Poll::Ready(Some(Ok(stream))), + Poll::Ready(Err(err)) => Poll::Ready(Some(Err(err))), + Poll::Pending => Poll::Pending, + } + } +} + +#[tonic::async_trait] +impl ComputeDriver for FakeComputeDriver { + type WatchSandboxesStream = WatchStream; + + async fn get_capabilities( + &self, + _request: Request, + ) -> Result, Status> { + let response = self.with_state(|state| { + state.calls.push(FakeComputeDriverCall::GetCapabilities); + GetCapabilitiesResponse { + driver_name: state.driver_name.clone(), + driver_version: state.driver_version.clone(), + default_image: state.default_image.clone(), + } + }); + Ok(Response::new(response)) + } + + async fn validate_sandbox_create( + &self, + request: Request, + ) -> Result, Status> { + let sandbox = request.into_inner().sandbox; + self.with_state(|state| { + state + .calls + .push(FakeComputeDriverCall::ValidateSandboxCreate { sandbox }); + }); + Ok(Response::new(ValidateSandboxCreateResponse {})) + } + + async fn get_sandbox( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + let sandbox = self.with_state(|state| { + state.calls.push(FakeComputeDriverCall::GetSandbox { + sandbox_id: request.sandbox_id.clone(), + sandbox_name: request.sandbox_name.clone(), + }); + state + .sandboxes + .values() + .find(|sandbox| { + (!request.sandbox_id.is_empty() && sandbox.id == request.sandbox_id) + || (!request.sandbox_name.is_empty() + && sandbox.name == request.sandbox_name) + }) + .cloned() + }); + let sandbox = sandbox.ok_or_else(|| Status::not_found("sandbox not found"))?; + Ok(Response::new(GetSandboxResponse { + sandbox: Some(sandbox), + })) + } + + async fn list_sandboxes( + &self, + _request: Request, + ) -> Result, Status> { + let sandboxes = self.with_state(|state| { + state.calls.push(FakeComputeDriverCall::ListSandboxes); + state.sandboxes.values().cloned().collect() + }); + Ok(Response::new(ListSandboxesResponse { sandboxes })) + } + + async fn create_sandbox( + &self, + request: Request, + ) -> Result, Status> { + let sandbox = request.into_inner().sandbox; + self.with_state(|state| { + if let Some(sandbox) = sandbox.as_ref() { + state.sandboxes.insert(sandbox.id.clone(), sandbox.clone()); + } + state + .calls + .push(FakeComputeDriverCall::CreateSandbox { sandbox }); + }); + Ok(Response::new(CreateSandboxResponse {})) + } + + async fn stop_sandbox( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + self.with_state(|state| { + state.calls.push(FakeComputeDriverCall::StopSandbox { + sandbox_id: request.sandbox_id, + sandbox_name: request.sandbox_name, + }); + }); + Ok(Response::new(StopSandboxResponse {})) + } + + async fn delete_sandbox( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + let deleted = self.with_state(|state| { + state.calls.push(FakeComputeDriverCall::DeleteSandbox { + sandbox_id: request.sandbox_id.clone(), + sandbox_name: request.sandbox_name.clone(), + }); + if request.sandbox_id.is_empty() { + let Some(id) = state + .sandboxes + .iter() + .find(|(_, sandbox)| sandbox.name == request.sandbox_name) + .map(|(id, _)| id.clone()) + else { + return false; + }; + state.sandboxes.remove(&id).is_some() + } else { + state.sandboxes.remove(&request.sandbox_id).is_some() + } + }); + Ok(Response::new(DeleteSandboxResponse { deleted })) + } + + async fn watch_sandboxes( + &self, + _request: Request, + ) -> Result, Status> { + self.with_state(|state| state.calls.push(FakeComputeDriverCall::WatchSandboxes)); + Ok(Response::new(Box::pin(stream::empty()))) + } +}