From 291a7468c6f51ea6cdef18d26eadd1ca024eb4e9 Mon Sep 17 00:00:00 2001
From: Patrick Riel
Date: Fri, 12 Jun 2026 19:01:18 +0000
Subject: [PATCH 01/14] feat(bluefield): add bf-core contracts crate and workspace wiring

Introduce the openshell-driver-bluefield package as a set of private
workspace member crates, starting with bf-core: the shared VF handle,
role, claim, and lifecycle contracts the rest of the BlueField driver
builds on. No behavior is wired into the gateway yet.
---
 Cargo.toml                                    |   2 +-
 crates/openshell-driver-bluefield/Cargo.toml  |  19 +
 .../bf-core/Cargo.toml                        |  14 +
 .../bf-core/src/assignment.rs                 | 180 +++++++
 .../bf-core/src/claim.rs                      |  98 ++++
 .../bf-core/src/error.rs                      |  30 ++
 .../bf-core/src/handles.rs                    | 108 +++++
 .../bf-core/src/lib.rs                        |  25 +
 .../bf-core/src/lifecycle.rs                  | 444 ++++++++++++++++++
 .../bf-core/src/role.rs                       | 107 +++++
 .../bf-core/src/runtime.rs                    | 173 +++++++
 .../bf-core/src/state.rs                      |  40 ++
 crates/openshell-driver-bluefield/src/lib.rs  |  10 +
 13 files changed, 1249 insertions(+), 1 deletion(-)
 create mode 100644 crates/openshell-driver-bluefield/Cargo.toml
 create mode 100644 crates/openshell-driver-bluefield/bf-core/Cargo.toml
 create mode 100644 crates/openshell-driver-bluefield/bf-core/src/assignment.rs
 create mode 100644 crates/openshell-driver-bluefield/bf-core/src/claim.rs
 create mode 100644 crates/openshell-driver-bluefield/bf-core/src/error.rs
 create mode 100644 crates/openshell-driver-bluefield/bf-core/src/handles.rs
 create mode 100644 crates/openshell-driver-bluefield/bf-core/src/lib.rs
 create mode 100644 crates/openshell-driver-bluefield/bf-core/src/lifecycle.rs
 create mode 100644 crates/openshell-driver-bluefield/bf-core/src/role.rs
 create mode 100644 crates/openshell-driver-bluefield/bf-core/src/runtime.rs
 create mode 100644 crates/openshell-driver-bluefield/bf-core/src/state.rs
 create mode 100644 crates/openshell-driver-bluefield/src/lib.rs

diff --git a/Cargo.toml b/Cargo.toml
index 86025646a..5057b56cc 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@
 
 [workspace]
 resolver = "2"
-members = ["crates/*"]
+members = ["crates/*", "crates/openshell-driver-bluefield/bf-*"]
 
 [workspace.package]
 version = "0.0.0"
diff --git a/crates/openshell-driver-bluefield/Cargo.toml b/crates/openshell-driver-bluefield/Cargo.toml
new file mode 100644
index 000000000..658fa032b
--- /dev/null
+++ b/crates/openshell-driver-bluefield/Cargo.toml
@@ -0,0 +1,19 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+[package]
+name = "openshell-driver-bluefield"
+description = "BlueField compute driver package marker"
+version.workspace = true
+edition.workspace = true
+rust-version.workspace = true
+license.workspace = true
+repository.workspace = true
+publish = false
+
+[lib]
+name = "openshell_driver_bluefield"
+path = "src/lib.rs"
+
+[lints]
+workspace = true
diff --git a/crates/openshell-driver-bluefield/bf-core/Cargo.toml b/crates/openshell-driver-bluefield/bf-core/Cargo.toml
new file mode 100644
index 000000000..d901359f6
--- /dev/null
+++ b/crates/openshell-driver-bluefield/bf-core/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "bf-core"
+description = "Shared contracts for the OpenShell BlueField compute driver"
+version.workspace = true
+edition.workspace = true
+rust-version.workspace = true
+license.workspace = true
+publish = false
+
+[dependencies]
+async-trait = "0.1"
+serde = { workspace = true }
+serde_json = { workspace = true }
+tokio = { workspace = true }
diff --git a/crates/openshell-driver-bluefield/bf-core/src/assignment.rs b/crates/openshell-driver-bluefield/bf-core/src/assignment.rs
new file mode 100644
index 000000000..3a36f27f5
--- /dev/null
+++ b/crates/openshell-driver-bluefield/bf-core/src/assignment.rs
@@ -0,0 +1,180 @@
+//! The VF assignment the control-plane leader hands to a compute node.
+//!
+//! The leader allocates a VF, programs OVS via the DPU controller, then stamps
+//! the resulting assignment into the sandbox's `template.labels`. The
+//! compute-node role reads it back and binds exactly that VF. Carrying the
+//! assignment as labels keeps it on the existing `ComputeDriver` contract with
+//! no new proto, and makes it policy-stamped (a guest cannot forge it).
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+/// Label key prefix for all BlueField assignment fields.
+pub const LABEL_PREFIX: &str = "openshell.io/bluefield.";
+
+pub const LABEL_HOST_BDF: &str = "openshell.io/bluefield.host-bdf";
+pub const LABEL_LEASE_GENERATION: &str = "openshell.io/bluefield.lease-generation";
+pub const LABEL_GUEST_MAC: &str = "openshell.io/bluefield.guest-mac";
+pub const LABEL_ATTACHMENT_ID: &str = "openshell.io/bluefield.attachment-id";
+pub const LABEL_PF: &str = "openshell.io/bluefield.pf";
+pub const LABEL_VF_INDEX: &str = "openshell.io/bluefield.vf-index";
+
+/// A leader-decided VF assignment for one sandbox.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct BluefieldAssignment {
+    /// Host PCI BDF of the VF the compute node must bind.
+    pub host_bdf: String,
+    /// Controller lease generation. Carried for correlation/fencing; the
+    /// compute node never detaches, so it does not act on this directly.
+    pub lease_generation: u64,
+    /// Guest-visible VF MAC (the leader derives this deterministically).
+    pub guest_mac: String,
+    /// Controller attachment id, for logging/correlation.
+    pub attachment_id: String,
+    /// Optional cross-host coordinate; not required to bind.
+    pub pf: Option<String>,
+    pub vf_index: Option<u32>,
+}
+
+impl BluefieldAssignment {
+    /// True when the labels carry a (claimed) BlueField assignment. Used by the
+    /// compute node to fail closed when an unassigned sandbox arrives.
+    #[must_use]
+    pub fn is_present(labels: &HashMap<String, String>) -> bool {
+        labels.contains_key(LABEL_HOST_BDF)
+    }
+
+    /// Render the assignment as label key/value pairs.
+    #[must_use]
+    pub fn to_labels(&self) -> Vec<(String, String)> {
+        let mut out = vec![
+            (LABEL_HOST_BDF.to_string(), self.host_bdf.clone()),
+            (
+                LABEL_LEASE_GENERATION.to_string(),
+                self.lease_generation.to_string(),
+            ),
+            (LABEL_GUEST_MAC.to_string(), self.guest_mac.clone()),
+            (LABEL_ATTACHMENT_ID.to_string(), self.attachment_id.clone()),
+        ];
+        if let Some(pf) = &self.pf {
+            out.push((LABEL_PF.to_string(), pf.clone()));
+        }
+        if let Some(vf_index) = self.vf_index {
+            out.push((LABEL_VF_INDEX.to_string(), vf_index.to_string()));
+        }
+        out
+    }
+
+    /// Stamp the assignment into a labels map (overwriting any prior values).
+    pub fn apply(&self, labels: &mut HashMap<String, String>) {
+        for (key, value) in self.to_labels() {
+            labels.insert(key, value);
+        }
+    }
+
+    /// Parse an assignment from a labels map. Returns `Err` when a required
+    /// key is missing or malformed (the compute node treats this as fail-closed).
+    pub fn from_labels(labels: &HashMap<String, String>) -> Result<Self, String> {
+        let required = |key: &str| -> Result<String, String> {
+            labels
+                .get(key)
+                .map(|v| v.trim().to_string())
+                .filter(|v| !v.is_empty())
+                .ok_or_else(|| format!("missing required BlueField assignment label {key}"))
+        };
+
+        let host_bdf = required(LABEL_HOST_BDF)?;
+        let guest_mac = required(LABEL_GUEST_MAC)?;
+        let attachment_id = required(LABEL_ATTACHMENT_ID)?;
+        let lease_generation = required(LABEL_LEASE_GENERATION)?
+            .parse::<u64>()
+            .map_err(|err| format!("invalid {LABEL_LEASE_GENERATION}: {err}"))?;
+
+        let pf = labels
+            .get(LABEL_PF)
+            .map(|v| v.trim().to_string())
+            .filter(|v| !v.is_empty());
+        let vf_index = match labels.get(LABEL_VF_INDEX).map(|v| v.trim()) {
+            Some(v) if !v.is_empty() => Some(
+                v.parse::<u32>()
+                    .map_err(|err| format!("invalid {LABEL_VF_INDEX}: {err}"))?,
+            ),
+            _ => None,
+        };
+
+        Ok(Self {
+            host_bdf,
+            lease_generation,
+            guest_mac,
+            attachment_id,
+            pf,
+            vf_index,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn sample() -> BluefieldAssignment {
+        BluefieldAssignment {
+            host_bdf: "0000:03:00.2".to_string(),
+            lease_generation: 42,
+            guest_mac: "02:00:00:00:00:01".to_string(),
+            attachment_id: "bf-sb-1".to_string(),
+            pf: Some("0".to_string()),
+            vf_index: Some(3),
+        }
+    }
+
+    #[test]
+    fn round_trips_through_labels() {
+        let assignment = sample();
+        let mut labels = HashMap::new();
+        assignment.apply(&mut labels);
+        assert!(BluefieldAssignment::is_present(&labels));
+        assert_eq!(
+            BluefieldAssignment::from_labels(&labels).unwrap(),
+            assignment
+        );
+    }
+
+    #[test]
+    fn round_trips_without_optional_coordinate() {
+        let assignment = BluefieldAssignment {
+            pf: None,
+            vf_index: None,
+            ..sample()
+        };
+        let mut labels = HashMap::new();
+        assignment.apply(&mut labels);
+        assert_eq!(
+            BluefieldAssignment::from_labels(&labels).unwrap(),
+            assignment
+        );
+    }
+
+    #[test]
+    fn missing_required_label_is_rejected() {
+        let mut labels = HashMap::new();
+        sample().apply(&mut labels);
+        labels.remove(LABEL_HOST_BDF);
+        assert!(!BluefieldAssignment::is_present(&labels));
+        let err = BluefieldAssignment::from_labels(&labels).unwrap_err();
+        assert!(err.contains(LABEL_HOST_BDF));
+    }
+
+    #[test]
+    fn malformed_lease_generation_is_rejected() {
+        let mut labels = HashMap::new();
+        sample().apply(&mut labels);
+        labels.insert(
+            LABEL_LEASE_GENERATION.to_string(),
+            "not-a-number".to_string(),
+        );
+        let err = BluefieldAssignment::from_labels(&labels).unwrap_err();
+        assert!(err.contains(LABEL_LEASE_GENERATION));
+    }
+}
diff --git a/crates/openshell-driver-bluefield/bf-core/src/claim.rs b/crates/openshell-driver-bluefield/bf-core/src/claim.rs
new file mode 100644
index 000000000..fca06a0eb
--- /dev/null
+++ b/crates/openshell-driver-bluefield/bf-core/src/claim.rs
@@ -0,0 +1,98 @@
+//! Runtime-neutral BlueField resource claims.
+
+use serde::{Deserialize, Serialize};
+
+#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
+pub enum NetworkMode {
+    #[default]
+    ProxyOnly,
+    DirectDevice,
+}
+
+impl NetworkMode {
+    #[must_use]
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            Self::ProxyOnly => "proxy-only",
+            Self::DirectDevice => "direct-device",
+        }
+    }
+
+    #[must_use]
+    pub fn parse(value: &str) -> Option<Self> {
+        match value.trim().to_ascii_lowercase().as_str() {
+            "proxy" | "proxy-only" | "proxy_only" => Some(Self::ProxyOnly),
+            "direct" | "direct-device" | "direct_device" | "vf" | "sriov" => {
+                Some(Self::DirectDevice)
+            }
+            _ => None,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
+pub enum StorageMode {
+    #[default]
+    None,
+    Workspace,
+    VmDisk,
+}
+
+impl StorageMode {
+    #[must_use]
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            Self::None => "none",
+            Self::Workspace => "workspace",
+            Self::VmDisk => "vm-disk",
+        }
+    }
+
+    #[must_use]
+    pub fn parse(value: &str) -> Option<Self> {
+        match value.trim().to_ascii_lowercase().as_str() {
+            "" | "none" | "disabled" => Some(Self::None),
+            "workspace" | "workspaces" => Some(Self::Workspace),
+            "vm-disk" | "vm_disk" | "vmdisk" => Some(Self::VmDisk),
+            _ => None,
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct DpuClaim {
+    pub claim_id: String,
+    pub sandbox_id: String,
+    pub runtime: String,
+    pub network_mode: NetworkMode,
+    pub storage_mode: StorageMode,
+    pub attachment_id: Option<String>,
+    pub lease_generation: u64,
+    pub node: Option<String>,
+    pub workload_identity: Option<String>,
+    pub policy_hash: Option<String>,
+}
+
+impl DpuClaim {
+    #[must_use]
+    pub fn new(
+        claim_id: impl Into<String>,
+        sandbox_id: impl Into<String>,
+        runtime: impl Into<String>,
+        network_mode: NetworkMode,
+        storage_mode: StorageMode,
+    ) -> Self {
+        Self {
+            claim_id: claim_id.into(),
+            sandbox_id: sandbox_id.into(),
+            runtime: runtime.into(),
+            network_mode,
+            storage_mode,
+            attachment_id: None,
+            lease_generation: 0,
+            node: None,
+            workload_identity: None,
+            policy_hash: None,
+        }
+    }
+}
diff --git a/crates/openshell-driver-bluefield/bf-core/src/error.rs b/crates/openshell-driver-bluefield/bf-core/src/error.rs
new file mode 100644
index 000000000..09a71c2f8
--- /dev/null
+++ b/crates/openshell-driver-bluefield/bf-core/src/error.rs
@@ -0,0 +1,30 @@
+//! Error surface shared by BlueField driver crates.
+
+pub type Result<T> = std::result::Result<T, BluefieldError>;
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum BluefieldError {
+    InvalidConfig(String),
+    Unsupported(String),
+    ResourceExhausted(String),
+    Runtime(String),
+    Network(String),
+    Storage(String),
+    State(String),
+}
+
+impl std::fmt::Display for BluefieldError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::InvalidConfig(message)
+            | Self::Unsupported(message)
+            | Self::ResourceExhausted(message)
+            | Self::Runtime(message)
+            | Self::Network(message)
+            | Self::Storage(message)
+            | Self::State(message) => f.write_str(message),
+        }
+    }
+}
+
+impl std::error::Error for BluefieldError {}
diff --git a/crates/openshell-driver-bluefield/bf-core/src/handles.rs b/crates/openshell-driver-bluefield/bf-core/src/handles.rs
new file mode 100644
index 000000000..2cdb6a1f0
--- /dev/null
+++ b/crates/openshell-driver-bluefield/bf-core/src/handles.rs
@@ -0,0 +1,108 @@
+//! Shared BlueField handles that cross the driver, host, and DPU seam.
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct VfSlot {
+    pub id: String,
+    pub host_bdf: String,
+    pub pf: Option<String>,
+    pub vf_index: Option<u32>,
+    pub representor: Option<String>,
+    pub ovs_port: Option<String>,
+    pub guest_datapath_address: Option<String>,
+    pub guest_mac: Option<String>,
+}
+
+impl VfSlot {
+    #[must_use]
+    pub fn new(id: impl Into<String>, host_bdf: impl Into<String>) -> Self {
+        Self {
+            id: id.into(),
+            host_bdf: host_bdf.into(),
+            pf: None,
+            vf_index: None,
+            representor: None,
+            ovs_port: None,
+            guest_datapath_address: None,
+            guest_mac: None,
+        }
+    }
+
+    #[must_use]
+    pub fn with_pf(mut self, pf: impl Into<String>) -> Self {
+        self.pf = Some(pf.into());
+        self
+    }
+
+    #[must_use]
+    pub fn with_vf_index(mut self, vf_index: u32) -> Self {
+        self.vf_index = Some(vf_index);
+        self
+    }
+
+    #[must_use]
+    pub fn with_representor(mut self, representor: impl Into<String>) -> Self {
+        self.representor = Some(representor.into());
+        self
+    }
+
+    #[must_use]
+    pub fn with_ovs_port(mut self, ovs_port: impl Into<String>) -> Self {
+        self.ovs_port = Some(ovs_port.into());
+        self
+    }
+
+    #[must_use]
+    pub fn with_guest_datapath_address(mut self, address: impl Into<String>) -> Self {
+        self.guest_datapath_address = Some(address.into());
+        self
+    }
+
+    #[must_use]
+    pub fn with_guest_mac(mut self, mac: impl Into<String>) -> Self {
+        self.guest_mac = Some(mac.into());
+        self
+    }
+
+    #[must_use]
+    pub fn vf_ref(&self) -> Option<VfRef> {
+        match (&self.pf, self.vf_index) {
+            (Some(pf), Some(idx)) => Some(VfRef::new(pf.clone(), idx)),
+            _ => None,
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct VfRef {
+    pub pf: String,
+    pub vf_index: u32,
+}
+
+impl VfRef {
+    #[must_use]
+    pub fn new(pf: impl Into<String>, vf_index: u32) -> Self {
+        Self {
+            pf: pf.into(),
+            vf_index,
+        }
+    }
+}
+
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
+pub enum ProxyPlacement {
+    #[default]
+    None,
+    Dpu,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct AttachSpec {
+    pub sandbox_id: String,
+    pub vf: VfRef,
+    pub host_bdf: String,
+    pub representor: Option<String>,
+    pub guest_ip: Option<String>,
+    pub guest_mac: Option<String>,
+    pub openshell_endpoint: Option<String>,
+    pub sandbox_token: Option<String>,
+}
diff --git a/crates/openshell-driver-bluefield/bf-core/src/lib.rs b/crates/openshell-driver-bluefield/bf-core/src/lib.rs
new file mode 100644
index 000000000..ce31f5444
--- /dev/null
+++ b/crates/openshell-driver-bluefield/bf-core/src/lib.rs
@@ -0,0 +1,25 @@
+//! Shared contracts for the BlueField compute driver.
+
+pub mod assignment;
+pub mod claim;
+pub mod error;
+pub mod handles;
+pub mod lifecycle;
+pub mod role;
+pub mod runtime;
+pub mod state;
+
+pub use assignment::BluefieldAssignment;
+pub use claim::{DpuClaim, NetworkMode, StorageMode};
+pub use error::{BluefieldError, Result};
+pub use handles::{AttachSpec, ProxyPlacement, VfRef, VfSlot};
+pub use lifecycle::{
+    BluefieldLifecycleExtension, LaunchAbortReason, LifecycleActivation, LifecycleContext,
+    LifecycleRegistry, RestoreContext, RuntimePlan, SandboxIdentity,
+};
+pub use role::BluefieldRole;
+pub use runtime::{
+    RuntimeAdapter, RuntimeCapabilities, RuntimeCondition, RuntimeEvent, RuntimeEventKind,
+    RuntimeHandle, RuntimeResourceRequirements, RuntimeSandboxStatus, RuntimeWorkload,
+};
+pub use state::{SandboxRecord, SandboxRecordPhase};
diff --git a/crates/openshell-driver-bluefield/bf-core/src/lifecycle.rs b/crates/openshell-driver-bluefield/bf-core/src/lifecycle.rs
new file mode 100644
index 000000000..8a46dbf1d
--- /dev/null
+++ b/crates/openshell-driver-bluefield/bf-core/src/lifecycle.rs
@@ -0,0 +1,444 @@
+//! BlueField driver lifecycle extension framework.
+//!
+//! This mirrors the in-tree VM lifecycle extension hook chain, but the hooks
+//! run inside the external BlueField compute driver and apply to any runtime.
+
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+
+use crate::{DpuClaim, NetworkMode, Result, RuntimeHandle, RuntimeWorkload, StorageMode};
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct SandboxIdentity {
+    pub sandbox_id: String,
+    pub sandbox_name: String,
+    pub namespace: String,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct LifecycleContext {
+    pub sandbox: SandboxIdentity,
+    pub runtime: String,
+    pub network_mode: NetworkMode,
+    pub storage_mode: StorageMode,
+    pub node: Option<String>,
+    pub policy_hash: Option<String>,
+    pub labels: Vec<(String, String)>,
+    pub annotations: Vec<(String, String)>,
+}
+
+impl LifecycleContext {
+    #[must_use]
+    pub fn extension_enabled(&self, key: &str) -> bool {
+        let extension_label = format!("openshell.io/extension.{key}");
+        self.labels
+            .iter()
+            .chain(self.annotations.iter())
+            .any(|(name, value)| name == &extension_label && value == "enabled")
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct RuntimePlan {
+    pub runtime: String,
+    pub workload: RuntimeWorkload,
+    pub environment: Vec<(String, String)>,
+    pub labels: Vec<(String, String)>,
+    pub annotations: Vec<(String, String)>,
+    pub dpu_claim: Option<DpuClaim>,
+}
+
+impl RuntimePlan {
+    #[must_use]
+    pub fn new(runtime: impl Into<String>) -> Self {
+        Self {
+            runtime: runtime.into(),
+            workload: RuntimeWorkload::default(),
+            environment: Vec::new(),
+            labels: Vec::new(),
+            annotations: Vec::new(),
+            dpu_claim: None,
+        }
+    }
+
+    pub fn set_env(&mut self, key: impl Into<String>, value: impl Into<String>) {
+        self.environment.push((key.into(), value.into()));
+    }
+
+    pub fn set_label(&mut self, key: impl Into<String>, value: impl Into<String>) {
+        self.labels.push((key.into(), value.into()));
+    }
+
+    pub fn set_annotation(&mut self, key: impl Into<String>, value: impl Into<String>) {
+        self.annotations.push((key.into(), value.into()));
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum LaunchAbortReason {
+    RuntimeCreateFailed,
+    BeforeRuntimeCreateFailed,
+    DpuAttachFailed,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct RestoreContext {
+    pub sandbox: SandboxIdentity,
+    pub runtime: String,
+    pub runtime_handle: Option<RuntimeHandle>,
+    pub dpu_claim: Option<DpuClaim>,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum LifecycleActivation {
+    Global,
+    OnRequest { key: &'static str },
+}
+
+#[async_trait]
+pub trait BluefieldLifecycleExtension: std::fmt::Debug + Send + Sync {
+    fn name(&self) -> &'static str;
+
+    fn activation(&self) -> LifecycleActivation {
+        LifecycleActivation::Global
+    }
+
+    /// Pure planning hook.
+    async fn configure_runtime(
+        &self,
+        _ctx: &LifecycleContext,
+        _plan: &mut RuntimePlan,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    /// Side-effect hook before the runtime creates the workload.
+    async fn before_runtime_create(
+        &self,
+        _ctx: &LifecycleContext,
+        _plan: &mut RuntimePlan,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    /// Cleanup hook when runtime creation aborts.
+    async fn after_runtime_create_failed(
+        &self,
+        _ctx: &LifecycleContext,
+        _plan: &RuntimePlan,
+        _reason: LaunchAbortReason,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    /// Cleanup hook after the runtime deletes the workload.
+    async fn after_runtime_delete(
+        &self,
+        _ctx: &LifecycleContext,
+        _plan: &RuntimePlan,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    /// Re-adopt claims before restoring an existing runtime workload.
+    async fn before_runtime_restore(
+        &self,
+        _ctx: &RestoreContext,
+        _plan: &mut RuntimePlan,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    /// Reconcile DPU state after runtime restore completes.
+    async fn after_runtime_restore(
+        &self,
+        _ctx: &RestoreContext,
+        _plan: &RuntimePlan,
+    ) -> Result<()> {
+        Ok(())
+    }
+}
+
+#[derive(Debug, Default, Clone)]
+pub struct LifecycleRegistry {
+    extensions: Vec<Arc<dyn BluefieldLifecycleExtension>>,
+}
+
+impl LifecycleRegistry {
+    #[must_use]
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn push(&mut self, extension: Arc<dyn BluefieldLifecycleExtension>) {
+        self.extensions.push(extension);
+    }
+
+    #[must_use]
+    pub fn len(&self) -> usize {
+        self.extensions.len()
+    }
+
+    #[must_use]
+    pub fn is_empty(&self) -> bool {
+        self.extensions.is_empty()
+    }
+
+    fn active<'a>(
+        &'a self,
+        ctx: &'a LifecycleContext,
+    ) -> impl Iterator<Item = &'a Arc<dyn BluefieldLifecycleExtension>> {
+        self.extensions.iter().filter(move |extension| {
+            matches!(extension.activation(), LifecycleActivation::Global)
+                || matches!(
+                    extension.activation(),
+                    LifecycleActivation::OnRequest { key } if ctx.extension_enabled(key)
+                )
+        })
+    }
+
+    pub async fn configure_runtime(
+        &self,
+        ctx: &LifecycleContext,
+        plan: &mut RuntimePlan,
+    ) -> Result<()> {
+        for extension in self.active(ctx) {
+            extension.configure_runtime(ctx, plan).await?;
+        }
+        Ok(())
+    }
+
+    pub async fn before_runtime_create(
+        &self,
+        ctx: &LifecycleContext,
+        plan: &mut RuntimePlan,
+    ) -> Result<()> {
+        for extension in self.active(ctx) {
+            extension.before_runtime_create(ctx, plan).await?;
+        }
+        Ok(())
+    }
+
+    /// Runs every active cleanup hook in reverse order; returns the first error.
+    pub async fn after_runtime_create_failed(
+        &self,
+        ctx: &LifecycleContext,
+        plan: &RuntimePlan,
+        reason: LaunchAbortReason,
+    ) -> Result<()> {
+        let mut outcome = Ok(());
+        for extension in self.active(ctx).cloned().collect::<Vec<_>>().iter().rev() {
+            outcome = outcome.and(extension.after_runtime_create_failed(ctx, plan, reason).await);
+        }
+        outcome
+    }
+
+    /// Runs every active cleanup hook in reverse order; returns the first error.
+    pub async fn after_runtime_delete(
+        &self,
+        ctx: &LifecycleContext,
+        plan: &RuntimePlan,
+    ) -> Result<()> {
+        let mut outcome = Ok(());
+        for extension in self.active(ctx).cloned().collect::<Vec<_>>().iter().rev() {
+            outcome = outcome.and(extension.after_runtime_delete(ctx, plan).await);
+        }
+        outcome
+    }
+
+    pub async fn before_runtime_restore(
+        &self,
+        ctx: &RestoreContext,
+        plan: &mut RuntimePlan,
+    ) -> Result<()> {
+        let lifecycle_ctx = LifecycleContext {
+            sandbox: ctx.sandbox.clone(),
+            runtime: ctx.runtime.clone(),
+            network_mode: ctx
+                .dpu_claim
+                .as_ref()
+                .map(|claim| claim.network_mode.clone())
+                .unwrap_or_default(),
+            storage_mode: ctx
+                .dpu_claim
+                .as_ref()
+                .map(|claim| claim.storage_mode.clone())
+                .unwrap_or_default(),
+            node: ctx.dpu_claim.as_ref().and_then(|claim| claim.node.clone()),
+            policy_hash: ctx
+                .dpu_claim
+                .as_ref()
+                .and_then(|claim| claim.policy_hash.clone()),
+            labels: Vec::new(),
+            annotations: Vec::new(),
+        };
+        for extension in self.active(&lifecycle_ctx) {
+            extension.before_runtime_restore(ctx, plan).await?;
+        }
+        Ok(())
+    }
+
+    pub async fn after_runtime_restore(
+        &self,
+        ctx: &RestoreContext,
+        plan: &RuntimePlan,
+    ) -> Result<()> {
+        let lifecycle_ctx = LifecycleContext {
+            sandbox: ctx.sandbox.clone(),
+            runtime: ctx.runtime.clone(),
+            network_mode: ctx
+                .dpu_claim
+                .as_ref()
+                .map(|claim| claim.network_mode.clone())
+                .unwrap_or_default(),
+            storage_mode: ctx
+                .dpu_claim
+                .as_ref()
+                .map(|claim| claim.storage_mode.clone())
+                .unwrap_or_default(),
+            node: ctx.dpu_claim.as_ref().and_then(|claim| claim.node.clone()),
+            policy_hash: ctx
+                .dpu_claim
+                .as_ref()
+                .and_then(|claim| claim.policy_hash.clone()),
+            labels: Vec::new(),
+            annotations: Vec::new(),
+        };
+        for extension in self.active(&lifecycle_ctx) {
+            extension.after_runtime_restore(ctx, plan).await?;
+        }
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::{Arc, Mutex};
+
+    use super::*;
+
+    #[derive(Debug)]
+    struct RecordingExtension {
+        name: &'static str,
+        activation: LifecycleActivation,
+        events: Arc<Mutex<Vec<String>>>,
+    }
+
+    #[async_trait]
+    impl BluefieldLifecycleExtension for RecordingExtension {
+        fn name(&self) -> &'static str {
+            self.name
+        }
+
+        fn activation(&self) -> LifecycleActivation {
+            self.activation
+        }
+
+        async fn configure_runtime(
+            &self,
+            _ctx: &LifecycleContext,
+            _plan: &mut RuntimePlan,
+        ) -> Result<()> {
+            self.events
+                .lock()
+                .expect("events lock poisoned")
+                .push(format!("{}:configure", self.name));
+            Ok(())
+        }
+
+        async fn after_runtime_delete(
+            &self,
+            _ctx: &LifecycleContext,
+            _plan: &RuntimePlan,
+        ) -> Result<()> {
+            self.events
+                .lock()
+                .expect("events lock poisoned")
+                .push(format!("{}:delete", self.name));
+            Ok(())
+        }
+    }
+
+    fn ctx(labels: Vec<(String, String)>) -> LifecycleContext {
+        LifecycleContext {
+            sandbox: SandboxIdentity {
+                sandbox_id: "sb".to_string(),
+                sandbox_name: "sandbox".to_string(),
+                namespace: "default".to_string(),
+            },
+            runtime: "vm".to_string(),
+            network_mode: NetworkMode::ProxyOnly,
+            storage_mode: StorageMode::None,
+            node: None,
+            policy_hash: None,
+            labels,
+            annotations: Vec::new(),
+        }
+    }
+
+    #[tokio::test]
+    async fn registry_runs_cleanup_in_reverse_order() {
+        let events = Arc::new(Mutex::new(Vec::new()));
+        let mut registry = LifecycleRegistry::new();
+        registry.push(Arc::new(RecordingExtension {
+            name: "first",
+            activation: LifecycleActivation::Global,
+            events: events.clone(),
+        }));
+        registry.push(Arc::new(RecordingExtension {
+            name: "second",
+            activation: LifecycleActivation::Global,
+            events: events.clone(),
+        }));
+
+        let ctx = ctx(Vec::new());
+        let mut plan = RuntimePlan::new("vm");
+        registry.configure_runtime(&ctx, &mut plan).await.unwrap();
+        registry.after_runtime_delete(&ctx, &plan).await.unwrap();
+
+        assert_eq!(
+            *events.lock().expect("events lock poisoned"),
+            vec![
+                "first:configure".to_string(),
+                "second:configure".to_string(),
+                "second:delete".to_string(),
+                "first:delete".to_string()
+            ]
+        );
+    }
+
+    #[tokio::test]
+    async fn registry_filters_on_request_extensions() {
+        let events = Arc::new(Mutex::new(Vec::new()));
+        let mut registry = LifecycleRegistry::new();
+        registry.push(Arc::new(RecordingExtension {
+            name: "requested",
+            activation: LifecycleActivation::OnRequest { key: "network" },
+            events: events.clone(),
+        }));
+
+        let mut plan = RuntimePlan::new("vm");
+        registry
+            .configure_runtime(&ctx(Vec::new()), &mut plan)
+            .await
+            .unwrap();
+        assert!(events.lock().expect("events lock poisoned").is_empty());
+
+        registry
+            .configure_runtime(
+                &ctx(vec![(
+                    "openshell.io/extension.network".to_string(),
+                    "enabled".to_string(),
+                )]),
+                &mut plan,
+            )
+            .await
+            .unwrap();
+        assert_eq!(
+            *events.lock().expect("events lock poisoned"),
+            vec!["requested:configure".to_string()]
+        );
+    }
+}
diff --git a/crates/openshell-driver-bluefield/bf-core/src/role.rs b/crates/openshell-driver-bluefield/bf-core/src/role.rs
new file mode 100644
index 000000000..6a730a3f1
--- /dev/null
+++ b/crates/openshell-driver-bluefield/bf-core/src/role.rs
@@ -0,0 +1,107 @@
+//! Deployment role for the BlueField compute driver.
+//!
+//! A single driver binary runs in one of three roles, selected at startup.
+//! The role is workload-agnostic, so it lives in `bf-core` and is reused by
+//! every leaf driver (`bf-vm`, a future `bf-container`, ...).
+
+use std::fmt;
+use std::str::FromStr;
+
+use serde::{Deserialize, Serialize};
+
+/// Which part of the split topology this driver instance plays.
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "kebab-case")]
+pub enum BluefieldRole {
+    /// In-process control + compute on one node (dev / single host).
+    #[default]
+    AllInOne,
+    /// Leader: allocates VFs, programs OVS via the DPU controller, and
+    /// forwards sandbox lifecycle to a downstream compute-node driver. Never
+    /// binds a VF or launches a workload itself.
+    ControlPlane,
+    /// Follower: binds the leader-assigned VF and launches the workload.
+    /// Holds no control-plane endpoint.
+    ComputeNode,
+}
+
+impl BluefieldRole {
+    #[must_use]
+    pub fn as_str(self) -> &'static str {
+        match self {
+            Self::AllInOne => "all-in-one",
+            Self::ControlPlane => "control-plane",
+            Self::ComputeNode => "compute-node",
+        }
+    }
+
+    /// True when this role allocates VFs and drives the DPU controller.
+    #[must_use]
+    pub fn is_control_plane(self) -> bool {
+        matches!(self, Self::AllInOne | Self::ControlPlane)
+    }
+
+    /// True when this role binds a VF and launches the workload locally.
+    #[must_use]
+    pub fn runs_workload(self) -> bool {
+        matches!(self, Self::AllInOne | Self::ComputeNode)
+    }
+}
+
+impl fmt::Display for BluefieldRole {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+impl FromStr for BluefieldRole {
+    type Err = String;
+
+    fn from_str(value: &str) -> Result<Self, Self::Err> {
+        match value {
+            "all-in-one" => Ok(Self::AllInOne),
+            "control-plane" => Ok(Self::ControlPlane),
+            "compute-node" => Ok(Self::ComputeNode),
+            other => Err(format!(
+                "invalid BlueField role {other:?}; expected 'all-in-one', 'control-plane', or 'compute-node'"
+            )),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn round_trips_through_str() {
+        for role in [
+            BluefieldRole::AllInOne,
+            BluefieldRole::ControlPlane,
+            BluefieldRole::ComputeNode,
+        ] {
+            assert_eq!(role.as_str().parse::<BluefieldRole>().unwrap(), role);
+        }
+    }
+
+    #[test]
+    fn default_is_all_in_one() {
+        assert_eq!(BluefieldRole::default(), BluefieldRole::AllInOne);
+    }
+
+    #[test]
+    fn capability_predicates() {
+        assert!(BluefieldRole::ControlPlane.is_control_plane());
+        assert!(!BluefieldRole::ControlPlane.runs_workload());
+        assert!(BluefieldRole::ComputeNode.runs_workload());
+        assert!(!BluefieldRole::ComputeNode.is_control_plane());
+        assert!(BluefieldRole::AllInOne.is_control_plane());
+        assert!(BluefieldRole::AllInOne.runs_workload());
+    }
+
+    #[test]
+    fn rejects_unknown_role() {
+        let err = "leader".parse::<BluefieldRole>().unwrap_err();
+        assert!(err.contains("invalid BlueField role"));
+    }
+}
diff --git a/crates/openshell-driver-bluefield/bf-core/src/runtime.rs b/crates/openshell-driver-bluefield/bf-core/src/runtime.rs
new file mode 100644
index 000000000..f9d0fdd5f
--- /dev/null
+++ b/crates/openshell-driver-bluefield/bf-core/src/runtime.rs
@@ -0,0 +1,173 @@
+//! Runtime adapter contract.
+
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+
+use crate::{DpuClaim, Result, RuntimePlan};
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct RuntimeCapabilities {
+    pub name: String,
+    pub supports_proxy_only: bool,
+    pub supports_direct_device: bool,
+    pub supports_storage: bool,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct RuntimeHandle {
+    pub runtime: String,
+    pub sandbox_id: String,
+    pub namespace: String,
+    pub name: String,
+    pub native_id: String,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
+pub struct RuntimeResourceRequirements {
+    pub cpu_request: String,
+    pub cpu_limit: String,
+    pub memory_request: String,
+    pub memory_limit: String,
+}
+
+impl RuntimeResourceRequirements {
+    #[must_use]
+    pub fn is_empty(&self) -> bool {
+        self.cpu_request.is_empty()
+            && self.cpu_limit.is_empty()
+            && self.memory_request.is_empty()
+            && self.memory_limit.is_empty()
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct RuntimeWorkload {
+    pub sandbox_id: String,
+    pub sandbox_name: String,
+    pub namespace: String,
+    pub image: Option<String>,
+    pub log_level: Option<String>,
+    pub environment: Vec<(String, String)>,
+    pub template_environment: Vec<(String, String)>,
+    pub template_labels: Vec<(String, String)>,
+    pub agent_socket_path: Option<String>,
+    pub gpu: bool,
+    pub resources: Option<RuntimeResourceRequirements>,
+    pub platform_config: serde_json::Value,
+}
+
+impl Default for RuntimeWorkload {
+    fn default() -> Self {
+        Self {
+            sandbox_id: String::new(),
+            sandbox_name: String::new(),
+            namespace: String::new(),
+            image: None,
+            log_level: None,
+            environment: Vec::new(),
+            template_environment: Vec::new(),
+            template_labels: Vec::new(),
+            agent_socket_path: None,
+            gpu: false,
+            resources: None,
+            platform_config: serde_json::Value::Null,
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct RuntimeCondition {
+    pub r#type: String,
+    pub status: String,
+    pub reason: String,
+    pub message: String,
+    pub last_transition_time: String,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct RuntimeSandboxStatus {
+    pub handle: RuntimeHandle,
+    pub sandbox_name: String,
+    pub agent_fd: String,
+    pub sandbox_fd: String,
+    pub conditions: Vec<RuntimeCondition>,
+    pub deleting: bool,
+}
+
+impl RuntimeSandboxStatus {
+    #[must_use]
+    pub fn ready(handle: RuntimeHandle) -> Self {
+        Self {
+            sandbox_name: handle.name.clone(),
+            handle,
+            agent_fd: String::new(),
+            sandbox_fd: String::new(),
+            conditions: vec![RuntimeCondition {
+                r#type: "Ready".to_string(),
+                status: "True".to_string(),
+                reason: "RuntimeObserved".to_string(),
+                message: "Runtime workload observed".to_string(),
+                last_transition_time: String::new(),
+            }],
+            deleting: false,
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub enum RuntimeEventKind {
+    Created,
+    Updated,
+    Deleted,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct RuntimeEvent {
+    pub kind: RuntimeEventKind,
+    pub handle: RuntimeHandle,
+    pub message: String,
+}
+
+#[async_trait]
+pub trait RuntimeAdapter: std::fmt::Debug + Send + Sync {
+    fn name(&self) -> &'static str;
+    fn capabilities(&self) -> RuntimeCapabilities;
+
+    async fn validate_claim(&self, claim: &DpuClaim) -> Result<()>;
+
+    /// Validate the final runtime plan after BlueField lifecycle extensions
+    /// have had a chance to add DPU claim material.
+    async fn validate_plan(&self, plan: &RuntimePlan) -> Result<()> {
+        if let Some(claim) = &plan.dpu_claim {
+            self.validate_claim(claim).await?;
+        }
+        Ok(())
+    }
+
+    async fn create(&self, plan: RuntimePlan) -> Result<RuntimeHandle>;
+    async fn stop(&self, handle: &RuntimeHandle) -> Result<()>;
+    async fn delete(&self, handle: &RuntimeHandle) -> Result<()>;
+    async fn get(&self, sandbox_id: &str) -> Result<Option<RuntimeHandle>>;
+    async fn list(&self) -> Result<Vec<RuntimeHandle>>;
+
+    async fn status(&self, sandbox_id: &str) -> Result<Option<RuntimeSandboxStatus>> {
+        Ok(self.get(sandbox_id).await?.map(RuntimeSandboxStatus::ready))
+    }
+
+    async fn list_statuses(&self) -> Result<Vec<RuntimeSandboxStatus>> {
+        let handles = self.list().await?;
+        Ok(handles
+            .into_iter()
+            .map(RuntimeSandboxStatus::ready)
+            .collect())
+    }
+
+    async fn reconcile(
+        &self,
+        plan: &RuntimePlan,
+        existing: Option<RuntimeHandle>,
+    ) -> Result<Option<RuntimeHandle>> {
+        let _ = plan;
+        Ok(existing)
+    }
+}
diff --git a/crates/openshell-driver-bluefield/bf-core/src/state.rs b/crates/openshell-driver-bluefield/bf-core/src/state.rs
new file mode 100644
index 000000000..548813c6e
--- /dev/null
+++ b/crates/openshell-driver-bluefield/bf-core/src/state.rs
@@ -0,0 +1,40 @@
+//! Persisted BlueField driver state.
+ +use serde::{Deserialize, Serialize}; + +use crate::{DpuClaim, RuntimeHandle, RuntimePlan, SandboxIdentity}; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum SandboxRecordPhase { + Creating, + Ready, + Stopped, + Deleting, + Failed, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct SandboxRecord { + pub sandbox: SandboxIdentity, + pub runtime: String, + pub phase: SandboxRecordPhase, + pub plan: RuntimePlan, + pub runtime_handle: Option, + pub dpu_claim: Option, + pub message: Option, +} + +impl SandboxRecord { + #[must_use] + pub fn new(sandbox: SandboxIdentity, plan: RuntimePlan) -> Self { + Self { + runtime: plan.runtime.clone(), + sandbox, + phase: SandboxRecordPhase::Creating, + runtime_handle: None, + dpu_claim: plan.dpu_claim.clone(), + plan, + message: None, + } + } +} diff --git a/crates/openshell-driver-bluefield/src/lib.rs b/crates/openshell-driver-bluefield/src/lib.rs new file mode 100644 index 000000000..68a17b4b1 --- /dev/null +++ b/crates/openshell-driver-bluefield/src/lib.rs @@ -0,0 +1,10 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! `BlueField` compute driver package marker. +//! +//! The `bf-*` crates under this directory are private implementation crates. +//! They are workspace members for build and review boundaries, but this marker +//! crate intentionally re-exports nothing. + +pub const DRIVER_NAME: &str = "bluefield"; From 226bba90f296022ece6b26e7c02faa713bc81205 Mon Sep 17 00:00:00 2001 From: Patrick Riel Date: Fri, 12 Jun 2026 19:02:58 +0000 Subject: [PATCH 02/14] feat(bluefield): add bf-inventory VF discovery and pool bf-inventory turns host sysfs into a set of claimable VF slots and owns the per-sandbox claim/release bookkeeping (VfPool) used by the lifecycle extension to hand out one VF per sandbox. 
--- .../bf-inventory/Cargo.toml | 12 + .../bf-inventory/src/inventory.rs | 262 ++++++++++++++++++ .../bf-inventory/src/lib.rs | 14 + .../bf-inventory/src/pool.rs | 136 +++++++++ 4 files changed, 424 insertions(+) create mode 100644 crates/openshell-driver-bluefield/bf-inventory/Cargo.toml create mode 100644 crates/openshell-driver-bluefield/bf-inventory/src/inventory.rs create mode 100644 crates/openshell-driver-bluefield/bf-inventory/src/lib.rs create mode 100644 crates/openshell-driver-bluefield/bf-inventory/src/pool.rs diff --git a/crates/openshell-driver-bluefield/bf-inventory/Cargo.toml b/crates/openshell-driver-bluefield/bf-inventory/Cargo.toml new file mode 100644 index 000000000..cce233534 --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-inventory/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "bf-inventory" +description = "BlueField function inventory, discovery, and allocation" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +publish = false + +[dependencies] +bf-core = { path = "../bf-core" } +openshell-vfio = { path = "../../openshell-vfio" } diff --git a/crates/openshell-driver-bluefield/bf-inventory/src/inventory.rs b/crates/openshell-driver-bluefield/bf-inventory/src/inventory.rs new file mode 100644 index 000000000..8df572872 --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-inventory/src/inventory.rs @@ -0,0 +1,262 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! VF inventory discovery. +//! +//! Replaces hand-fed VF slots with discovery behind a trait, so the same +//! extension works in both topologies and is unit-testable against a mock +//! `/sys` (no hardware): +//! +//! - [`StaticVfInventory`] — an explicit slot list (pinned setups, tests). +//! - [`SysfsVfInventory`] — **host** side: enumerates a PF's VFs from +//! 
`/sys/bus/pci/devices//virtfn` to get each VF's BDF + index. +//! - [`SysfsRepresentorInventory`] — **DPU** side: enumerates switchdev +//! representor netdevs and reads `phys_port_name` (`pfXvfY`) to map a VF +//! coordinate to its representor / OVS port. +//! +//! The two sides agree on a [`VfRef`] = `(pf, vf_index)`. The host uses the +//! PF's PCI BDF as the `pf` key; the DPU uses the e-switch PF index. Mapping +//! one to the other on a given deployment is a config concern (the host PF +//! BDF that backs `pf0`), kept out of this mechanical discovery layer. + +use std::path::PathBuf; + +use bf_core::{VfRef, VfSlot}; +use openshell_vfio::SysfsRoot; + +/// Error surface for inventory discovery. +#[derive(Debug, Clone)] +pub enum VfError { + Discovery(String), +} + +impl core::fmt::Display for VfError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self { + Self::Discovery(m) => write!(f, "vf discovery failed: {m}"), + } + } +} + +impl std::error::Error for VfError {} + +pub type VfResult = Result; + +/// Source of the VF slots a [`super::pool::VfPool`] hands to sandboxes. +pub trait VfInventory: core::fmt::Debug + Send + Sync { + /// Enumerate all VF slots this inventory knows about. + fn discover(&self) -> VfResult>; + + /// Resolve the representor for a VF coordinate. Defaults to a scan of + /// [`discover`](Self::discover); sysfs impls may override for efficiency. + fn resolve_representor(&self, vf: &VfRef) -> VfResult> { + Ok(self + .discover()? + .into_iter() + .find(|s| s.pf.as_deref() == Some(vf.pf.as_str()) && s.vf_index == Some(vf.vf_index)) + .and_then(|s| s.representor)) + } +} + +/// Explicit, hand-fed inventory. Equivalent to the original `VfPool::new` +/// behavior; ideal for tests and pinned deployments. 
+#[derive(Debug, Default, Clone)] +pub struct StaticVfInventory { + slots: Vec, +} + +impl StaticVfInventory { + #[must_use] + pub fn new(slots: impl IntoIterator) -> Self { + Self { + slots: slots.into_iter().collect(), + } + } +} + +impl VfInventory for StaticVfInventory { + fn discover(&self) -> VfResult> { + Ok(self.slots.clone()) + } +} + +/// Host-side inventory: enumerates SR-IOV VFs of one or more PFs from sysfs. +#[derive(Debug)] +pub struct SysfsVfInventory { + sysfs: SysfsRoot, + /// PF PCI BDFs whose VFs are available to sandboxes. + pfs: Vec, + /// Safety cap on the per-PF `virtfn` scan. + max_vfs: u32, +} + +impl SysfsVfInventory { + #[must_use] + pub fn new(sysfs: SysfsRoot, pfs: impl IntoIterator) -> Self { + Self { + sysfs, + pfs: pfs.into_iter().collect(), + max_vfs: 256, + } + } +} + +impl VfInventory for SysfsVfInventory { + fn discover(&self) -> VfResult> { + let mut slots = Vec::new(); + for pf in &self.pfs { + let pf_dir = self.sysfs.pci_device(pf); + for index in 0..self.max_vfs { + let link = pf_dir.join(format!("virtfn{index}")); + // `symlink_metadata` so a dangling/!exists link stops the scan + // without following into a missing target. + if link.symlink_metadata().is_err() { + break; + } + let target = std::fs::read_link(&link).map_err(|e| { + VfError::Discovery(format!("read_link {}: {e}", link.display())) + })?; + let vf_bdf = target + .file_name() + .and_then(|n| n.to_str()) + .ok_or_else(|| { + VfError::Discovery(format!( + "virtfn target has no bdf: {}", + target.display() + )) + })? 
+ .to_string(); + let mut slot = VfSlot::new(vf_bdf.clone(), vf_bdf.clone()) + .with_pf(pf.clone()) + .with_vf_index(index); + if let Some(mac) = read_vf_mac(&self.sysfs, &vf_bdf) { + slot = slot.with_guest_mac(mac); + } + slots.push(slot); + } + } + Ok(slots) + } +} + +fn read_vf_mac(sysfs: &SysfsRoot, vf_bdf: &str) -> Option { + let net_dir = sysfs.pci_device(vf_bdf).join("net"); + let entries = std::fs::read_dir(net_dir).ok()?; + let mut ifaces = entries + .filter_map(Result::ok) + .map(|entry| entry.path()) + .collect::>(); + ifaces.sort(); + for iface in ifaces { + let Ok(address) = std::fs::read_to_string(iface.join("address")) else { + continue; + }; + let address = address.trim(); + if !address.is_empty() { + return Some(address.to_ascii_lowercase()); + } + } + None +} + +/// DPU-side inventory: maps VF coordinates to switchdev representor netdevs by +/// reading `phys_port_name` under the net sysfs tree. +#[derive(Debug)] +pub struct SysfsRepresentorInventory { + /// Base of the net sysfs tree (default `/sys/class/net`). + net_sysfs: PathBuf, +} + +impl Default for SysfsRepresentorInventory { + fn default() -> Self { + Self { + net_sysfs: PathBuf::from("/sys/class/net"), + } + } +} + +impl SysfsRepresentorInventory { + #[must_use] + pub fn new(net_sysfs: impl Into) -> Self { + Self { + net_sysfs: net_sysfs.into(), + } + } +} + +/// Parse a switchdev VF-representor `phys_port_name` (e.g. `pf0vf3`) into a +/// `(pf_index, vf_index)`. Returns `None` for non-VF ports (uplinks, PFs). 
+fn parse_phys_port_name(s: &str) -> Option<(u32, u32)> { + let s = s.trim(); + let pf_pos = s.find("pf")?; + let after_pf = &s[pf_pos + 2..]; + let vf_pos = after_pf.find("vf")?; + let pf_num: u32 = after_pf[..vf_pos].parse().ok()?; + let vf_num: u32 = after_pf[vf_pos + 2..].parse().ok()?; + Some((pf_num, vf_num)) +} + +impl VfInventory for SysfsRepresentorInventory { + fn discover(&self) -> VfResult> { + let mut slots = Vec::new(); + let entries = std::fs::read_dir(&self.net_sysfs).map_err(|e| { + VfError::Discovery(format!("read_dir {}: {e}", self.net_sysfs.display())) + })?; + for entry in entries { + let entry = entry.map_err(|e| VfError::Discovery(e.to_string()))?; + let ifname = entry.file_name().to_string_lossy().into_owned(); + let ppn_path = entry.path().join("phys_port_name"); + let Ok(ppn) = std::fs::read_to_string(&ppn_path) else { + continue; + }; + let Some((pf_index, vf_index)) = parse_phys_port_name(&ppn) else { + continue; + }; + slots.push( + VfSlot::new(ifname.clone(), String::new()) + .with_pf(pf_index.to_string()) + .with_vf_index(vf_index) + .with_representor(ifname.clone()) + .with_ovs_port(ifname), + ); + } + Ok(slots) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::os::unix::fs::symlink; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn temp_sysfs_root(name: &str) -> PathBuf { + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!("openshell-{name}-{}-{nonce}", std::process::id())) + } + + #[test] + fn sysfs_vf_inventory_reads_guest_mac_from_vf_netdev() { + let root = temp_sysfs_root("vf-mac"); + let devices = root.join("bus/pci/devices"); + let pf = devices.join("0000:03:00.0"); + let vf = devices.join("0000:03:00.2"); + std::fs::create_dir_all(&pf).unwrap(); + std::fs::create_dir_all(vf.join("net/enp3s0v0")).unwrap(); + std::fs::write(vf.join("net/enp3s0v0/address"), "86:7F:6E:5B:E0:7B\n").unwrap(); + symlink("../0000:03:00.2", 
pf.join("virtfn0")).unwrap(); + + let inventory = SysfsVfInventory::new(SysfsRoot::new(&root), ["0000:03:00.0".to_string()]); + let slots = inventory.discover().unwrap(); + + assert_eq!(slots.len(), 1); + assert_eq!(slots[0].host_bdf, "0000:03:00.2"); + assert_eq!(slots[0].vf_index, Some(0)); + assert_eq!(slots[0].guest_mac.as_deref(), Some("86:7f:6e:5b:e0:7b")); + + std::fs::remove_dir_all(root).unwrap(); + } +} diff --git a/crates/openshell-driver-bluefield/bf-inventory/src/lib.rs b/crates/openshell-driver-bluefield/bf-inventory/src/lib.rs new file mode 100644 index 000000000..875e19740 --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-inventory/src/lib.rs @@ -0,0 +1,14 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! BlueField function inventory, discovery, and allocation. + +pub use bf_core::{VfRef, VfSlot}; + +pub mod inventory; +pub mod pool; + +pub use inventory::{ + StaticVfInventory, SysfsRepresentorInventory, SysfsVfInventory, VfError, VfInventory, VfResult, +}; +pub use pool::VfPool; diff --git a/crates/openshell-driver-bluefield/bf-inventory/src/pool.rs b/crates/openshell-driver-bluefield/bf-inventory/src/pool.rs new file mode 100644 index 000000000..8fcbc892a --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-inventory/src/pool.rs @@ -0,0 +1,136 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! VF slot pool. In-memory claim/release of BlueField VFs to sandboxes. + +use std::collections::HashMap; +use std::sync::Mutex; + +pub use bf_core::VfSlot; + +/// Inventory of VF slots with per-sandbox claim tracking. +#[derive(Debug, Default)] +pub struct VfPool { + slots: Vec, + /// sandbox_id -> slot index. 
+ claims: Mutex>, +} + +impl VfPool { + #[must_use] + pub fn new(slots: impl IntoIterator) -> Self { + Self { + slots: slots.into_iter().collect(), + claims: Mutex::new(HashMap::new()), + } + } + + /// Build a pool from a [`VfInventory`](super::inventory::VfInventory), + /// discovering the available slots at startup instead of hand-feeding them. + pub fn from_inventory( + inventory: &dyn crate::inventory::VfInventory, + ) -> Result { + Ok(Self::new(inventory.discover()?)) + } + + /// Claim a free slot for `sandbox_id`. Idempotent: a sandbox that already + /// holds a slot gets the same one back. Returns `None` when exhausted. + pub fn claim(&self, sandbox_id: &str) -> Option { + let mut claims = self.claims.lock().expect("vf pool claims lock poisoned"); + if let Some(&idx) = claims.get(sandbox_id) { + return self.slots.get(idx).cloned(); + } + let used: std::collections::HashSet = claims.values().copied().collect(); + let free = (0..self.slots.len()).find(|idx| !used.contains(idx))?; + claims.insert(sandbox_id.to_string(), free); + self.slots.get(free).cloned() + } + + /// Claim a specific host BDF for `sandbox_id`. Idempotent for an existing + /// matching claim and fails if another sandbox owns the slot. + pub fn claim_by_host_bdf(&self, sandbox_id: &str, host_bdf: &str) -> Option { + let mut claims = self.claims.lock().expect("vf pool claims lock poisoned"); + let idx = self + .slots + .iter() + .position(|slot| slot.host_bdf == host_bdf)?; + if let Some(&existing_idx) = claims.get(sandbox_id) { + return (existing_idx == idx) + .then(|| self.slots.get(idx).cloned()) + .flatten(); + } + if claims.values().any(|&claimed_idx| claimed_idx == idx) { + return None; + } + claims.insert(sandbox_id.to_string(), idx); + self.slots.get(idx).cloned() + } + + /// Return the slot with the given host BDF. 
+ #[must_use] + pub fn slot_by_host_bdf(&self, host_bdf: &str) -> Option { + self.slots + .iter() + .find(|slot| slot.host_bdf == host_bdf) + .cloned() + } + + /// Release the slot held by `sandbox_id`, if any. + pub fn release(&self, sandbox_id: &str) { + self.claims + .lock() + .expect("vf pool claims lock poisoned") + .remove(sandbox_id); + } +} + +#[cfg(test)] +mod tests { + use super::{VfPool, VfSlot}; + + #[test] + fn claim_is_idempotent_per_sandbox() { + let pool = VfPool::new([VfSlot::new("vf0", "0000:03:00.2")]); + let a = pool.claim("sandbox-1").unwrap(); + let b = pool.claim("sandbox-1").unwrap(); + assert_eq!(a, b); + } + + #[test] + fn distinct_sandboxes_get_distinct_slots_and_release_frees() { + let pool = VfPool::new([ + VfSlot::new("vf0", "0000:03:00.2"), + VfSlot::new("vf1", "0000:03:00.3"), + ]); + let s1 = pool.claim("sandbox-1").unwrap(); + let s2 = pool.claim("sandbox-2").unwrap(); + assert_ne!(s1.id, s2.id); + assert!(pool.claim("sandbox-3").is_none(), "pool exhausted"); + + pool.release("sandbox-1"); + let s3 = pool.claim("sandbox-3").unwrap(); + assert_eq!(s3.id, s1.id); + } + + #[test] + fn claim_by_host_bdf_reuses_matching_restore_slot() { + let pool = VfPool::new([ + VfSlot::new("vf0", "0000:03:00.2"), + VfSlot::new("vf1", "0000:03:00.3"), + ]); + + let restored = pool.claim_by_host_bdf("sandbox-1", "0000:03:00.3").unwrap(); + assert_eq!(restored.id, "vf1"); + assert_eq!( + pool.claim_by_host_bdf("sandbox-1", "0000:03:00.3") + .unwrap() + .id, + "vf1" + ); + assert!( + pool.claim_by_host_bdf("sandbox-2", "0000:03:00.3") + .is_none(), + "claimed restore slot is not reused by another sandbox" + ); + } +} From 9d2132f3570cb329a4c2f3337588c72bdb74cef8 Mon Sep 17 00:00:00 2001 From: Patrick Riel Date: Fri, 12 Jun 2026 19:02:59 +0000 Subject: [PATCH 03/14] feat(bluefield): add bf-vm VF passthrough lifecycle extension bf-vm plugs into the VM driver's lifecycle-extension seam. 
For each sandbox it claims a VF, checks host passthrough readiness, binds the VF to vfio-pci, persists the binding for restart recovery, and releases it on launch failure or delete. It also selects the BlueField guest kernel and wires the static guest-egress env contract. --- .../bf-vm/Cargo.toml | 28 + .../bf-vm/scripts/guest-egress-dropin.sh | 107 ++++ .../bf-vm/src/cli.rs | 224 ++++++++ .../bf-vm/src/config.rs | 167 ++++++ .../bf-vm/src/extension.rs | 534 ++++++++++++++++++ .../bf-vm/src/extensions/mod.rs | 37 ++ .../bf-vm/src/guest_egress.rs | 149 +++++ .../bf-vm/src/kernel.rs | 327 +++++++++++ .../bf-vm/src/lib.rs | 33 ++ .../bf-vm/src/slots.rs | 134 +++++ .../bf-vm/src/state.rs | 106 ++++ .../bf-vm/src/vf.rs | 137 +++++ 12 files changed, 1983 insertions(+) create mode 100644 crates/openshell-driver-bluefield/bf-vm/Cargo.toml create mode 100644 crates/openshell-driver-bluefield/bf-vm/scripts/guest-egress-dropin.sh create mode 100644 crates/openshell-driver-bluefield/bf-vm/src/cli.rs create mode 100644 crates/openshell-driver-bluefield/bf-vm/src/config.rs create mode 100644 crates/openshell-driver-bluefield/bf-vm/src/extension.rs create mode 100644 crates/openshell-driver-bluefield/bf-vm/src/extensions/mod.rs create mode 100644 crates/openshell-driver-bluefield/bf-vm/src/guest_egress.rs create mode 100644 crates/openshell-driver-bluefield/bf-vm/src/kernel.rs create mode 100644 crates/openshell-driver-bluefield/bf-vm/src/lib.rs create mode 100644 crates/openshell-driver-bluefield/bf-vm/src/slots.rs create mode 100644 crates/openshell-driver-bluefield/bf-vm/src/state.rs create mode 100644 crates/openshell-driver-bluefield/bf-vm/src/vf.rs diff --git a/crates/openshell-driver-bluefield/bf-vm/Cargo.toml b/crates/openshell-driver-bluefield/bf-vm/Cargo.toml new file mode 100644 index 000000000..463f1d1a2 --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "bf-vm" +description = "Bare-metal VM runtime adapter for 
the OpenShell BlueField driver" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +publish = false + +[features] +default = [] + +[dependencies] +bf-core = { path = "../bf-core" } +bf-inventory = { path = "../bf-inventory" } +clap = { workspace = true } +openshell-core = { path = "../../openshell-core", default-features = false } +openshell-driver-vm = { path = "../../openshell-driver-vm" } +openshell-vfio = { path = "../../openshell-vfio" } +serde = { workspace = true } +serde_json = { workspace = true } +sha2 = { workspace = true } +tokio = { workspace = true } +tonic = { workspace = true } +tracing = { workspace = true } + +[dev-dependencies] +tempfile = "3" diff --git a/crates/openshell-driver-bluefield/bf-vm/scripts/guest-egress-dropin.sh b/crates/openshell-driver-bluefield/bf-vm/scripts/guest-egress-dropin.sh new file mode 100644 index 000000000..1e891d8a5 --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/scripts/guest-egress-dropin.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# OpenShell BlueField VF egress setup. 
+set -eu
+
+# Return success when the interface name in $1 belongs to a loopback, virtual,
+# or container/bridge device (or the primary eth0) that must never be picked
+# as the BlueField VF.
+is_ignored_nic() {
+    case "$1" in
+        lo|dummy*|veth*|br-*|docker*|cni*|flannel*|eth0)
+            return 0
+            ;;
+    esac
+    return 1
+}
+
+# Print the name of the VF network interface and return 0, or return 1 when
+# no candidate exists yet.
+#
+# Pass 1 prefers an interface whose PCI vendor id is 0x15b3. Pass 2 falls back
+# to the first non-ignored interface that is backed by a device at all.
+find_bluefield_vf() {
+    # `path` is declared local too so the loop variable does not leak into
+    # the caller's scope.
+    local candidate vendor path
+
+    for path in /sys/class/net/*; do
+        # An unmatched glob stays literal; skip it.
+        [ -e "${path}" ] || continue
+        candidate="${path##*/}"
+        if is_ignored_nic "${candidate}"; then
+            continue
+        fi
+        if [ -r "${path}/device/vendor" ]; then
+            vendor="$(cat "${path}/device/vendor" 2>/dev/null || true)"
+            if [ "${vendor}" = "0x15b3" ]; then
+                printf '%s\n' "${candidate}"
+                return 0
+            fi
+        fi
+    done
+
+    for path in /sys/class/net/*; do
+        [ -e "${path}" ] || continue
+        candidate="${path##*/}"
+        if is_ignored_nic "${candidate}"; then
+            continue
+        fi
+        if [ -e "${path}/device" ]; then
+            printf '%s\n' "${candidate}"
+            return 0
+        fi
+    done
+
+    return 1
+}
+
+# Poll for the VF interface (five attempts, one second apart) and print its
+# name; report on stderr and return 1 when it never shows up.
+wait_for_bluefield_vf() {
+    local vf_nic
+
+    vf_nic=""
+    for _ in 1 2 3 4 5; do
+        if vf_nic="$(find_bluefield_vf)"; then
+            printf '%s\n' "${vf_nic}"
+            return 0
+        fi
+        sleep 1
+    done
+
+    echo "openshell: bluefield VF egress drop-in could not locate VF NIC" >&2
+    return 1
+}
+
+# Apply OPENSHELL_VM_DATA_MAC to the interface in $1 when that variable is set
+# and non-empty; otherwise leave the MAC untouched.
+set_optional_mac() {
+    local vf_nic="$1"
+
+    if [ -n "${OPENSHELL_VM_DATA_MAC:-}" ]; then
+        ip link set "${vf_nic}" down 2>/dev/null || true
+        ip link set dev "${vf_nic}" address "${OPENSHELL_VM_DATA_MAC}"
+    fi
+}
+
+# Bring the interface in $1 up, replace its addresses with
+# OPENSHELL_VM_DATA_IP, and point the default route at OPENSHELL_VM_DATA_GW.
+configure_static_ip() {
+    local vf_nic="$1"
+
+    ip link set "${vf_nic}" up
+    ip addr flush dev "${vf_nic}" 2>/dev/null || true
+    ip addr add "${OPENSHELL_VM_DATA_IP}" dev "${vf_nic}"
+    ip route replace default via "${OPENSHELL_VM_DATA_GW}" dev "${vf_nic}"
+}
+
+configure_resolv_conf() {
+    local resolv_conf
+
+    resolv_conf="${ROOT_PREFIX:-}/etc/resolv.conf"
+    mkdir -p "$(dirname "${resolv_conf}")" 2>/dev/null || true
+    {
+        echo '# OpenShell BlueField external-VF mode leaves DNS to the sandbox image or DPU-side policy.'
+ echo 'options timeout:1 attempts:1' + } > "${resolv_conf}" +} + +main() { + local vf_nic + + : "${OPENSHELL_VM_DATA_EGRESS:=}" + [ "${OPENSHELL_VM_DATA_EGRESS}" = "external-vf" ] || exit 0 + + : "${OPENSHELL_VM_DATA_IP:?OPENSHELL_VM_DATA_IP is required for BlueField VF egress}" + : "${OPENSHELL_VM_DATA_GW:?OPENSHELL_VM_DATA_GW is required for BlueField VF egress}" + + vf_nic="$(wait_for_bluefield_vf)" + set_optional_mac "${vf_nic}" + configure_static_ip "${vf_nic}" + configure_resolv_conf + + echo "openshell: bluefield VF egress configured nic=${vf_nic} ip=${OPENSHELL_VM_DATA_IP} gw=${OPENSHELL_VM_DATA_GW}" +} + +main "$@" diff --git a/crates/openshell-driver-bluefield/bf-vm/src/cli.rs b/crates/openshell-driver-bluefield/bf-vm/src/cli.rs new file mode 100644 index 000000000..945bc9253 --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/src/cli.rs @@ -0,0 +1,224 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! BlueField VM-driver CLI/env surface. +//! +//! The root driver binary exposes these flags, but the extension owns their +//! names, defaults, parsing, and conversion into [`BluefieldDriverConfig`]. + +use std::path::PathBuf; + +use bf_core::BluefieldRole; +use clap::Args; + +use super::{BluefieldDriverConfig, ProxyPlacement}; + +#[derive(Args, Debug, Clone, Default)] +pub struct BluefieldDriverArgs { + #[arg( + long = "bluefield", + env = "OPENSHELL_BLUEFIELD", + default_value_t = false + )] + pub enabled: bool, + + /// Deployment role: `all-in-one` (default), `control-plane`, or + /// `compute-node`. Selects how this driver instance behaves in the split + /// topology. 
+ #[arg( + long = "bluefield-role", + env = "OPENSHELL_BLUEFIELD_ROLE", + default_value = "all-in-one" + )] + pub role: String, + + #[arg( + long = "bluefield-controller-endpoint", + env = "OPENSHELL_BLUEFIELD_CONTROLLER_ENDPOINT" + )] + pub controller_endpoint: Option, + + #[arg(long = "bluefield-tls-dir", env = "OPENSHELL_BLUEFIELD_TLS_DIR")] + pub tls_dir: Option, + + #[arg( + long = "bluefield-tls-domain", + env = "OPENSHELL_BLUEFIELD_TLS_DOMAIN", + default_value = "bluefield-controller" + )] + pub tls_domain: String, + + #[arg(long = "bluefield-host-pf", env = "OPENSHELL_BLUEFIELD_HOST_PF")] + pub host_pf: Option, + + #[arg( + long = "bluefield-reserved-vf-index", + env = "OPENSHELL_BLUEFIELD_RESERVED_VF_INDEXES", + value_delimiter = ',' + )] + pub reserved_vf_indexes: Vec, + + #[arg(long = "bluefield-pf-key", env = "OPENSHELL_BLUEFIELD_PF_KEY")] + pub pf_key: Option, + + #[arg(long = "bluefield-snat-ip", env = "OPENSHELL_BLUEFIELD_SNAT_IP")] + pub snat_ip: Option, + + #[arg( + long = "bluefield-uplink-port", + env = "OPENSHELL_BLUEFIELD_UPLINK_PORT" + )] + pub uplink_port: Option, + + #[arg( + long = "bluefield-kernel-image", + env = "OPENSHELL_BLUEFIELD_KERNEL_IMAGE" + )] + pub kernel_image: Option, + + #[arg( + long = "bluefield-kernel-version", + env = "OPENSHELL_BLUEFIELD_KERNEL_VERSION" + )] + pub kernel_version: Option, + + #[arg( + long = "bluefield-kernel-sha256", + env = "OPENSHELL_BLUEFIELD_KERNEL_SHA256" + )] + pub kernel_sha256: Option, + + #[arg( + long = "bluefield-kernel-modules", + env = "OPENSHELL_BLUEFIELD_KERNEL_MODULES", + value_delimiter = ',' + )] + pub kernel_modules: Vec, + + #[arg( + long = "bluefield-egress-cidr", + env = "OPENSHELL_BLUEFIELD_EGRESS_CIDR" + )] + pub egress_cidr: Option, + + #[arg( + long = "bluefield-egress-cidr-pool", + env = "OPENSHELL_BLUEFIELD_EGRESS_CIDR_POOL", + value_delimiter = ',' + )] + pub egress_cidr_pool: Vec, + + #[arg( + long = "bluefield-egress-gateway", + env = "OPENSHELL_BLUEFIELD_EGRESS_GATEWAY" + 
)]
+    pub egress_gateway: Option,
+
+    #[arg(
+        long = "bluefield-egress-dns",
+        env = "OPENSHELL_BLUEFIELD_EGRESS_DNS",
+        value_delimiter = ','
+    )]
+    pub egress_dns: Vec,
+
+    #[arg(
+        long = "bluefield-proxy-placement",
+        env = "OPENSHELL_BLUEFIELD_PROXY_PLACEMENT",
+        default_value = "none"
+    )]
+    pub proxy_placement: String,
+
+    #[arg(
+        long = "bluefield-explicit-proxy-url",
+        env = "OPENSHELL_BLUEFIELD_EXPLICIT_PROXY_URL"
+    )]
+    pub explicit_proxy_url: Option,
+}
+
+impl BluefieldDriverArgs {
+    /// Convert the raw CLI/env values into a [`BluefieldDriverConfig`].
+    ///
+    /// `openshell_endpoint` is passed through unchanged; every other field is
+    /// cloned from `self`. The textual `role` and `proxy_placement` values are
+    /// trimmed before parsing, and an empty value selects the default
+    /// (`all-in-one` / `none`), so an instance built from `Default` converts
+    /// cleanly.
+    ///
+    /// # Errors
+    ///
+    /// Returns the parser's message when `role` or `proxy_placement` holds an
+    /// unrecognized value.
+    pub fn to_driver_config(
+        &self,
+        openshell_endpoint: Option<String>,
+    ) -> Result<BluefieldDriverConfig, String> {
+        // Parse the trimmed text: the emptiness check already ignores
+        // surrounding whitespace, so the parser must see the same string.
+        let role = match self.role.trim() {
+            "" => BluefieldRole::AllInOne,
+            value => value.parse::<BluefieldRole>()?,
+        };
+        let proxy_placement = parse_proxy_placement(&self.proxy_placement)?;
+        Ok(BluefieldDriverConfig {
+            enabled: self.enabled,
+            role,
+            openshell_endpoint,
+            controller_endpoint: self.controller_endpoint.clone(),
+            tls_dir: self.tls_dir.clone(),
+            tls_domain: self.tls_domain.clone(),
+            host_pf: self.host_pf.clone(),
+            reserved_vf_indexes: self.reserved_vf_indexes.clone(),
+            pf_key: self.pf_key.clone(),
+            snat_ip: self.snat_ip.clone(),
+            uplink_port: self.uplink_port.clone(),
+            kernel_image: self.kernel_image.clone(),
+            kernel_version: self.kernel_version.clone(),
+            kernel_image_sha256: self.kernel_sha256.clone(),
+            kernel_modules: self.kernel_modules.clone(),
+            egress_cidr: self.egress_cidr.clone(),
+            egress_cidr_pool: self.egress_cidr_pool.clone(),
+            egress_gateway: self.egress_gateway.clone(),
+            egress_dns: self.egress_dns.clone(),
+            proxy_placement,
+            explicit_proxy_url: self.explicit_proxy_url.clone(),
+        })
+    }
+}
+
+/// Parse a proxy-placement value (`none` or `dpu`).
+///
+/// Surrounding whitespace is ignored and an empty value means
+/// [`ProxyPlacement::None`], matching how an empty role falls back to
+/// [`BluefieldRole::AllInOne`]. Anything else is rejected with a message that
+/// lists the accepted spellings.
+fn parse_proxy_placement(value: &str) -> Result<ProxyPlacement, String> {
+    match value.trim() {
+        "" | "none" => Ok(ProxyPlacement::None),
+        "dpu" => Ok(ProxyPlacement::Dpu),
+        other => Err(format!(
+            "invalid BlueField proxy placement {other:?}; expected 'none' or 'dpu'"
+        )),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn to_driver_config_parses_proxy_placement() {
+        let args = BluefieldDriverArgs {
+
enabled: true, + proxy_placement: "dpu".to_string(), + explicit_proxy_url: Some("http://10.0.0.2:3128".to_string()), + ..BluefieldDriverArgs::default() + }; + + let config = args + .to_driver_config(Some("https://gateway.example".to_string())) + .unwrap(); + + assert!(config.enabled); + assert_eq!( + config.openshell_endpoint, + Some("https://gateway.example".to_string()) + ); + assert_eq!(config.proxy_placement, ProxyPlacement::Dpu); + assert_eq!( + config.explicit_proxy_url, + Some("http://10.0.0.2:3128".to_string()) + ); + } + + #[test] + fn to_driver_config_rejects_unknown_proxy_placement() { + let args = BluefieldDriverArgs { + proxy_placement: "host".to_string(), + ..BluefieldDriverArgs::default() + }; + + let err = args.to_driver_config(None).unwrap_err(); + + assert!(err.contains("invalid BlueField proxy placement")); + } +} diff --git a/crates/openshell-driver-bluefield/bf-vm/src/config.rs b/crates/openshell-driver-bluefield/bf-vm/src/config.rs new file mode 100644 index 000000000..7d6d5874a --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/src/config.rs @@ -0,0 +1,167 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! BlueField VM-driver configuration and config-derived runtime options. + +use std::path::PathBuf; + +use bf_core::{BluefieldRole, ProxyPlacement}; + +use crate::guest_egress::GuestEgress; +use crate::kernel::BluefieldKernel; + +/// VM-driver BlueField extension configuration. +/// +/// The top-level driver keeps this disabled by default. When enabled, the +/// builder discovers host VFs under `host_pf`, rewrites their cross-host PF +/// coordinate to `pf_key` when supplied, and delegates datapath policy to the +/// remote DPU controller. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BluefieldDriverConfig { + pub enabled: bool, + pub role: BluefieldRole, + pub openshell_endpoint: Option, + pub controller_endpoint: Option, + pub tls_dir: Option, + pub tls_domain: String, + pub host_pf: Option, + pub reserved_vf_indexes: Vec, + pub pf_key: Option, + pub snat_ip: Option, + pub uplink_port: Option, + pub kernel_image: Option, + pub kernel_version: Option, + pub kernel_image_sha256: Option, + pub kernel_modules: Vec, + pub egress_cidr: Option, + pub egress_cidr_pool: Vec, + pub egress_gateway: Option, + /// Retained for deployment compatibility. Lab/upstream DNS resolvers are + /// applied by the DPU provider policy, not written into guest resolv.conf. + pub egress_dns: Vec, + pub proxy_placement: ProxyPlacement, + pub explicit_proxy_url: Option, +} + +impl Default for BluefieldDriverConfig { + fn default() -> Self { + Self { + enabled: false, + role: BluefieldRole::AllInOne, + openshell_endpoint: None, + controller_endpoint: None, + tls_dir: None, + tls_domain: "bluefield-controller".to_string(), + host_pf: None, + reserved_vf_indexes: Vec::new(), + pf_key: None, + snat_ip: None, + uplink_port: None, + kernel_image: None, + kernel_version: None, + kernel_image_sha256: None, + kernel_modules: Vec::new(), + egress_cidr: None, + egress_cidr_pool: Vec::new(), + egress_gateway: None, + egress_dns: Vec::new(), + proxy_placement: ProxyPlacement::None, + explicit_proxy_url: None, + } + } +} + +/// PR1 defers the DPU proxy; reject any config that asks for it so a +/// misconfiguration fails loudly instead of silently ignoring the request. 
+pub(crate) fn reject_deferred_proxy(config: &BluefieldDriverConfig) -> Result<(), String> { + if config.proxy_placement != ProxyPlacement::None { + return Err("BlueField DPU proxy placement is deferred from PR1".to_string()); + } + if config + .explicit_proxy_url + .as_deref() + .filter(|value| !value.trim().is_empty()) + .is_some() + { + return Err("BlueField explicit proxy URL is deferred from PR1".to_string()); + } + Ok(()) +} + +pub(crate) fn bluefield_kernel_from_config( + config: &BluefieldDriverConfig, +) -> Option { + let mut kernel = if let Some(image) = &config.kernel_image { + BluefieldKernel::from_image(image.clone()) + } else if config.kernel_modules.is_empty() + && config.kernel_version.is_none() + && config.kernel_image_sha256.is_none() + { + return None; + } else { + BluefieldKernel::new() + }; + + if !config.kernel_modules.is_empty() { + kernel = kernel.with_modules(config.kernel_modules.clone()); + } + if let Some(version) = &config.kernel_version { + kernel = kernel.with_version(version.clone()); + } + if let Some(sha256) = &config.kernel_image_sha256 { + kernel = kernel.with_image_sha256(sha256.clone()); + } + Some(kernel) +} + +pub(crate) fn guest_egress_from_config( + config: &BluefieldDriverConfig, +) -> Result, String> { + let address_cidr = config + .egress_cidr + .clone() + .or_else(|| config.egress_cidr_pool.first().cloned()); + match (address_cidr, &config.egress_gateway) { + (Some(address_cidr), Some(gateway)) => Ok(Some(GuestEgress { + address_cidr, + gateway: gateway.clone(), + })), + (None, None) => Ok(None), + _ => Err( + "BlueField guest egress requires OPENSHELL_BLUEFIELD_EGRESS_GATEWAY with OPENSHELL_BLUEFIELD_EGRESS_CIDR or OPENSHELL_BLUEFIELD_EGRESS_CIDR_POOL" + .to_string(), + ), + } +} + +#[cfg(test)] +mod tests { + use super::{BluefieldDriverConfig, reject_deferred_proxy}; + use bf_core::ProxyPlacement; + + #[test] + fn rejects_deferred_dpu_proxy_placement() { + let config = BluefieldDriverConfig { + enabled: true, + 
proxy_placement: ProxyPlacement::Dpu, + ..Default::default() + }; + + let err = reject_deferred_proxy(&config).unwrap_err(); + + assert!(err.contains("DPU proxy placement is deferred")); + } + + #[test] + fn rejects_deferred_explicit_proxy_url() { + let config = BluefieldDriverConfig { + enabled: true, + explicit_proxy_url: Some("http://100.64.4.1:3128".to_string()), + ..Default::default() + }; + + let err = reject_deferred_proxy(&config).unwrap_err(); + + assert!(err.contains("explicit proxy URL is deferred")); + } +} diff --git a/crates/openshell-driver-bluefield/bf-vm/src/extension.rs b/crates/openshell-driver-bluefield/bf-vm/src/extension.rs new file mode 100644 index 000000000..759ef8556 --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/src/extension.rs @@ -0,0 +1,534 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! BlueField VM lifecycle extension: VF passthrough. +//! +//! This extension claims a host VF for a sandbox, binds it to `vfio-pci`, +//! persists enough state to recover after a driver restart, and releases the +//! VF on launch failure or delete. It does not program any DPU datapath; that +//! is layered on in later stages. 
+ +use std::collections::HashMap; +use std::path::Path; +use std::sync::{Arc, Mutex}; + +use openshell_core::proto::compute::v1::DriverSandbox as Sandbox; +use openshell_vfio::SysfsRoot; + +use crate::gpu::mac_from_sandbox_id; +use crate::lifecycle::{ + BackendFeature, ExtensionActivation, ExtensionDescriptor, LaunchAbortReason, LaunchPlan, + LifecycleError, LifecycleExtension, LifecycleResult, RestoreContext, +}; + +use bf_inventory::{VfPool, VfSlot}; + +use crate::config::{bluefield_kernel_from_config, guest_egress_from_config, reject_deferred_proxy}; +use crate::guest_egress::{self, GuestEgress}; +use crate::kernel::BluefieldKernel; +use crate::slots::{HostSlotConfig, prepare_host_slots, require_host_pf}; +use crate::state::{self, AttachmentRecord, EXTENSION_NAME}; +use crate::vf::{HostReadiness, SysfsHostReadiness, SysfsVfBinder, VfBinder}; + +pub use crate::cli::BluefieldDriverArgs; +pub use crate::config::BluefieldDriverConfig; + +fn deterministic_vf_mac(sandbox_id: &str) -> String { + let key = format!("bluefield-vf:{sandbox_id}"); + let mac = mac_from_sandbox_id(&key); + format!( + "{:02x}:{:02x}:{:02x}:{:02x}:{:02x}:{:02x}", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5] + ) +} + +/// BlueField lifecycle extension: claims a VF, binds it for passthrough, and +/// wires optional guest egress into the launch plan. +#[derive(Debug)] +pub struct BluefieldExtension { + pool: VfPool, + egress: Option, + kernel: Option, + readiness: Arc, + binder: Arc, + attachments: Mutex>, +} + +impl BluefieldExtension { + #[must_use] + pub fn new(pool: VfPool) -> Self { + Self { + pool, + egress: None, + kernel: None, + readiness: Arc::new(SysfsHostReadiness::default()), + binder: Arc::new(SysfsVfBinder::default()), + attachments: Mutex::new(HashMap::new()), + } + } + + /// Build a host-side extension from VM-driver config. 
Discovers the local + /// VFs under the configured host PF, applies the operator's reservations, + /// and returns an extension that binds one VF per sandbox. + /// + /// Returns `Ok(None)` when `config.enabled` is false so callers keep the + /// upstream default driver behavior unchanged. + pub fn from_driver_config(config: &BluefieldDriverConfig) -> Result, String> { + if !config.enabled { + return Ok(None); + } + reject_deferred_proxy(config)?; + + let host_pf = require_host_pf(config)?; + let sysfs = SysfsRoot::system(); + let slots = prepare_host_slots(HostSlotConfig::from(config), &sysfs, host_pf)?; + + let extension = Self::new(VfPool::new(slots)) + .with_host_readiness(Arc::new(SysfsHostReadiness::new(sysfs.clone()))) + .with_vf_binder(Arc::new(SysfsVfBinder::new(sysfs))); + + Ok(Some(extension.apply_runtime_options(config)?)) + } + + /// In this stage a compute node binds a VF the same way the all-in-one + /// role does; the leader-driven assignment path is layered on later. + pub fn from_compute_node_config( + config: &BluefieldDriverConfig, + ) -> Result, String> { + Self::from_driver_config(config) + } + + fn apply_runtime_options(mut self, config: &BluefieldDriverConfig) -> Result { + if let Some(kernel) = bluefield_kernel_from_config(config) { + self = self.with_kernel(kernel); + } + if let Some(egress) = guest_egress_from_config(config)? { + self = self.with_guest_egress(egress); + } + Ok(self) + } + + #[must_use] + pub fn with_guest_egress(mut self, egress: GuestEgress) -> Self { + self.egress = Some(egress); + self + } + + /// Select the BlueField guest kernel (image or profile) and the VF driver + /// modules to load in guest-init. + #[must_use] + pub fn with_kernel(mut self, kernel: BluefieldKernel) -> Self { + self.kernel = Some(kernel); + self + } + + /// Override the host VF-passthrough readiness probe (defaults to a + /// real-`/sys` [`SysfsHostReadiness`]). 
+ #[must_use] + pub fn with_host_readiness(mut self, readiness: Arc) -> Self { + self.readiness = readiness; + self + } + + fn with_vf_binder(mut self, binder: Arc) -> Self { + self.binder = binder; + self + } + + fn record_attachment(&self, sandbox_id: &str, record: AttachmentRecord) { + self.attachments + .lock() + .expect("bluefield attachments lock poisoned") + .insert(sandbox_id.to_string(), record); + } + + fn take_attachment(&self, sandbox_id: &str) -> Option { + self.attachments + .lock() + .expect("bluefield attachments lock poisoned") + .remove(sandbox_id) + } + + fn release_binding(&self, sandbox_state_dir: &Path, slot: &VfSlot) -> LifecycleResult<()> { + self.binder.release_slot(slot).map_err(|err| { + LifecycleError::new(format!("bluefield: release VF {}: {err}", slot.host_bdf)) + })?; + state::remove_bind_state(sandbox_state_dir) + } + + fn claim_slot(&self, sandbox_id: &str) -> LifecycleResult { + let mut slot = self.pool.claim(sandbox_id).ok_or_else(|| { + LifecycleError::resource_exhausted(format!( + "bluefield: no free VF for sandbox {sandbox_id}" + )) + })?; + if slot.guest_mac.is_none() { + slot.guest_mac = Some(deterministic_vf_mac(sandbox_id)); + } + Ok(slot) + } +} + +#[tonic::async_trait] +impl LifecycleExtension for BluefieldExtension { + fn name(&self) -> &str { + EXTENSION_NAME + } + + fn activation(&self) -> ExtensionActivation { + ExtensionActivation::Global + } + + fn descriptor(&self) -> ExtensionDescriptor { + let mut descriptor = ExtensionDescriptor::new(EXTENSION_NAME); + descriptor.required_backend_features = + vec![BackendFeature::PciPassthrough, BackendFeature::GuestInitDropins]; + descriptor + } + + async fn configure_launch( + &self, + _sandbox: &Sandbox, + _state_dir: &Path, + plan: &mut LaunchPlan, + ) -> LifecycleResult<()> { + // VF passthrough requires QEMU; guest egress needs an init drop-in. 
+ plan.require_backend_feature(BackendFeature::PciPassthrough); + plan.require_backend_feature(BackendFeature::GuestInitDropins); + plan.guest_init_dropins.push(guest_egress::dropin()); + + // Select the BlueField guest kernel + load its VF driver modules so the + // assigned VF is not an inert PCI function in the guest. + if let Some(kernel) = &self.kernel { + kernel.apply(plan)?; + } + Ok(()) + } + + async fn before_launch( + &self, + sandbox: &Sandbox, + state_dir: &Path, + plan: &mut LaunchPlan, + ) -> LifecycleResult<()> { + let slot = self.claim_slot(&sandbox.id)?; + + // Fail closed if this host can't actually pass the VF through (IOMMU + // off, device missing, group conflict). ResourceExhausted lets the + // scheduler retry on a capable host rather than booting a broken VM. + if let Err(reason) = self.readiness.check_passthrough(&slot.host_bdf) { + self.pool.release(&sandbox.id); + return Err(LifecycleError::resource_exhausted(format!( + "bluefield: host cannot pass through {}: {reason}", + slot.host_bdf + ))); + } + + // Fail closed on kernel image drift (missing / hash mismatch). + if let Some(kernel) = &self.kernel + && let Err(err) = kernel.validate() + { + self.pool.release(&sandbox.id); + return Err(LifecycleError::resource_exhausted(err.to_string())); + } + + let guard = match self.binder.bind_slot(&slot) { + Ok(guard) => guard, + Err(err) => { + self.pool.release(&sandbox.id); + return Err(LifecycleError::resource_exhausted(format!( + "bluefield: bind VF {} to vfio-pci: {err}", + slot.host_bdf + ))); + } + }; + + if let Err(err) = state::persist_bind_state(&sandbox.id, state_dir, &slot) { + drop(guard); + self.pool.release(&sandbox.id); + return Err(err); + } + // QEMU owns the device now; do not restore it on guard drop. 
+ guard.disarm(); + + self.record_attachment(&sandbox.id, AttachmentRecord { slot: slot.clone() }); + + if let Some(egress) = &self.egress { + plan.env.extend(egress.env(&slot)); + } + Ok(()) + } + + async fn after_launch_failed( + &self, + sandbox: &Sandbox, + state_dir: &Path, + _reason: LaunchAbortReason, + ) -> LifecycleResult<()> { + if let Some(record) = self.take_attachment(&sandbox.id) + && let Err(err) = self.release_binding(state_dir, &record.slot) + { + tracing::warn!( + sandbox_id = %sandbox.id, + error = %err, + "bluefield: failed to release VF binding after launch failure" + ); + } + self.pool.release(&sandbox.id); + Ok(()) + } + + async fn after_delete(&self, sandbox: &Sandbox, state_dir: &Path) -> LifecycleResult<()> { + if let Some(record) = self.take_attachment(&sandbox.id) { + self.release_binding(state_dir, &record.slot)?; + } + self.pool.release(&sandbox.id); + Ok(()) + } + + async fn before_restore(&self, ctx: &RestoreContext) -> LifecycleResult<()> { + // A restore onto a misprovisioned host must fail closed exactly like a + // fresh launch. 
+ if let Some(kernel) = &self.kernel + && let Err(err) = kernel.validate() + { + return Err(LifecycleError::resource_exhausted(err.to_string())); + } + let bind_state = state::load_bind_state(&ctx.sandbox.id, &ctx.state_dir)?; + let mut slot = self + .pool + .claim_by_host_bdf(&ctx.sandbox.id, &bind_state.host_bdf) + .ok_or_else(|| { + LifecycleError::resource_exhausted(format!( + "bluefield: persisted VF {} is not available for sandbox {}", + bind_state.host_bdf, ctx.sandbox.id + )) + })?; + if slot.guest_mac.is_none() { + slot.guest_mac = bind_state + .guest_mac + .clone() + .or_else(|| Some(deterministic_vf_mac(&ctx.sandbox.id))); + } + let guard = self.binder.adopt_slot(&slot).map_err(|err| { + LifecycleError::resource_exhausted(format!( + "bluefield: adopt VF {} from persisted state: {err}", + slot.host_bdf + )) + })?; + guard.disarm(); + self.record_attachment(&ctx.sandbox.id, AttachmentRecord { slot }); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::PathBuf; + + #[derive(Debug)] + struct AlwaysReady; + impl HostReadiness for AlwaysReady { + fn check_passthrough(&self, _host_bdf: &str) -> Result<(), String> { + Ok(()) + } + } + + #[derive(Debug)] + struct NeverReady; + impl HostReadiness for NeverReady { + fn check_passthrough(&self, _host_bdf: &str) -> Result<(), String> { + Err("IOMMU disabled".to_string()) + } + } + + #[derive(Debug)] + struct TestVfBinding; + impl crate::vf::VfBinding for TestVfBinding { + fn disarm(self: Box) {} + } + + #[derive(Debug)] + struct TestVfBinder; + impl VfBinder for TestVfBinder { + fn bind_slot(&self, _slot: &VfSlot) -> Result, String> { + Ok(Box::new(TestVfBinding)) + } + fn adopt_slot(&self, _slot: &VfSlot) -> Result, String> { + Ok(Box::new(TestVfBinding)) + } + fn release_slot(&self, _slot: &VfSlot) -> Result<(), String> { + Ok(()) + } + } + + fn sandbox(id: &str) -> Sandbox { + Sandbox { + id: id.to_string(), + name: id.to_string(), + ..Default::default() + } + } + + fn state_dir(name: 
&str) -> PathBuf { + std::env::temp_dir().join(format!( + "openshell-bluefield-{name}-{}-{}", + std::process::id(), + state::now_millis() + )) + } + + fn sample_plan() -> LaunchPlan { + LaunchPlan { + backend: crate::runtime::VmBackend::Qemu, + vcpus: 2, + mem_mib: 2048, + required_backends: Vec::new(), + required_backend_features: Vec::new(), + kernel_profile: None, + kernel_image: None, + gpu_bdf: None, + tap_device: None, + guest_ip: None, + host_ip: None, + vsock_cid: None, + guest_mac: None, + gateway_port: None, + guest_init_dropins: Vec::new(), + env: Vec::new(), + } + } + + fn ext(pool: VfPool) -> BluefieldExtension { + BluefieldExtension::new(pool) + .with_host_readiness(Arc::new(AlwaysReady)) + .with_vf_binder(Arc::new(TestVfBinder)) + } + + #[tokio::test] + async fn before_launch_claims_slot_records_bind_state_and_injects_egress_env() { + let extension = ext(VfPool::new([ + VfSlot::new("vf0", "0000:03:00.2").with_representor("pf0vf0"), + ])) + .with_guest_egress(GuestEgress { + address_cidr: "10.0.120.10/22".to_string(), + gateway: "10.0.120.254".to_string(), + }); + + let mut plan = sample_plan(); + let state = state_dir("launch-env"); + extension + .before_launch(&sandbox("sandbox-1"), &state, &mut plan) + .await + .unwrap(); + + assert!( + plan.env + .iter() + .any(|e| e == "OPENSHELL_VM_DATA_IP=10.0.120.10/22") + ); + + let bind_state = state::load_bind_state("sandbox-1", &state).unwrap(); + assert_eq!(bind_state.host_bdf, "0000:03:00.2"); + + let record = extension + .take_attachment("sandbox-1") + .expect("attachment recorded"); + assert_eq!(record.slot.host_bdf, "0000:03:00.2"); + let _ = std::fs::remove_dir_all(&state); + } + + #[tokio::test] + async fn before_launch_fails_closed_when_pool_exhausted() { + let extension = ext(VfPool::new([])); + let mut plan = sample_plan(); + let err = extension + .before_launch(&sandbox("sandbox-1"), &PathBuf::from("/tmp/s"), &mut plan) + .await + .unwrap_err(); + assert!(err.is_resource_exhausted()); + } + + 
#[tokio::test] + async fn before_launch_fails_closed_when_host_not_vfio_ready() { + let extension = BluefieldExtension::new(VfPool::new([VfSlot::new("vf0", "0000:03:00.2")])) + .with_host_readiness(Arc::new(NeverReady)) + .with_vf_binder(Arc::new(TestVfBinder)); + + let mut plan = sample_plan(); + let err = extension + .before_launch(&sandbox("sandbox-1"), &PathBuf::from("/tmp/s"), &mut plan) + .await + .unwrap_err(); + assert!(err.is_resource_exhausted()); + + // Slot was released so a later capable host can claim it. + assert!(extension.pool.claim("sandbox-2").is_some()); + } + + #[tokio::test] + async fn after_delete_releases_slot_and_state() { + let extension = ext(VfPool::new([VfSlot::new("vf0", "0000:03:00.2")])); + let state = state_dir("delete"); + let mut plan = sample_plan(); + extension + .before_launch(&sandbox("sb-del"), &state, &mut plan) + .await + .unwrap(); + extension.after_delete(&sandbox("sb-del"), &state).await.unwrap(); + + assert!(extension.take_attachment("sb-del").is_none()); + assert!(state::load_bind_state("sb-del", &state).is_err()); + let _ = std::fs::remove_dir_all(&state); + } + + #[tokio::test] + async fn configure_launch_selects_bluefield_kernel() { + let extension = BluefieldExtension::new(VfPool::new([])) + .with_kernel(BluefieldKernel::from_image("/opt/openshell/kernels/bf-vmlinux")); + + let mut plan = sample_plan(); + extension + .configure_launch(&sandbox("sandbox-1"), &PathBuf::from("/tmp/s"), &mut plan) + .await + .unwrap(); + + assert_eq!( + plan.kernel_image.as_deref(), + Some(Path::new("/opt/openshell/kernels/bf-vmlinux")) + ); + assert!( + plan.required_backend_features + .contains(&BackendFeature::ExternalKernelImage) + ); + } + + #[tokio::test] + async fn before_restore_reclaims_and_records() { + let state = state_dir("restore"); + let initial = ext(VfPool::new([ + VfSlot::new("vf0", "0000:03:00.2").with_representor("pf0vf0"), + ])); + let mut plan = sample_plan(); + initial + .before_launch(&sandbox("sb-restore"), 
&state, &mut plan) + .await + .unwrap(); + + let extension = ext(VfPool::new([ + VfSlot::new("vf0", "0000:03:00.2").with_representor("pf0vf0"), + ])); + let ctx = RestoreContext { + sandbox: sandbox("sb-restore"), + state_dir: state.clone(), + }; + extension.before_restore(&ctx).await.unwrap(); + + let record = extension + .take_attachment("sb-restore") + .expect("attachment recorded"); + assert_eq!(record.slot.host_bdf, "0000:03:00.2"); + let _ = std::fs::remove_dir_all(&state); + } +} diff --git a/crates/openshell-driver-bluefield/bf-vm/src/extensions/mod.rs b/crates/openshell-driver-bluefield/bf-vm/src/extensions/mod.rs new file mode 100644 index 000000000..4fdf2fbaf --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/src/extensions/mod.rs @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Concrete VM lifecycle extensions used by the BlueField VM driver. + +use bf_core::BluefieldRole; + +use crate::lifecycle::LifecycleExtensionRegistry; + +pub use crate::cli::BluefieldDriverArgs; +pub use crate::extension::{BluefieldDriverConfig, BluefieldExtension}; + +#[derive(Debug, Clone, Default)] +pub struct ExtensionRuntimeConfig { + pub bluefield: BluefieldDriverConfig, +} + +/// Build the workload-side lifecycle extensions. Only the workload-running +/// roles (`all-in-one`, `compute-node`) install a BlueField extension; the +/// `control-plane` role runs no workload and serves the leader directly, so it +/// installs nothing here. +pub fn build_lifecycle_extensions( + config: &ExtensionRuntimeConfig, +) -> Result { + let mut registry = LifecycleExtensionRegistry::new(); + let extension = match config.bluefield.role { + BluefieldRole::ComputeNode => { + BluefieldExtension::from_compute_node_config(&config.bluefield)? 
+ } + BluefieldRole::AllInOne => BluefieldExtension::from_driver_config(&config.bluefield)?, + BluefieldRole::ControlPlane => None, + }; + if let Some(extension) = extension { + registry.push(std::sync::Arc::new(extension)); + } + Ok(registry) +} diff --git a/crates/openshell-driver-bluefield/bf-vm/src/guest_egress.rs b/crates/openshell-driver-bluefield/bf-vm/src/guest_egress.rs new file mode 100644 index 000000000..2cfbc426b --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/src/guest_egress.rs @@ -0,0 +1,149 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Guest VF egress wiring: the `OPENSHELL_VM_DATA_*` env contract and the +//! guest-init drop-in that consumes it. + +use crate::lifecycle::GuestInitDropin; + +use bf_inventory::VfSlot; + +const ENV_EGRESS: &str = "OPENSHELL_VM_DATA_EGRESS"; +const ENV_IP_MODE: &str = "OPENSHELL_VM_DATA_IP_MODE"; +const ENV_IP: &str = "OPENSHELL_VM_DATA_IP"; +const ENV_GATEWAY: &str = "OPENSHELL_VM_DATA_GW"; +const ENV_MAC: &str = "OPENSHELL_VM_DATA_MAC"; +const EGRESS_EXTERNAL_VF: &str = "external-vf"; +const IP_MODE_STATIC: &str = "static"; +const DROPIN_SCRIPT: &[u8] = include_bytes!("../scripts/guest-egress-dropin.sh"); + +/// Static egress parameters for a sandbox's data-path VF. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct GuestEgress { + pub address_cidr: String, + pub gateway: String, +} + +impl GuestEgress { + /// Build the `OPENSHELL_VM_DATA_*` env vars the guest-init drop-in reads. + /// A per-slot `guest_datapath_address` overrides `address_cidr`. + #[must_use] + pub fn env(&self, slot: &VfSlot) -> Vec { + GuestEgressEnv::for_slot(self, slot).to_env() + } +} + +/// Concrete guest-init environment for one sandbox VF. 
+#[derive(Debug, Clone, PartialEq, Eq)] +struct GuestEgressEnv { + address_cidr: String, + gateway: String, + guest_mac: Option, +} + +impl GuestEgressEnv { + #[must_use] + fn for_slot(egress: &GuestEgress, slot: &VfSlot) -> Self { + let address = slot + .guest_datapath_address + .as_deref() + .unwrap_or(&egress.address_cidr); + Self { + address_cidr: address.to_string(), + gateway: egress.gateway.clone(), + guest_mac: slot.guest_mac.clone(), + } + } + + #[must_use] + fn to_env(&self) -> Vec { + let mut env = vec![ + format!("{ENV_EGRESS}={EGRESS_EXTERNAL_VF}"), + format!("{ENV_IP_MODE}={IP_MODE_STATIC}"), + format!("{}={}", ENV_IP, self.address_cidr), + format!("{}={}", ENV_GATEWAY, self.gateway), + ]; + if let Some(mac) = self.guest_mac.as_deref() { + env.push(format!("{ENV_MAC}={mac}")); + } + env + } +} + +/// Name of the guest-init drop-in this extension injects. Sorted late so it +/// runs after base network setup. +pub const DROPIN_NAME: &str = "50-bluefield-vf-egress.sh"; + +/// Build the guest-init drop-in that configures the VF NIC from the +/// `OPENSHELL_VM_DATA_*` env. The TAP interface remains directly connected to +/// the gateway host address, while default egress moves to the BlueField VF. 
+#[must_use] +pub fn dropin() -> GuestInitDropin { + GuestInitDropin::new(DROPIN_NAME, DROPIN_SCRIPT.to_vec()) +} + +#[cfg(test)] +mod tests { + use super::GuestEgress; + use bf_inventory::VfSlot; + + #[test] + fn env_contract_uses_default_address_without_dns_or_mac() { + let egress = GuestEgress { + address_cidr: "10.0.120.10/22".to_string(), + gateway: "10.0.120.254".to_string(), + }; + let slot = VfSlot::new("vf0", "0000:03:00.2"); + let env = egress.env(&slot); + assert_eq!( + env, + vec![ + "OPENSHELL_VM_DATA_EGRESS=external-vf", + "OPENSHELL_VM_DATA_IP_MODE=static", + "OPENSHELL_VM_DATA_IP=10.0.120.10/22", + "OPENSHELL_VM_DATA_GW=10.0.120.254", + ] + ); + } + + #[test] + fn env_contract_uses_slot_address_override_and_optional_mac() { + let egress = GuestEgress { + address_cidr: "10.0.120.10/22".to_string(), + gateway: "10.0.120.254".to_string(), + }; + let slot = VfSlot::new("vf0", "0000:03:00.2").with_guest_datapath_address("10.0.120.61/22"); + let slot = slot.with_guest_mac("02:bf:64:04:00:10"); + let env = egress.env(&slot); + assert_eq!( + env, + vec![ + "OPENSHELL_VM_DATA_EGRESS=external-vf", + "OPENSHELL_VM_DATA_IP_MODE=static", + "OPENSHELL_VM_DATA_IP=10.0.120.61/22", + "OPENSHELL_VM_DATA_GW=10.0.120.254", + "OPENSHELL_VM_DATA_MAC=02:bf:64:04:00:10", + ] + ); + } + + #[test] + fn dropin_script_is_reviewable_and_configures_static_vf_egress() { + let dropin = super::dropin(); + let script = String::from_utf8(dropin.contents).expect("drop-in is utf8"); + assert!(script.contains("OPENSHELL_VM_DATA_IP")); + assert!(script.contains("OPENSHELL_VM_DATA_GW")); + assert!(script.contains("0x15b3")); + assert!(script.contains("OPENSHELL_VM_DATA_MAC")); + assert!(script.contains("find_bluefield_vf()")); + assert!(script.contains("configure_static_ip()")); + assert!(script.contains("configure_resolv_conf()")); + assert!(script.contains("main \"$@\"")); + assert!(script.contains("ip link set dev \"${vf_nic}\" address")); + assert!(script.contains("ip addr add")); + 
assert!(script.contains("ip route replace default")); + assert!(!script.contains("OPENSHELL_VM_DATA_DNS")); + assert!(script.contains("resolv.conf")); + assert!(script.contains("DPU-side policy")); + } +} diff --git a/crates/openshell-driver-bluefield/bf-vm/src/kernel.rs b/crates/openshell-driver-bluefield/bf-vm/src/kernel.rs new file mode 100644 index 000000000..d42d56d13 --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/src/kernel.rs @@ -0,0 +1,327 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Guest kernel selection for the BlueField extension. +//! +//! A VF-passthrough guest needs the in-guest NIC driver (`mlx5_core`, and +//! `mlx5_ib` for RDMA) plus the PCI/auxiliary-bus plumbing; without it the +//! assigned VF is an inert PCI function. Rather than baking NVIDIA/Mellanox +//! drivers into the *upstream* default guest kernel (which would couple +//! upstream to NVIDIA hardware), the extension selects its own BlueField +//! kernel via the existing [`LaunchPlan`] seam — keeping the generic kernel +//! NVIDIA-free. +//! +//! Two ways to express the requirement: +//! +//! - [`BluefieldKernel::image`] — a concrete kernel path. Consumed end-to-end +//! today: it becomes `plan.kernel_image` and requires +//! [`BackendFeature::ExternalKernelImage`] (QEMU-only). +//! - [`BluefieldKernel::profile`] — a named profile (e.g. `"bluefield"`). The +//! intended abstraction, but a no-op at boot until a profile→image registry +//! lands in the runtime; recorded on `plan.kernel_profile` for now. +//! +//! Kernel selection here is purely about *function* (making the VF work) and +//! must come from driver/extension config — never a tenant-settable field. +//! Tier-2 (DPU) enforcement holds regardless of the guest kernel. 
+ +use std::path::{Path, PathBuf}; + +use crate::lifecycle::{ + BackendFeature, GuestInitDropin, LaunchPlan, LifecycleError, LifecycleResult, +}; + +/// Guest-init drop-in that loads the VF driver modules. Sorted before the +/// `50-` egress drop-in so the NIC exists when egress is configured. +pub const MODULES_DROPIN_NAME: &str = "40-bluefield-kernel-modules.sh"; + +/// Default modules for a Mellanox/NVIDIA VF. `mlx5_core` brings up the +/// ethernet function; `mlx5_ib` adds the RDMA verbs path (GPUDirect/RoCE). +/// Loading a built-in (`=y`) module via `modprobe` is a harmless no-op, so +/// this list is safe whether the BlueField kernel compiles them in or as +/// modules. +pub const MELLANOX_VF_MODULES: &[&str] = &["mlx5_core", "mlx5_ib"]; + +/// BlueField guest-kernel requirement for a sandbox. +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct BluefieldKernel { + /// Concrete kernel image path (QEMU `-kernel`). Driver/extension-owned. + pub image: Option, + /// Named kernel profile, for the future profile→image registry. + pub profile: Option, + /// Expected kernel release (`uname -r`). Pins the image to the rootfs + /// module bundle: guest-init asserts the running kernel matches before + /// loading modules, so kernel↔rootfs drift fails loudly at boot instead + /// of `modprobe` finding the wrong `/lib/modules/`. + pub version: Option, + /// Expected lowercase hex SHA-256 of the kernel image. When set, + /// [`Self::validate`] refuses to launch unless the on-host image matches, + /// so every host in the fleet runs the identical vetted kernel. + pub image_sha256: Option, + /// Modules guest-init should `modprobe` before VF bring-up. + pub required_modules: Vec, +} + +impl BluefieldKernel { + #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// A BlueField kernel supplied as a concrete image, with the default + /// Mellanox VF modules. 
+ #[must_use] + pub fn from_image(image: impl Into) -> Self { + Self { + image: Some(image.into()), + profile: None, + version: None, + image_sha256: None, + required_modules: MELLANOX_VF_MODULES + .iter() + .map(|m| (*m).to_string()) + .collect(), + } + } + + /// A BlueField kernel referenced by profile name, with the default + /// Mellanox VF modules. + #[must_use] + pub fn from_profile(profile: impl Into) -> Self { + Self { + image: None, + profile: Some(profile.into()), + version: None, + image_sha256: None, + required_modules: MELLANOX_VF_MODULES + .iter() + .map(|m| (*m).to_string()) + .collect(), + } + } + + #[must_use] + pub fn with_modules(mut self, modules: impl IntoIterator) -> Self { + self.required_modules = modules.into_iter().collect(); + self + } + + /// Pin the expected guest kernel release (`uname -r`). Asserted in-guest + /// before module load to catch kernel↔rootfs drift. + #[must_use] + pub fn with_version(mut self, version: impl Into) -> Self { + self.version = Some(version.into()); + self + } + + /// Pin the expected SHA-256 (lowercase hex) of the kernel image, enforced + /// by [`Self::validate`] so every host runs the identical vetted image. + #[must_use] + pub fn with_image_sha256(mut self, sha256: impl Into) -> Self { + self.image_sha256 = Some(sha256.into()); + self + } + + /// Optional preflight check. The driver also validates `kernel_image` + /// existence at provisioning, but calling this early gives a clearer + /// error before the VM is built. FS-touching, so it is not run from + /// [`Self::apply`]. 
+ pub fn validate(&self) -> LifecycleResult<()> { + if let Some(image) = &self.image { + if !image.is_file() { + return Err(LifecycleError::new(format!( + "bluefield kernel image does not exist: {}", + image.display() + ))); + } + if let Some(expected) = &self.image_sha256 { + let actual = file_sha256(image).map_err(|err| { + LifecycleError::new(format!( + "hashing bluefield kernel image {}: {err}", + image.display() + )) + })?; + if !actual.eq_ignore_ascii_case(expected) { + return Err(LifecycleError::new(format!( + "bluefield kernel image hash mismatch for {}: expected {expected}, got {actual}", + image.display() + ))); + } + } + } + Ok(()) + } + + /// Apply this requirement to the launch plan: select the kernel and + /// register the module-loading drop-in. Pure (no filesystem access) so + /// it is trivially testable; existence is enforced by the driver and by + /// [`Self::validate`]. + pub fn apply(&self, plan: &mut LaunchPlan) -> LifecycleResult<()> { + if let Some(image) = &self.image { + plan.kernel_image = Some(image.clone()); + plan.require_backend_feature(BackendFeature::ExternalKernelImage); + } + if let Some(profile) = &self.profile { + plan.kernel_profile = Some(profile.clone()); + } + if !self.required_modules.is_empty() { + plan.require_backend_feature(BackendFeature::GuestInitDropins); + plan.guest_init_dropins.push(self.modules_dropin()); + } + Ok(()) + } + + fn modules_dropin(&self) -> GuestInitDropin { + let mut script = String::from( + "#!/bin/bash\n# OpenShell BlueField VF kernel modules (scaffold).\nset -eu\n", + ); + if let Some(version) = &self.version { + // Fail loudly on kernel↔rootfs drift before touching modules. + script.push_str(&format!( + "want={version}\nhave=\"$(uname -r)\"\nif [ \"$have\" != \"$want\" ]; then echo \"openshell: guest kernel $have != expected $want (kernel/rootfs drift)\" >&2; exit 1; fi\n" + )); + } + for module in &self.required_modules { + // modprobe of a built-in module is a no-op returning success. 
+ script.push_str(&format!( + "modprobe {module} || echo \"openshell: failed to modprobe {module}\" >&2\n" + )); + } + GuestInitDropin::new(MODULES_DROPIN_NAME, script.into_bytes()) + } +} + +/// Lowercase-hex SHA-256 of a file's contents. +/// +/// TODO(scaffold): reads the whole image into memory; stream once kernels +/// are large enough to matter. +fn file_sha256(path: &Path) -> std::io::Result { + use core::fmt::Write as _; + use sha2::{Digest, Sha256}; + + let bytes = std::fs::read(path)?; + let digest = Sha256::digest(&bytes); + let mut hex = String::with_capacity(digest.len() * 2); + for byte in digest { + write!(hex, "{byte:02x}").expect("writing to String never fails"); + } + Ok(hex) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn empty_plan() -> LaunchPlan { + LaunchPlan { + backend: crate::runtime::VmBackend::Qemu, + vcpus: 2, + mem_mib: 2048, + required_backends: Vec::new(), + required_backend_features: Vec::new(), + kernel_profile: None, + kernel_image: None, + gpu_bdf: None, + tap_device: None, + guest_ip: None, + host_ip: None, + vsock_cid: None, + guest_mac: None, + gateway_port: None, + guest_init_dropins: Vec::new(), + env: Vec::new(), + } + } + + #[test] + fn image_sets_plan_and_requires_external_kernel() { + let mut plan = empty_plan(); + BluefieldKernel::from_image("/opt/openshell/kernels/bluefield-vmlinux") + .apply(&mut plan) + .unwrap(); + assert_eq!( + plan.kernel_image.as_deref(), + Some(Path::new("/opt/openshell/kernels/bluefield-vmlinux")) + ); + assert!( + plan.required_backend_features + .contains(&BackendFeature::ExternalKernelImage) + ); + } + + #[test] + fn profile_is_recorded_without_external_kernel_feature() { + let mut plan = empty_plan(); + BluefieldKernel::from_profile("bluefield") + .apply(&mut plan) + .unwrap(); + assert_eq!(plan.kernel_profile.as_deref(), Some("bluefield")); + assert!( + !plan + .required_backend_features + .contains(&BackendFeature::ExternalKernelImage) + ); + } + + #[test] + fn 
modules_emit_modprobe_dropin() { + let mut plan = empty_plan(); + BluefieldKernel::from_profile("bluefield") + .apply(&mut plan) + .unwrap(); + let dropin = plan + .guest_init_dropins + .iter() + .find(|d| d.name == MODULES_DROPIN_NAME) + .expect("modules drop-in present"); + let script = String::from_utf8(dropin.contents.clone()).unwrap(); + assert!(script.contains("modprobe mlx5_core")); + assert!(script.contains("modprobe mlx5_ib")); + } + + #[test] + fn validate_rejects_missing_image() { + let err = BluefieldKernel::from_image("/no/such/kernel") + .validate() + .unwrap_err(); + assert!(err.to_string().contains("does not exist")); + } + + #[test] + fn validate_enforces_image_hash() { + let path = std::env::temp_dir().join(format!("bf-kernel-{}", std::process::id())); + std::fs::write(&path, b"fake-kernel-bytes").unwrap(); + let good = file_sha256(&path).unwrap(); + + assert!( + BluefieldKernel::from_image(path.clone()) + .with_image_sha256(good) + .validate() + .is_ok() + ); + + let err = BluefieldKernel::from_image(path.clone()) + .with_image_sha256("deadbeef") + .validate() + .unwrap_err(); + assert!(err.to_string().contains("hash mismatch")); + + let _ = std::fs::remove_file(&path); + } + + #[test] + fn version_pin_emits_guest_uname_assertion() { + let mut plan = empty_plan(); + BluefieldKernel::from_profile("bluefield") + .with_version("6.8.0-openshell-bf") + .apply(&mut plan) + .unwrap(); + let dropin = plan + .guest_init_dropins + .iter() + .find(|d| d.name == MODULES_DROPIN_NAME) + .expect("modules drop-in present"); + let script = String::from_utf8(dropin.contents.clone()).unwrap(); + assert!(script.contains("want=6.8.0-openshell-bf")); + assert!(script.contains("uname -r")); + } +} diff --git a/crates/openshell-driver-bluefield/bf-vm/src/lib.rs b/crates/openshell-driver-bluefield/bf-vm/src/lib.rs new file mode 100644 index 000000000..c1f8a6b34 --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/src/lib.rs @@ -0,0 +1,33 @@ +// 
SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! BlueField VM lifecycle integration for the OpenShell VM driver. + +pub mod cli; +mod config; +pub mod extension; +pub mod extensions; +pub mod guest_egress; +pub mod kernel; +mod slots; +mod state; +pub mod vf; + +pub use bf_core::ProxyPlacement; +pub use cli::BluefieldDriverArgs; +pub use config::BluefieldDriverConfig; +pub use extension::BluefieldExtension; +pub use openshell_driver_vm::{ + BackendFeature, ExtensionCapabilities, ExtensionDescriptor, GuestInitDropin, LaunchAbortReason, + LaunchPlan, LifecycleError, LifecycleExtension, LifecycleExtensionRegistry, LifecycleResult, + RestoreContext, VM_RUNTIME_DIR_ENV, VmBackend, VmDriver, VmDriverConfig, VmLaunchConfig, + cleanup_stale_tap_interfaces, configured_runtime_dir, driver, gpu, lifecycle, procguard, + run_vm, +}; + +pub mod runtime { + pub use openshell_driver_vm::{ + VM_RUNTIME_DIR_ENV, VmBackend, VmLaunchConfig, cleanup_stale_tap_interfaces, + configured_runtime_dir, run_vm, + }; +} diff --git a/crates/openshell-driver-bluefield/bf-vm/src/slots.rs b/crates/openshell-driver-bluefield/bf-vm/src/slots.rs new file mode 100644 index 000000000..032666f2b --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/src/slots.rs @@ -0,0 +1,134 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Host VF slot discovery and BlueField-specific slot preparation. 
+ +use std::collections::HashSet; + +use bf_inventory::{SysfsVfInventory, VfInventory, VfSlot}; +use openshell_vfio::SysfsRoot; + +use crate::config::BluefieldDriverConfig; + +pub(crate) struct HostSlotConfig<'a> { + reserved_vf_indexes: &'a [u32], + pf_key: Option<&'a str>, + egress_cidr_pool: &'a [String], +} + +impl<'a> From<&'a BluefieldDriverConfig> for HostSlotConfig<'a> { + fn from(config: &'a BluefieldDriverConfig) -> Self { + Self { + reserved_vf_indexes: &config.reserved_vf_indexes, + pf_key: config.pf_key.as_deref().filter(|value| !value.is_empty()), + egress_cidr_pool: &config.egress_cidr_pool, + } + } +} + +pub(crate) fn require_host_pf(config: &BluefieldDriverConfig) -> Result<&str, String> { + config + .host_pf + .as_deref() + .filter(|value| !value.trim().is_empty()) + .ok_or_else(|| "BlueField is enabled but no host PF was configured".to_string()) +} + +/// Discover the local VF slots for `host_pf` and apply the operator's +/// reservations, PF-key rewrite, and egress-pool addressing. Shared by every +/// host-side role so the local pool is built identically. 
+pub(crate) fn prepare_host_slots( + config: HostSlotConfig<'_>, + sysfs: &SysfsRoot, + host_pf: &str, +) -> Result, String> { + let inventory = SysfsVfInventory::new(sysfs.clone(), [host_pf.to_string()]); + let mut slots = inventory + .discover() + .map_err(|err| format!("discover BlueField VFs for host PF {host_pf}: {err}"))?; + apply_slot_config(&config, &mut slots)?; + if slots.is_empty() { + return Err(format!("BlueField host PF {host_pf} has no discovered VFs")); + } + Ok(slots) +} + +fn apply_slot_config(config: &HostSlotConfig<'_>, slots: &mut Vec) -> Result<(), String> { + if !config.reserved_vf_indexes.is_empty() { + let reserved: HashSet = config.reserved_vf_indexes.iter().copied().collect(); + slots.retain(|slot| match slot.vf_index { + Some(index) => !reserved.contains(&index), + None => true, + }); + } + if let Some(pf_key) = config.pf_key { + for slot in slots.iter_mut() { + slot.pf = Some(pf_key.to_string()); + } + } + if !config.egress_cidr_pool.is_empty() { + if config.egress_cidr_pool.len() < slots.len() { + return Err(format!( + "BlueField egress pool has {} addresses for {} usable VFs", + config.egress_cidr_pool.len(), + slots.len() + )); + } + for (slot, address) in slots.iter_mut().zip(config.egress_cidr_pool.iter()) { + slot.guest_datapath_address = Some(address.clone()); + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::{HostSlotConfig, apply_slot_config}; + use bf_inventory::VfSlot; + + #[test] + fn applies_reserved_indexes_pf_key_and_egress_pool() { + let mut slots = vec![ + VfSlot::new("vf0", "0000:03:00.2") + .with_pf("p0") + .with_vf_index(0), + VfSlot::new("vf1", "0000:03:00.3") + .with_pf("p0") + .with_vf_index(1), + ]; + let egress_pool = vec!["10.0.120.61/22".to_string()]; + let config = HostSlotConfig { + reserved_vf_indexes: &[0], + pf_key: Some("bf-a"), + egress_cidr_pool: &egress_pool, + }; + + apply_slot_config(&config, &mut slots).unwrap(); + + assert_eq!(slots.len(), 1); + assert_eq!(slots[0].host_bdf, 
"0000:03:00.3"); + assert_eq!(slots[0].pf.as_deref(), Some("bf-a")); + assert_eq!( + slots[0].guest_datapath_address.as_deref(), + Some("10.0.120.61/22") + ); + } + + #[test] + fn rejects_egress_pool_shorter_than_usable_slots() { + let mut slots = vec![ + VfSlot::new("vf0", "0000:03:00.2").with_vf_index(0), + VfSlot::new("vf1", "0000:03:00.3").with_vf_index(1), + ]; + let egress_pool = vec!["10.0.120.61/22".to_string()]; + let config = HostSlotConfig { + reserved_vf_indexes: &[], + pf_key: None, + egress_cidr_pool: &egress_pool, + }; + + let err = apply_slot_config(&config, &mut slots).unwrap_err(); + + assert!(err.contains("egress pool has 1 addresses for 2 usable VFs")); + } +} diff --git a/crates/openshell-driver-bluefield/bf-vm/src/state.rs b/crates/openshell-driver-bluefield/bf-vm/src/state.rs new file mode 100644 index 000000000..eee5ab86f --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/src/state.rs @@ -0,0 +1,106 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! BlueField VM extension runtime state persisted under the sandbox state dir. + +use std::path::{Path, PathBuf}; +use std::time::{SystemTime, UNIX_EPOCH}; + +use bf_inventory::VfSlot; +use serde::{Deserialize, Serialize}; + +use crate::lifecycle::{LifecycleError, LifecycleResult, extension_state_dir}; + +/// Name of this extension. Must match the module name (`bluefield`). +pub const EXTENSION_NAME: &str = "bluefield"; +const PCI_BIND_STATE_FILE: &str = "pci-bind-state.json"; + +/// Per-sandbox bookkeeping for reverse-order teardown. +#[derive(Debug, Clone)] +pub(crate) struct AttachmentRecord { + pub(crate) slot: VfSlot, +} + +/// Persisted record of the VF bound to a sandbox, for crash recovery. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct BluefieldPciBindState { + pub(crate) host_bdf: String, + pub(crate) sandbox_id: String, + #[serde(default)] + pub(crate) guest_mac: Option, + pub(crate) bound_at_ms: u128, +} + +pub(crate) fn persist_bind_state( + sandbox_id: &str, + sandbox_state_dir: &Path, + slot: &VfSlot, +) -> LifecycleResult<()> { + let path = bind_state_path(sandbox_state_dir)?; + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent).map_err(|err| { + LifecycleError::new(format!( + "create bluefield bind state dir {}: {err}", + parent.display() + )) + })?; + } + let state = BluefieldPciBindState { + host_bdf: slot.host_bdf.clone(), + sandbox_id: sandbox_id.to_string(), + guest_mac: slot.guest_mac.clone(), + bound_at_ms: now_millis(), + }; + let data = serde_json::to_string_pretty(&state) + .map_err(|err| LifecycleError::new(format!("serialize bluefield bind state: {err}")))?; + let tmp = path.with_extension("tmp"); + std::fs::write(&tmp, data) + .map_err(|err| LifecycleError::new(format!("write bluefield bind state {}: {err}", tmp.display())))?; + std::fs::rename(&tmp, &path).map_err(|err| { + LifecycleError::new(format!( + "commit bluefield bind state {}: {err}", + path.display() + )) + }) +} + +pub(crate) fn load_bind_state( + sandbox_id: &str, + sandbox_state_dir: &Path, +) -> LifecycleResult { + let path = bind_state_path(sandbox_state_dir)?; + let data = std::fs::read_to_string(&path) + .map_err(|err| LifecycleError::new(format!("read bluefield bind state {}: {err}", path.display())))?; + let state: BluefieldPciBindState = serde_json::from_str(&data) + .map_err(|err| LifecycleError::new(format!("parse bluefield bind state {}: {err}", path.display())))?; + if state.sandbox_id != sandbox_id { + return Err(LifecycleError::new(format!( + "bluefield bind state sandbox mismatch: expected {sandbox_id}, got {}", + state.sandbox_id + ))); + } + Ok(state) +} + +pub(crate) fn remove_bind_state(sandbox_state_dir: 
&Path) -> LifecycleResult<()> { + let path = bind_state_path(sandbox_state_dir)?; + match std::fs::remove_file(&path) { + Ok(()) => Ok(()), + Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(()), + Err(err) => Err(LifecycleError::new(format!( + "remove bluefield bind state {}: {err}", + path.display() + ))), + } +} + +fn bind_state_path(sandbox_state_dir: &Path) -> LifecycleResult { + Ok(extension_state_dir(sandbox_state_dir, EXTENSION_NAME)?.join(PCI_BIND_STATE_FILE)) +} + +pub(crate) fn now_millis() -> u128 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_millis() +} diff --git a/crates/openshell-driver-bluefield/bf-vm/src/vf.rs b/crates/openshell-driver-bluefield/bf-vm/src/vf.rs new file mode 100644 index 000000000..d2d200dff --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/src/vf.rs @@ -0,0 +1,137 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Thin BlueField-side wrapper over the `openshell-vfio` substrate. +//! +//! Binding actually mutates host sysfs, so these helpers are kept behind +//! explicit calls the extension makes in `before_launch` / restore, never at +//! construction time. + +use openshell_vfio::{ + PciBindGuard, SysfsRoot, prepare_pci_for_passthrough, release_pci_from_passthrough, + validate_pci_for_passthrough, +}; + +use bf_inventory::VfSlot; + +/// Host capability probe for VF passthrough. Injectable so tests (and hosts +/// without the device) don't need real hardware. Implementations check that +/// *this* host can actually pass a given BDF through to a guest. +pub trait HostReadiness: std::fmt::Debug + Send + Sync { + /// Returns `Err(reason)` if the host cannot pass `host_bdf` through + /// (IOMMU disabled, device missing, IOMMU-group conflict, ...). 
+ fn check_passthrough(&self, host_bdf: &str) -> Result<(), String>; +} + +/// Default [`HostReadiness`] backed by the real `/sys` via `openshell-vfio`. +#[derive(Debug)] +pub struct SysfsHostReadiness { + sysfs: SysfsRoot, +} + +impl SysfsHostReadiness { + #[must_use] + pub fn new(sysfs: SysfsRoot) -> Self { + Self { sysfs } + } +} + +impl Default for SysfsHostReadiness { + fn default() -> Self { + Self { + sysfs: SysfsRoot::system(), + } + } +} + +impl HostReadiness for SysfsHostReadiness { + fn check_passthrough(&self, host_bdf: &str) -> Result<(), String> { + validate_pci_for_passthrough(&self.sysfs, host_bdf).map_err(|err| err.to_string()) + } +} + +pub(crate) trait VfBinding: std::fmt::Debug + Send { + fn disarm(self: Box); +} + +#[derive(Debug)] +struct RealVfBinding(PciBindGuard); + +impl VfBinding for RealVfBinding { + fn disarm(self: Box) { + let Self(guard) = *self; + guard.disarm(); + } +} + +pub(crate) trait VfBinder: std::fmt::Debug + Send + Sync { + fn bind_slot(&self, slot: &VfSlot) -> Result, String>; + fn adopt_slot(&self, slot: &VfSlot) -> Result, String>; + fn release_slot(&self, slot: &VfSlot) -> Result<(), String>; +} + +#[derive(Debug, Clone)] +pub(crate) struct SysfsVfBinder { + sysfs: SysfsRoot, +} + +impl SysfsVfBinder { + pub(crate) fn new(sysfs: SysfsRoot) -> Self { + Self { sysfs } + } +} + +impl Default for SysfsVfBinder { + fn default() -> Self { + Self::new(SysfsRoot::system()) + } +} + +impl VfBinder for SysfsVfBinder { + fn bind_slot(&self, slot: &VfSlot) -> Result, String> { + bind_slot(&self.sysfs, slot) + .map(|guard| { + let binding: Box = Box::new(RealVfBinding(guard)); + binding + }) + .map_err(|err| err.to_string()) + } + + fn adopt_slot(&self, slot: &VfSlot) -> Result, String> { + adopt_slot(&self.sysfs, slot) + .map(|guard| { + let binding: Box = Box::new(RealVfBinding(guard)); + binding + }) + .map_err(|err| err.to_string()) + } + + fn release_slot(&self, slot: &VfSlot) -> Result<(), String> { + release_slot(&self.sysfs, 
slot).map_err(|err| err.to_string()) + } +} + +/// Bind a claimed VF slot to `vfio-pci`, returning the RAII guard. +/// +/// The caller is expected to `disarm()` the guard once QEMU owns the device +/// and to persist the binding for restart reconciliation. +pub fn bind_slot( + sysfs: &SysfsRoot, + slot: &VfSlot, +) -> Result { + prepare_pci_for_passthrough(sysfs, &slot.host_bdf) +} + +/// Re-take ownership of a VF already bound to `vfio-pci` after a driver +/// restart, without rebinding or mutating sysfs. +pub fn adopt_slot( + sysfs: &SysfsRoot, + slot: &VfSlot, +) -> Result { + PciBindGuard::adopt(sysfs, &slot.host_bdf) +} + +/// Restore a VF slot's device to its host driver at teardown time. +pub fn release_slot(sysfs: &SysfsRoot, slot: &VfSlot) -> Result<(), openshell_vfio::VfioError> { + release_pci_from_passthrough(sysfs, &slot.host_bdf) +} From 36d24b040b65fd3459bc55ab7e5c26dae2b81d5f Mon Sep 17 00:00:00 2001 From: Patrick Riel Date: Fri, 12 Jun 2026 19:03:18 +0000 Subject: [PATCH 04/14] feat(bluefield): add bf-driver compute driver binary bf-driver is the external compute driver process. It parses the BlueField CLI/env surface, installs the bf-vm lifecycle extension for the workload-running roles, and serves the ComputeDriver gRPC API over an authenticated Unix socket (or unauthenticated TCP for local dev). 
--- Cargo.lock | 59 ++ .../bf-driver/Cargo.toml | 32 + .../bf-driver/src/main.rs | 758 ++++++++++++++++++ 3 files changed, 849 insertions(+) create mode 100644 crates/openshell-driver-bluefield/bf-driver/Cargo.toml create mode 100644 crates/openshell-driver-bluefield/bf-driver/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 366f001a6..6c380e36b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -421,6 +421,61 @@ dependencies = [ "sha2 0.10.9", ] +[[package]] +name = "bf-core" +version = "0.0.0" +dependencies = [ + "async-trait", + "serde", + "serde_json", + "tokio", +] + +[[package]] +name = "bf-driver" +version = "0.0.0" +dependencies = [ + "bf-vm", + "clap", + "futures", + "miette", + "openshell-core", + "openshell-driver-vm", + "rustix 1.1.4", + "tempfile", + "tokio", + "tonic", + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "bf-inventory" +version = "0.0.0" +dependencies = [ + "bf-core", + "openshell-vfio", +] + +[[package]] +name = "bf-vm" +version = "0.0.0" +dependencies = [ + "bf-core", + "bf-inventory", + "clap", + "openshell-core", + "openshell-driver-vm", + "openshell-vfio", + "serde", + "serde_json", + "sha2 0.10.9", + "tempfile", + "tokio", + "tonic", + "tracing", +] + [[package]] name = "bindgen" version = "0.72.1" @@ -3392,6 +3447,10 @@ dependencies = [ "url", ] +[[package]] +name = "openshell-driver-bluefield" +version = "0.0.0" + [[package]] name = "openshell-driver-docker" version = "0.0.0" diff --git a/crates/openshell-driver-bluefield/bf-driver/Cargo.toml b/crates/openshell-driver-bluefield/bf-driver/Cargo.toml new file mode 100644 index 000000000..6eb801772 --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-driver/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "bf-driver" +description = "External BlueField compute driver for OpenShell" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +publish = false + +[[bin]] +name = "openshell-driver-bluefield" +path = 
"src/main.rs" + +[features] +default = [] + +[dependencies] +bf-vm = { path = "../bf-vm" } + +clap = { workspace = true } +futures = { workspace = true } +miette = { workspace = true } +openshell-core = { path = "../../openshell-core", default-features = false } +openshell-driver-vm = { path = "../../openshell-driver-vm" } +rustix = { workspace = true } +tokio = { workspace = true } +tonic = { workspace = true, features = ["transport"] } +tracing = { workspace = true } +tracing-subscriber = { workspace = true } + +[dev-dependencies] +tempfile = "3" diff --git a/crates/openshell-driver-bluefield/bf-driver/src/main.rs b/crates/openshell-driver-bluefield/bf-driver/src/main.rs new file mode 100644 index 000000000..0024f335f --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-driver/src/main.rs @@ -0,0 +1,758 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use bf_vm::{ + BluefieldDriverArgs, BluefieldDriverConfig, + extensions::{ExtensionRuntimeConfig, build_lifecycle_extensions}, +}; +use clap::Parser; +use futures::Stream; +use miette::{IntoDiagnostic, Result, miette}; +use openshell_core::VERSION; +use openshell_core::proto::compute::v1::compute_driver_server::{ + ComputeDriver, ComputeDriverServer, +}; +#[cfg(target_os = "macos")] +use openshell_driver_vm::{VM_RUNTIME_DIR_ENV, configured_runtime_dir}; +use openshell_driver_vm::{VmBackend, VmDriver, VmDriverConfig, VmLaunchConfig, procguard, run_vm}; +use std::io; +use std::net::SocketAddr; +use std::os::unix::fs::{FileTypeExt, MetadataExt, PermissionsExt}; +use std::path::{Path, PathBuf}; +use std::pin::Pin; +use std::task::{Context, Poll}; +use tokio::net::{UnixListener, UnixStream}; +use tracing::info; +use tracing_subscriber::EnvFilter; + +#[derive(Parser, Debug)] +#[command(name = "openshell-driver-bluefield")] +#[command(version = VERSION)] +#[allow(clippy::struct_excessive_bools)] +struct Args { + 
#[arg(long, hide = true, default_value_t = false)] + internal_run_vm: bool, + + #[arg(long = "vm-root-disk", hide = true, alias = "vm-rootfs")] + vm_root_disk: Option, + + #[arg(long = "vm-overlay-disk", hide = true)] + vm_overlay_disk: Option, + + #[arg(long = "vm-image-disk", hide = true)] + vm_image_disk: Option, + + #[arg(long = "vm-kernel-image", hide = true)] + vm_kernel_image: Option, + + #[arg(long, hide = true)] + vm_exec: Option, + + #[arg(long, hide = true, default_value = "/")] + vm_workdir: String, + + #[arg(long, hide = true)] + vm_env: Vec, + + #[arg(long, hide = true)] + vm_console_output: Option, + + #[arg(long, hide = true, default_value_t = 2)] + vm_vcpus: u8, + + #[arg(long, hide = true, default_value_t = 2048)] + vm_mem_mib: u32, + + #[arg(long, hide = true, default_value_t = 1)] + vm_krun_log_level: u32, + + #[arg(long, env = "OPENSHELL_COMPUTE_DRIVER_BIND")] + bind_address: Option, + + #[arg(long, env = "OPENSHELL_COMPUTE_DRIVER_SOCKET")] + bind_socket: Option, + + #[arg(long, hide = true)] + expected_peer_pid: Option, + + #[arg( + long, + env = "OPENSHELL_COMPUTE_DRIVER_ALLOW_UNAUTHENTICATED_TCP", + default_value_t = false + )] + allow_unauthenticated_tcp: bool, + + #[arg( + long, + env = "OPENSHELL_COMPUTE_DRIVER_ALLOW_SAME_UID_PEER", + default_value_t = false + )] + allow_same_uid_peer: bool, + + #[arg(long, env = "OPENSHELL_LOG_LEVEL", default_value = "info")] + log_level: String, + + #[arg(long, env = "OPENSHELL_GRPC_ENDPOINT")] + openshell_endpoint: Option, + + #[arg(long, env = "OPENSHELL_SANDBOX_IMAGE", default_value = "")] + default_image: String, + + #[arg(long, env = "OPENSHELL_VM_BOOTSTRAP_IMAGE", default_value = "")] + bootstrap_image: String, + + #[arg( + long, + env = "OPENSHELL_VM_DRIVER_STATE_DIR", + default_value = "target/openshell-vm-driver" + )] + state_dir: PathBuf, + + #[arg(long = "guest-tls-ca", env = "OPENSHELL_VM_TLS_CA")] + guest_tls_ca: Option, + + #[arg(long = "guest-tls-cert", env = "OPENSHELL_VM_TLS_CERT")] + 
guest_tls_cert: Option, + + #[arg(long = "guest-tls-key", env = "OPENSHELL_VM_TLS_KEY")] + guest_tls_key: Option, + + #[arg(long, env = "OPENSHELL_VM_KRUN_LOG_LEVEL", default_value_t = 1)] + krun_log_level: u32, + + #[arg(long, env = "OPENSHELL_VM_DRIVER_VCPUS", default_value_t = 2)] + vcpus: u8, + + #[arg(long, env = "OPENSHELL_VM_DRIVER_MEM_MIB", default_value_t = 2048)] + mem_mib: u32, + + #[arg(long, env = "OPENSHELL_VM_OVERLAY_DISK_MIB", default_value_t = 4096)] + overlay_disk_mib: u64, + + #[arg(long, env = "OPENSHELL_VM_GPU")] + gpu: bool, + + #[arg(long, env = "OPENSHELL_VM_GPU_MEM_MIB", default_value_t = 8192)] + gpu_mem_mib: u32, + + #[arg(long, env = "OPENSHELL_VM_GPU_VCPUS", default_value_t = 4)] + gpu_vcpus: u8, + + #[command(flatten)] + bluefield: BluefieldDriverArgs, + + #[arg(long, hide = true)] + vm_backend: Option, + + #[arg(long, hide = true)] + vm_gpu_bdf: Option, + + #[arg(long, hide = true)] + vm_tap_device: Option, + + #[arg(long, hide = true)] + vm_guest_ip: Option, + + #[arg(long, hide = true)] + vm_host_ip: Option, + + #[arg(long, hide = true)] + vm_vsock_cid: Option, + + #[arg(long, hide = true)] + vm_guest_mac: Option, + + #[arg(long, hide = true)] + vm_gateway_port: Option, +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + if args.internal_run_vm { + // We intentionally defer procguard arming until `run_vm()` so + // that the only arm is the one that knows how to clean up + // gvproxy. Racing two watchers against the same parent-death + // event causes the bare arm's `exit(1)` to win, skipping the + // gvproxy cleanup and leaking the helper. The risk window + // before `run_vm` arms procguard is ~a few syscalls long + // (`build_vm_launch_config`, `configured_runtime_dir`), which + // is negligible next to the parent gRPC server's uptime. 
+ maybe_reexec_internal_vm_with_runtime_env()?; + let config = build_vm_launch_config(&args).map_err(|err| miette::miette!("{err}"))?; + run_vm(&config).map_err(|err| miette::miette!("{err}"))?; + return Ok(()); + } + + tracing_subscriber::fmt() + .with_env_filter( + EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&args.log_level)), + ) + .init(); + + let listen_mode = compute_driver_listen_mode(&args).map_err(|err| miette::miette!("{err}"))?; + + // Arm procguard so that if the gateway is killed (SIGKILL or crash) + // we also die. Without this the driver is reparented to init and + // keeps its per-sandbox VM launchers alive forever. Launchers have + // their own procguards (armed in `run_vm`) which cascade cleanup of + // gvproxy and the libkrun worker the moment this driver exits. + if let Err(err) = procguard::die_with_parent() { + tracing::warn!( + error = %err, + "procguard arm failed; gateway crashes may orphan this driver" + ); + } + + let bluefield_config = args + .bluefield + .to_driver_config(args.openshell_endpoint.clone()) + .map_err(|err| miette::miette!("{err}"))?; + + // This stage runs the workload-side VM driver that binds a VF per + // sandbox. The leader/control-plane role is layered on in a later stage. + let driver = build_vm_driver(&args, bluefield_config).await?; + serve_compute_driver(driver, listen_mode).await +} + +/// Build the workload-running VM driver (all-in-one or compute-node role). 
+async fn build_vm_driver(args: &Args, bluefield: BluefieldDriverConfig) -> Result { + let openshell_endpoint = args + .openshell_endpoint + .clone() + .ok_or_else(|| miette!("OPENSHELL_GRPC_ENDPOINT is required"))?; + let extension_config = ExtensionRuntimeConfig { bluefield }; + let lifecycle_extensions = + build_lifecycle_extensions(&extension_config).map_err(|err| miette!("{err}"))?; + + VmDriver::new_with_extensions( + VmDriverConfig { + openshell_endpoint, + state_dir: args.state_dir.clone(), + launcher_bin: None, + default_image: args.default_image.clone(), + bootstrap_image: args.bootstrap_image.clone(), + log_level: args.log_level.clone(), + krun_log_level: args.krun_log_level, + vcpus: args.vcpus, + mem_mib: args.mem_mib, + overlay_disk_mib: args.overlay_disk_mib, + guest_tls_ca: args.guest_tls_ca.clone(), + guest_tls_cert: args.guest_tls_cert.clone(), + guest_tls_key: args.guest_tls_key.clone(), + gpu_enabled: args.gpu, + gpu_mem_mib: args.gpu_mem_mib, + gpu_vcpus: args.gpu_vcpus, + }, + lifecycle_extensions, + ) + .await + .map_err(|err| miette!("{err}")) +} + +/// Serve any `ComputeDriver` over the selected listener. Shared by every role +/// so the leader and the workload driver are served identically. 
+async fn serve_compute_driver(driver: T, listen_mode: ComputeDriverListenMode) -> Result<()> +where + T: ComputeDriver, +{ + match listen_mode { + ComputeDriverListenMode::Unix { + socket_path, + expected_peer_pid, + } => { + prepare_compute_driver_socket(&socket_path).map_err(|err| miette!("{err}"))?; + + info!(socket = %socket_path.display(), "Starting vm compute driver"); + let listener = UnixListener::bind(&socket_path).into_diagnostic()?; + restrict_socket_permissions(&socket_path).map_err(|err| miette!("{err}"))?; + let result = tonic::transport::Server::builder() + .add_service(ComputeDriverServer::new(driver)) + .serve_with_incoming(AuthenticatedUnixIncoming::new(listener, expected_peer_pid)) + .await + .into_diagnostic(); + let _ = std::fs::remove_file(&socket_path); + result + } + ComputeDriverListenMode::Tcp(bind_address) => { + info!(address = %bind_address, "Starting unauthenticated dev vm compute driver"); + tonic::transport::Server::builder() + .add_service(ComputeDriverServer::new(driver)) + .serve(bind_address) + .await + .into_diagnostic() + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +enum ComputeDriverListenMode { + Unix { + socket_path: PathBuf, + expected_peer_pid: Option, + }, + Tcp(SocketAddr), +} + +fn compute_driver_listen_mode(args: &Args) -> std::result::Result { + if let Some(socket_path) = args.bind_socket.clone() { + if args.expected_peer_pid.is_none() && !args.allow_same_uid_peer { + return Err( + "--expected-peer-pid is required with --bind-socket; use --allow-same-uid-peer only for local development" + .to_string(), + ); + } + return Ok(ComputeDriverListenMode::Unix { + socket_path, + expected_peer_pid: args.expected_peer_pid, + }); + } + + if !args.allow_unauthenticated_tcp { + return Err( + "--bind-socket is required; unauthenticated TCP mode is disabled unless --allow-unauthenticated-tcp is set for local development" + .to_string(), + ); + } + + let Some(bind_address) = args.bind_address else { + return 
Err("--bind-address is required with --allow-unauthenticated-tcp".to_string()); + }; + + Ok(ComputeDriverListenMode::Tcp(bind_address)) +} + +fn prepare_compute_driver_socket(socket_path: &Path) -> std::result::Result<(), String> { + let Some(parent) = socket_path.parent() else { + return Err(format!( + "vm compute driver socket path '{}' has no parent directory", + socket_path.display() + )); + }; + let expected_uid = current_euid(); + prepare_private_socket_dir(parent, expected_uid)?; + remove_stale_socket(socket_path, expected_uid) +} + +fn current_euid() -> u32 { + rustix::process::geteuid().as_raw() +} + +fn prepare_private_socket_dir( + socket_dir: &Path, + expected_uid: u32, +) -> std::result::Result<(), String> { + std::fs::create_dir_all(socket_dir) + .map_err(|err| format!("create socket dir {}: {err}", socket_dir.display()))?; + let metadata = std::fs::symlink_metadata(socket_dir) + .map_err(|err| format!("stat socket dir {}: {err}", socket_dir.display()))?; + let file_type = metadata.file_type(); + if file_type.is_symlink() { + return Err(format!( + "socket dir {} is a symlink; refusing to use it", + socket_dir.display() + )); + } + if !file_type.is_dir() { + return Err(format!( + "socket dir {} is not a directory", + socket_dir.display() + )); + } + if metadata.uid() != expected_uid { + return Err(format!( + "socket dir {} is owned by uid {} but current euid is {}", + socket_dir.display(), + metadata.uid(), + expected_uid + )); + } + std::fs::set_permissions(socket_dir, std::fs::Permissions::from_mode(0o700)) + .map_err(|err| format!("chmod socket dir {}: {err}", socket_dir.display())) +} + +fn remove_stale_socket(socket_path: &Path, expected_uid: u32) -> std::result::Result<(), String> { + let metadata = match std::fs::symlink_metadata(socket_path) { + Ok(metadata) => metadata, + Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(()), + Err(err) => return Err(format!("stat socket {}: {err}", socket_path.display())), + }; + let file_type = 
metadata.file_type(); + if file_type.is_symlink() { + return Err(format!( + "socket {} is a symlink; refusing to remove it", + socket_path.display() + )); + } + if metadata.uid() != expected_uid { + return Err(format!( + "socket {} is owned by uid {} but current euid is {}", + socket_path.display(), + metadata.uid(), + expected_uid + )); + } + if !file_type.is_socket() { + return Err(format!( + "socket path {} exists but is not a Unix socket", + socket_path.display() + )); + } + std::fs::remove_file(socket_path) + .map_err(|err| format!("remove stale socket {}: {err}", socket_path.display())) +} + +fn restrict_socket_permissions(socket_path: &Path) -> std::result::Result<(), String> { + std::fs::set_permissions(socket_path, std::fs::Permissions::from_mode(0o600)) + .map_err(|err| format!("chmod socket {}: {err}", socket_path.display())) +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +struct PeerCredentials { + uid: u32, + pid: Option, +} + +fn peer_credentials(stream: &UnixStream) -> std::result::Result { + let credentials = stream + .peer_cred() + .map_err(|err| format!("read peer credentials: {err}"))?; + Ok(PeerCredentials { + uid: credentials.uid(), + pid: credentials.pid(), + }) +} + +fn authorize_peer_credentials( + peer: PeerCredentials, + driver_uid: u32, + gateway_pid: Option, +) -> std::result::Result<(), String> { + if peer.uid != driver_uid { + return Err(format!( + "peer uid {} does not match current euid {}", + peer.uid, driver_uid + )); + } + let Some(gateway_pid) = gateway_pid else { + return Ok(()); + }; + let Some(peer_process_id) = peer.pid.and_then(|pid| u32::try_from(pid).ok()) else { + return Err(format!( + "peer pid is unavailable; expected gateway pid {gateway_pid}" + )); + }; + if peer_process_id != gateway_pid { + return Err(format!( + "peer pid {peer_process_id} does not match expected gateway pid {gateway_pid}" + )); + } + Ok(()) +} + +struct AuthenticatedUnixIncoming { + listener: UnixListener, + expected_uid: u32, + 
expected_peer_pid: Option, +} + +impl AuthenticatedUnixIncoming { + fn new(listener: UnixListener, expected_peer_pid: Option) -> Self { + Self { + listener, + expected_uid: current_euid(), + expected_peer_pid, + } + } +} + +impl Stream for AuthenticatedUnixIncoming { + type Item = io::Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + loop { + match this.listener.poll_accept(cx) { + Poll::Ready(Ok((stream, _addr))) => { + let authorized = peer_credentials(&stream).and_then(|peer| { + authorize_peer_credentials(peer, this.expected_uid, this.expected_peer_pid) + }); + match authorized { + Ok(()) => return Poll::Ready(Some(Ok(stream))), + Err(err) => { + tracing::warn!( + error = %err, + "rejected vm compute driver UDS client" + ); + } + } + } + Poll::Ready(Err(err)) => return Poll::Ready(Some(Err(err))), + Poll::Pending => return Poll::Pending, + } + } + } +} + +fn build_vm_launch_config(args: &Args) -> std::result::Result { + let root_disk = args + .vm_root_disk + .clone() + .ok_or_else(|| "--vm-root-disk is required in internal VM mode".to_string())?; + let overlay_disk = args + .vm_overlay_disk + .clone() + .ok_or_else(|| "--vm-overlay-disk is required in internal VM mode".to_string())?; + let image_disk = args.vm_image_disk.clone(); + let exec_path = args + .vm_exec + .clone() + .ok_or_else(|| "--vm-exec is required in internal VM mode".to_string())?; + let console_output = args + .vm_console_output + .clone() + .ok_or_else(|| "--vm-console-output is required in internal VM mode".to_string())?; + + let backend = match args.vm_backend.as_deref() { + Some("qemu") => VmBackend::Qemu, + Some("libkrun") | None => VmBackend::Libkrun, + Some(other) => return Err(format!("unknown VM backend: {other}")), + }; + + Ok(VmLaunchConfig { + root_disk, + overlay_disk, + image_disk, + kernel_image: args.vm_kernel_image.clone(), + vcpus: args.vm_vcpus, + mem_mib: args.vm_mem_mib, + exec_path, + args: Vec::new(), + env: 
args.vm_env.clone(), + workdir: args.vm_workdir.clone(), + log_level: args.vm_krun_log_level, + console_output, + backend, + gpu_bdf: args.vm_gpu_bdf.clone(), + tap_device: args.vm_tap_device.clone(), + guest_ip: args.vm_guest_ip.clone(), + host_ip: args.vm_host_ip.clone(), + vsock_cid: args.vm_vsock_cid, + guest_mac: args.vm_guest_mac.clone(), + gateway_port: args.vm_gateway_port, + }) +} + +#[cfg(target_os = "macos")] +fn maybe_reexec_internal_vm_with_runtime_env() -> Result<()> { + use std::os::unix::process::CommandExt as _; + + const REEXEC_ENV: &str = "__OPENSHELL_DRIVER_VM_REEXEC"; + + if std::env::var_os(REEXEC_ENV).is_some() { + return Ok(()); + } + + let runtime_dir = configured_runtime_dir().map_err(|err| miette::miette!("{err}"))?; + let runtime_str = runtime_dir.to_string_lossy(); + let needs_reexec = std::env::var_os("DYLD_LIBRARY_PATH") + .is_none_or(|value| !value.to_string_lossy().contains(runtime_str.as_ref())); + if !needs_reexec { + return Ok(()); + } + + let mut dyld_paths = vec![runtime_dir.clone()]; + if let Some(existing) = std::env::var_os("DYLD_LIBRARY_PATH") { + dyld_paths.extend(std::env::split_paths(&existing)); + } + let joined = std::env::join_paths(&dyld_paths) + .map_err(|err| miette::miette!("join DYLD_LIBRARY_PATH: {err}"))?; + let exe = std::env::current_exe().into_diagnostic()?; + let args: Vec = std::env::args().skip(1).collect(); + + // Use execvp() so the current process is *replaced* by the re-exec'd + // binary — no wrapper process sits between the compute driver and + // the actually-running VM launcher. That avoids two problems: + // 1. An extra process level that survives SIGKILL of the driver + // (the wrapper was reparenting the re-exec'd child to init). + // 2. Signal forwarding: with a wrapper, a SIGTERM to the wrapper + // doesn't reach the child unless we hand-roll forwarding. + // After exec, the child inherits our PID and our procguard arming. 
+ let err = std::process::Command::new(exe) + .args(&args) + .env("DYLD_LIBRARY_PATH", &joined) + .env(VM_RUNTIME_DIR_ENV, runtime_dir) + .env(REEXEC_ENV, "1") + .exec(); + // `exec()` only returns on failure. + Err(miette::miette!("failed to re-exec with runtime env: {err}")) +} + +#[cfg(not(target_os = "macos"))] +// Signature must match the macOS variant which can fail. +#[allow(clippy::unnecessary_wraps)] +fn maybe_reexec_internal_vm_with_runtime_env() -> Result<()> { + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::{ + Args, ComputeDriverListenMode, PeerCredentials, authorize_peer_credentials, + compute_driver_listen_mode, + }; + use clap::Parser; + use std::path::PathBuf; + + #[test] + fn peer_authorization_accepts_matching_uid_and_pid() { + authorize_peer_credentials( + PeerCredentials { + uid: 1000, + pid: Some(42), + }, + 1000, + Some(42), + ) + .unwrap(); + } + + #[test] + fn peer_authorization_rejects_wrong_pid() { + let err = authorize_peer_credentials( + PeerCredentials { + uid: 1000, + pid: Some(7), + }, + 1000, + Some(42), + ) + .expect_err("wrong pid should be rejected"); + assert!(err.contains("does not match expected gateway pid")); + } + + #[test] + fn peer_authorization_rejects_wrong_uid() { + let err = authorize_peer_credentials( + PeerCredentials { + uid: 1001, + pid: Some(42), + }, + 1000, + Some(42), + ) + .expect_err("wrong uid should be rejected"); + assert!(err.contains("does not match current euid")); + } + + #[test] + fn peer_authorization_rejects_missing_pid_when_expected() { + let err = authorize_peer_credentials( + PeerCredentials { + uid: 1000, + pid: None, + }, + 1000, + Some(42), + ) + .expect_err("missing pid should be rejected"); + assert!(err.contains("peer pid is unavailable")); + } + + #[test] + fn peer_authorization_accepts_matching_uid_without_expected_pid() { + authorize_peer_credentials( + PeerCredentials { + uid: 1000, + pid: None, + }, + 1000, + None, + ) + .unwrap(); + } + + #[test] + fn 
listen_mode_rejects_default_tcp() { + let args = Args::parse_from(["openshell-driver-vm"]); + let err = compute_driver_listen_mode(&args).expect_err("default TCP should be disabled"); + assert!(err.contains("--bind-socket is required")); + } + + #[test] + fn listen_mode_rejects_bind_address_without_tcp_opt_in() { + let args = Args::parse_from(["openshell-driver-vm", "--bind-address", "127.0.0.1:50061"]); + let err = + compute_driver_listen_mode(&args).expect_err("TCP bind should require explicit opt-in"); + assert!(err.contains("--allow-unauthenticated-tcp")); + } + + #[test] + fn listen_mode_requires_bind_address_with_tcp_opt_in() { + let args = Args::parse_from(["openshell-driver-vm", "--allow-unauthenticated-tcp"]); + let err = + compute_driver_listen_mode(&args).expect_err("TCP opt-in should require an address"); + assert!(err.contains("--bind-address is required")); + } + + #[test] + fn listen_mode_accepts_explicit_unauthenticated_tcp() { + let args = Args::parse_from([ + "openshell-driver-vm", + "--allow-unauthenticated-tcp", + "--bind-address", + "127.0.0.1:50061", + ]); + assert_eq!( + compute_driver_listen_mode(&args).unwrap(), + ComputeDriverListenMode::Tcp("127.0.0.1:50061".parse().unwrap()) + ); + } + + #[test] + fn listen_mode_requires_expected_peer_pid_for_uds() { + let args = Args::parse_from([ + "openshell-driver-vm", + "--bind-socket", + "/tmp/compute-driver.sock", + ]); + let err = compute_driver_listen_mode(&args) + .expect_err("UDS should require gateway peer pid by default"); + assert!(err.contains("--expected-peer-pid is required")); + } + + #[test] + fn listen_mode_accepts_uds_with_expected_peer_pid() { + let args = Args::parse_from([ + "openshell-driver-vm", + "--bind-socket", + "/tmp/compute-driver.sock", + "--expected-peer-pid", + "42", + ]); + assert_eq!( + compute_driver_listen_mode(&args).unwrap(), + ComputeDriverListenMode::Unix { + socket_path: PathBuf::from("/tmp/compute-driver.sock"), + expected_peer_pid: Some(42), + } + ); + } + + 
#[test] + fn listen_mode_accepts_explicit_same_uid_uds_dev_mode() { + let args = Args::parse_from([ + "openshell-driver-vm", + "--bind-socket", + "/tmp/compute-driver.sock", + "--allow-same-uid-peer", + ]); + assert_eq!( + compute_driver_listen_mode(&args).unwrap(), + ComputeDriverListenMode::Unix { + socket_path: PathBuf::from("/tmp/compute-driver.sock"), + expected_peer_pid: None, + } + ); + } +} From 29a76cd2215e126471e08ac120a7d3514cd38cf5 Mon Sep 17 00:00:00 2001 From: Patrick Riel Date: Fri, 12 Jun 2026 19:59:56 +0000 Subject: [PATCH 05/14] feat(vm): pass host PCI devices into the guest via a generic resource seam Introduce a generic guest-resource model on LaunchPlan so lifecycle extensions can request host PCI passthrough without growing the shared plan type per device class: - lifecycle: LaunchPlan now carries an opaque `resources: Vec<GuestResource>` with an `add_resource` writer. `GuestResource::PciPassthrough` is the only variant today; new kinds (e.g. volume mounts) become new variants without touching the plan's shape or its constructors. - runtime: render a `pcie-root-port` + `vfio-pci` pair per passthrough device and make the GPU device block optional, so a sandbox can carry a GPU plus one or more VF NICs at once. - driver: relax the non-GPU QEMU guard when a concrete passthrough device backs the launch, and forward each device to the launched child via `--vm-pci-passthrough`. The PCI-specific projection lives in the driver layer (which renders QEMU), keeping LaunchPlan generic; the exhaustive match forces that site to be revisited when a variant is added. Wire the BlueField VM extension onto the seam: declare the claimed VF as a passthrough device in `configure_launch` (so the backend resolves to QEMU and the guard sees a concrete device) and bind it in `before_launch`, attaching it to the guest as an egress NIC alongside any GPU.
--- .../bf-driver/src/main.rs | 4 + .../bf-vm/src/extension.rs | 45 ++++++++- .../bf-vm/src/kernel.rs | 1 + crates/openshell-driver-vm/src/driver.rs | 93 ++++++++++++++++--- crates/openshell-driver-vm/src/lib.rs | 6 +- crates/openshell-driver-vm/src/lifecycle.rs | 45 +++++++++ crates/openshell-driver-vm/src/main.rs | 4 + crates/openshell-driver-vm/src/runtime.rs | 63 +++++++++++-- 8 files changed, 230 insertions(+), 31 deletions(-) diff --git a/crates/openshell-driver-bluefield/bf-driver/src/main.rs b/crates/openshell-driver-bluefield/bf-driver/src/main.rs index 0024f335f..353a26bd3 100644 --- a/crates/openshell-driver-bluefield/bf-driver/src/main.rs +++ b/crates/openshell-driver-bluefield/bf-driver/src/main.rs @@ -147,6 +147,9 @@ struct Args { #[arg(long, hide = true)] vm_gpu_bdf: Option, + #[arg(long = "vm-pci-passthrough", hide = true)] + vm_pci_passthrough: Vec, + #[arg(long, hide = true)] vm_tap_device: Option, @@ -534,6 +537,7 @@ fn build_vm_launch_config(args: &Args) -> std::result::Result LifecycleResult<()> { @@ -197,6 +198,16 @@ impl LifecycleExtension for BluefieldExtension { plan.require_backend_feature(BackendFeature::GuestInitDropins); plan.guest_init_dropins.push(guest_egress::dropin()); + // Declare the VF as a passthrough device now (before the backend is + // resolved and validated) so the driver promotes the launch to QEMU + // and the non-GPU launch guard sees a concrete device backing. The + // actual vfio-pci bind happens in `before_launch`; the claim is + // idempotent per sandbox, so claiming here and there is safe. + let slot = self.claim_slot(&sandbox.id)?; + plan.add_resource(GuestResource::PciPassthrough(PciPassthroughDevice::new( + slot.host_bdf, + ))); + // Select the BlueField guest kernel + load its VF driver modules so the // assigned VF is not an inert PCI function in the guest. 
if let Some(kernel) = &self.kernel { @@ -253,6 +264,12 @@ impl LifecycleExtension for BluefieldExtension { self.record_attachment(&sandbox.id, AttachmentRecord { slot: slot.clone() }); + // Attach the bound VF to the guest as a passthrough device (idempotent + // with the declaration made in `configure_launch`). + plan.add_resource(GuestResource::PciPassthrough(PciPassthroughDevice::new( + slot.host_bdf.clone(), + ))); + if let Some(egress) = &self.egress { plan.env.extend(egress.env(&slot)); } @@ -379,6 +396,15 @@ mod tests { )) } + fn passthrough_bdfs(plan: &LaunchPlan) -> Vec<&str> { + plan.resources + .iter() + .map(|resource| match resource { + GuestResource::PciPassthrough(device) => device.host_bdf.as_str(), + }) + .collect() + } + fn sample_plan() -> LaunchPlan { LaunchPlan { backend: crate::runtime::VmBackend::Qemu, @@ -397,6 +423,7 @@ mod tests { gateway_port: None, guest_init_dropins: Vec::new(), env: Vec::new(), + resources: Vec::new(), } } @@ -428,6 +455,7 @@ mod tests { .iter() .any(|e| e == "OPENSHELL_VM_DATA_IP=10.0.120.10/22") ); + assert_eq!(passthrough_bdfs(&plan), vec!["0000:03:00.2"]); let bind_state = state::load_bind_state("sandbox-1", &state).unwrap(); assert_eq!(bind_state.host_bdf, "0000:03:00.2"); @@ -484,8 +512,8 @@ mod tests { } #[tokio::test] - async fn configure_launch_selects_bluefield_kernel() { - let extension = BluefieldExtension::new(VfPool::new([])) + async fn configure_launch_selects_kernel_and_declares_vf_passthrough() { + let extension = BluefieldExtension::new(VfPool::new([VfSlot::new("vf0", "0000:03:00.2")])) .with_kernel(BluefieldKernel::from_image("/opt/openshell/kernels/bf-vmlinux")); let mut plan = sample_plan(); @@ -502,6 +530,13 @@ mod tests { plan.required_backend_features .contains(&BackendFeature::ExternalKernelImage) ); + // The VF is declared as a passthrough device so the driver promotes + // the launch to QEMU and the non-GPU guard sees a concrete device. 
+ assert!( + plan.required_backend_features + .contains(&BackendFeature::PciPassthrough) + ); + assert_eq!(passthrough_bdfs(&plan), vec!["0000:03:00.2"]); } #[tokio::test] diff --git a/crates/openshell-driver-bluefield/bf-vm/src/kernel.rs b/crates/openshell-driver-bluefield/bf-vm/src/kernel.rs index d42d56d13..d8709c955 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/kernel.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/kernel.rs @@ -229,6 +229,7 @@ mod tests { gateway_port: None, guest_init_dropins: Vec::new(), env: Vec::new(), + resources: Vec::new(), } } diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index 30fecd8be..1bfcd35d3 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -5,8 +5,8 @@ use crate::gpu::{ GpuInventory, SubnetAllocator, allocate_vsock_cid, mac_from_sandbox_id, tap_device_name, }; use crate::lifecycle::{ - BackendFeature, GuestInitDropin, LaunchAbortReason, LaunchPlan, LifecycleExtensionRegistry, - RestoreContext, extension_state_dir, + BackendFeature, GuestInitDropin, GuestResource, LaunchAbortReason, LaunchPlan, + LifecycleExtensionRegistry, PciPassthroughDevice, RestoreContext, extension_state_dir, }; use crate::rootfs::{ clone_or_copy_sparse_file, create_ext4_image_from_dir_with_size, create_rootfs_image_from_dir, @@ -847,6 +847,9 @@ impl VmDriver { if let Some(bdf) = plan.gpu_bdf.as_deref() { command.arg("--vm-gpu-bdf").arg(bdf); } + for device in pci_passthrough_devices(&plan) { + command.arg("--vm-pci-passthrough").arg(&device.host_bdf); + } if let Some(tap) = plan.tap_device.as_deref() { command.arg("--vm-tap-device").arg(tap); } @@ -1387,16 +1390,17 @@ impl VmDriver { #[allow(clippy::result_large_err)] fn validate_launch_plan_backend(is_gpu: bool, plan: &LaunchPlan) -> Result<(), Status> { - // NOTE: this guard exists because the non-GPU QEMU launch path - // (PCI device transport, VFIO root port wiring) has not landed - 
// yet. Until then, even though the resolver will happily promote - // a plan to QEMU when an extension requires `PciPassthrough` or - // `ExternalKernelImage`, the launch itself is blocked here so we - // don't spawn a QEMU instance with no concrete device backing. - // Remove this guard once the non-GPU QEMU launch path supports - // emitting `pcie-root-port` + `vfio-pci` for arbitrary device - // descriptors. - if plan.backend == VmBackend::Qemu && !is_gpu { + // A non-GPU sandbox may only run on QEMU when it has a concrete PCI + // device to back the launch: the runtime emits `pcie-root-port` + + // `vfio-pci` per passthrough device, so an empty plan would spawn a + // QEMU instance with nothing attached. A GPU sandbox is always backed + // by its GPU device; other sandboxes must carry at least one PCI + // passthrough resource (e.g. a BlueField VF NIC). + if plan.backend == VmBackend::Qemu + && !is_gpu + && plan.gpu_bdf.is_none() + && pci_passthrough_devices(plan).next().is_none() + { let offending_feature = plan .required_backend_features .iter() @@ -1406,7 +1410,7 @@ impl VmDriver { }); return Err(Status::failed_precondition(format!( "vm lifecycle extension required '{offending_feature}', which resolves to the QEMU backend, \ - but non-GPU QEMU launch is not yet supported (pending PCI device transport)" + but no concrete PCI passthrough device was provided to back a non-GPU QEMU launch" ))); } if plan.backend != VmBackend::Qemu && is_gpu { @@ -1497,6 +1501,7 @@ impl VmDriver { gateway_port: None, guest_init_dropins: Vec::new(), env: Vec::new(), + resources: Vec::new(), }); } @@ -1538,6 +1543,7 @@ impl VmDriver { gateway_port, guest_init_dropins: Vec::new(), env: Vec::new(), + resources: Vec::new(), }) } @@ -3968,6 +3974,23 @@ fn gateway_port_from_endpoint(endpoint: &str) -> Option { Url::parse(endpoint).ok().and_then(|url| url.port()) } +/// Host PCI passthrough devices declared on a plan, in declaration order. 
+/// +/// PCI-specific knowledge lives in the VM-driver layer (which renders the QEMU +/// command line) rather than on the generic [`LaunchPlan`]: the plan only +/// carries an opaque `resources` set. The exhaustive match means a new +/// [`GuestResource`] variant forces this projection to be revisited. +fn pci_passthrough_devices(plan: &LaunchPlan) -> impl Iterator { + // `filter_map` is deliberate: it collapses to a `map` only while + // `PciPassthrough` is the sole variant. Once other resource kinds exist it + // must skip them, and the exhaustive match makes the compiler flag this + // site when a variant is added. + #[allow(clippy::unnecessary_filter_map)] + plan.resources.iter().filter_map(|resource| match resource { + GuestResource::PciPassthrough(device) => Some(device), + }) +} + fn has_complete_qemu_network(plan: &LaunchPlan) -> bool { plan.tap_device.is_some() && plan.guest_ip.is_some() @@ -6573,8 +6596,8 @@ mod tests { } use crate::lifecycle::{ - BackendFeature, LaunchPlan, LifecycleError, LifecycleExtension, LifecycleExtensionRegistry, - LifecycleResult, + BackendFeature, GuestResource, LaunchPlan, LifecycleError, LifecycleExtension, + LifecycleExtensionRegistry, LifecycleResult, PciPassthroughDevice, }; use crate::runtime::VmBackend; @@ -6704,6 +6727,42 @@ mod tests { assert!(plan.guest_mac.is_some()); } + #[test] + fn validate_allows_non_gpu_qemu_only_with_concrete_passthrough_device() { + let mut plan = LaunchPlan { + backend: VmBackend::Qemu, + vcpus: 2, + mem_mib: 2048, + required_backends: Vec::new(), + required_backend_features: vec![BackendFeature::PciPassthrough], + kernel_profile: None, + kernel_image: None, + gpu_bdf: None, + tap_device: Some("vmtap-x".to_string()), + guest_ip: Some("10.0.0.2".to_string()), + host_ip: Some("10.0.0.1".to_string()), + vsock_cid: Some(7), + guest_mac: Some("02:00:00:00:00:01".to_string()), + gateway_port: Some(8080), + guest_init_dropins: Vec::new(), + env: Vec::new(), + resources: Vec::new(), + }; + + // 
Non-GPU QEMU with no concrete device backing is still rejected. + let err = VmDriver::validate_launch_plan_backend(false, &plan) + .expect_err("non-GPU QEMU without a device should be rejected"); + assert_eq!(err.code(), Code::FailedPrecondition); + assert!(err.message().contains("no concrete PCI passthrough device")); + + // With a VF passthrough device declared, the launch is permitted. + plan.resources = vec![GuestResource::PciPassthrough(PciPassthroughDevice::new( + "0000:03:00.2", + ))]; + VmDriver::validate_launch_plan_backend(false, &plan) + .expect("non-GPU QEMU with a concrete VF device should be allowed"); + } + #[test] fn launch_plan_rejects_external_kernel_on_unsupported_backend() { let mut plan = LaunchPlan { @@ -6723,6 +6782,7 @@ mod tests { gateway_port: None, guest_init_dropins: Vec::new(), env: Vec::new(), + resources: Vec::new(), }; let err = VmDriver::validate_launch_plan_backend(false, &plan) @@ -6866,6 +6926,7 @@ mod tests { gateway_port: None, guest_init_dropins: Vec::new(), env: Vec::new(), + resources: Vec::new(), }; let err = extensions .before_launch(&sandbox, Path::new("/tmp/state"), &mut libkrun_plan) @@ -6890,6 +6951,7 @@ mod tests { gateway_port: Some(8080), guest_init_dropins: Vec::new(), env: Vec::new(), + resources: Vec::new(), }; extensions .before_launch(&sandbox, Path::new("/tmp/state"), &mut qemu_plan) @@ -6923,6 +6985,7 @@ mod tests { gateway_port: Some(8080), guest_init_dropins: Vec::new(), env: Vec::new(), + resources: Vec::new(), }; let err = extensions .before_launch(&sandbox, Path::new("/tmp/state"), &mut plan) diff --git a/crates/openshell-driver-vm/src/lib.rs b/crates/openshell-driver-vm/src/lib.rs index 88e2c3b20..00d4e88e5 100644 --- a/crates/openshell-driver-vm/src/lib.rs +++ b/crates/openshell-driver-vm/src/lib.rs @@ -13,9 +13,9 @@ mod runtime; pub use driver::{VmDriver, VmDriverConfig}; pub use lifecycle::{ - BackendFeature, ExtensionCapabilities, ExtensionDescriptor, GuestInitDropin, LaunchAbortReason, - 
LaunchPlan, LifecycleError, LifecycleExtension, LifecycleExtensionRegistry, LifecycleResult, - RestoreContext, + BackendFeature, ExtensionCapabilities, ExtensionDescriptor, GuestInitDropin, GuestResource, + LaunchAbortReason, LaunchPlan, LifecycleError, LifecycleExtension, LifecycleExtensionRegistry, + LifecycleResult, PciPassthroughDevice, RestoreContext, }; pub use runtime::{ VM_RUNTIME_DIR_ENV, VmBackend, VmLaunchConfig, cleanup_stale_tap_interfaces, diff --git a/crates/openshell-driver-vm/src/lifecycle.rs b/crates/openshell-driver-vm/src/lifecycle.rs index c042715e5..03004078b 100644 --- a/crates/openshell-driver-vm/src/lifecycle.rs +++ b/crates/openshell-driver-vm/src/lifecycle.rs @@ -214,6 +214,41 @@ impl GuestInitDropin { } } +/// A host PCI device to pass through to the guest via VFIO. +/// +/// Generic and vendor-neutral: any lifecycle extension can add devices (a VF +/// NIC, an accelerator, ...). The VM driver renders each as its own +/// `pcie-root-port` + `vfio-pci` and forwards it to the launcher, so a sandbox +/// can carry several passthrough devices at once (e.g. a GPU in +/// [`LaunchPlan::gpu_bdf`] plus a VF NIC here). +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PciPassthroughDevice { + /// Host PCI address in `DDDD:BB:DD.F` form. + pub host_bdf: String, +} + +impl PciPassthroughDevice { + #[must_use] + pub fn new(host_bdf: impl Into) -> Self { + Self { + host_bdf: host_bdf.into(), + } + } +} + +/// A host resource attached to the guest by a lifecycle extension. +/// +/// Stored as an open, additive set in [`LaunchPlan::resources`] so new resource +/// kinds (e.g. volume mounts) can be introduced as new variants without +/// changing the [`LaunchPlan`] shape that other crates construct. Downstream +/// consumers (backend resolution, QEMU rendering, cleanup) match on the kind, +/// so adding a variant forces those sites to handle it. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum GuestResource { + /// A host PCI device passed through to the guest via VFIO. + PciPassthrough(PciPassthroughDevice), +} + #[derive(Debug, Clone)] pub struct LaunchPlan { pub backend: VmBackend, @@ -232,9 +267,18 @@ pub struct LaunchPlan { pub gateway_port: Option, pub guest_init_dropins: Vec, pub env: Vec, + pub resources: Vec, } impl LaunchPlan { + /// Add a guest resource, de-duplicating identical entries so repeated + /// extension calls (configure + `before_launch`) stay idempotent. + pub fn add_resource(&mut self, resource: GuestResource) { + if !self.resources.contains(&resource) { + self.resources.push(resource); + } + } + pub fn require_backend(&mut self, backend: VmBackend) { if !self.required_backends.contains(&backend) { self.required_backends.push(backend); @@ -939,6 +983,7 @@ mod tests { gateway_port: None, guest_init_dropins: Vec::new(), env: Vec::new(), + resources: Vec::new(), } } diff --git a/crates/openshell-driver-vm/src/main.rs b/crates/openshell-driver-vm/src/main.rs index 57db7b64b..7b3797bf0 100644 --- a/crates/openshell-driver-vm/src/main.rs +++ b/crates/openshell-driver-vm/src/main.rs @@ -138,6 +138,9 @@ struct Args { #[arg(long, hide = true)] vm_gpu_bdf: Option, + #[arg(long = "vm-pci-passthrough", hide = true)] + vm_pci_passthrough: Vec, + #[arg(long, hide = true)] vm_tap_device: Option, @@ -496,6 +499,7 @@ fn build_vm_launch_config(args: &Args) -> std::result::Result, + /// Extra host PCI devices (BDFs) to pass through beyond the optional GPU. 
+ pub pci_passthrough: Vec, pub tap_device: Option, pub guest_ip: Option, pub host_ip: Option, @@ -75,10 +77,6 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { } fn run_qemu_vm(config: &VmLaunchConfig) -> Result<(), String> { - let gpu_bdf = config - .gpu_bdf - .as_deref() - .ok_or("gpu_bdf is required for QEMU backend")?; let tap_device = config .tap_device .as_deref() @@ -175,10 +173,7 @@ fn run_qemu_vm(config: &VmLaunchConfig) -> Result<(), String> { .arg(format!( "vhost-vsock-pci,guest-cid={vsock_cid},bus=vsock_root" )) - .arg("-device") - .arg("pcie-root-port,id=gpu_root,slot=2") - .arg("-device") - .arg(format!("vfio-pci,host={gpu_bdf},bus=gpu_root")) + .args(qemu_pci_passthrough_args(config)) .arg("-serial") .arg(format!("file:{}", config.console_output.display())); @@ -220,6 +215,29 @@ fn run_qemu_vm(config: &VmLaunchConfig) -> Result<(), String> { } } +/// Build the `-device` arguments for VFIO passthrough: the optional GPU on its +/// established root port (slot 2), then any additional devices (e.g. VF NICs). +/// Slots 1-3 are reserved for vsock/gpu/net, so extra devices start at slot 4, +/// each on its own `pcie-root-port`. 
+fn qemu_pci_passthrough_args(config: &VmLaunchConfig) -> Vec { + let mut args = Vec::new(); + if let Some(gpu_bdf) = config.gpu_bdf.as_deref() { + args.push("-device".to_string()); + args.push("pcie-root-port,id=gpu_root,slot=2".to_string()); + args.push("-device".to_string()); + args.push(format!("vfio-pci,host={gpu_bdf},bus=gpu_root")); + } + for (index, host_bdf) in config.pci_passthrough.iter().enumerate() { + let slot = 4 + index; + let root_id = format!("pt{index}_root"); + args.push("-device".to_string()); + args.push(format!("pcie-root-port,id={root_id},slot={slot}")); + args.push("-device".to_string()); + args.push(format!("vfio-pci,host={host_bdf},bus={root_id}")); + } + args +} + fn qemu_disk_args(config: &VmLaunchConfig) -> Vec { let mut args = vec![ "-drive".to_string(), @@ -1418,6 +1436,7 @@ mod tests { console_output: PathBuf::from("/console.log"), backend: VmBackend::Qemu, gpu_bdf: Some("0000:01:00.0".to_string()), + pci_passthrough: Vec::new(), tap_device: Some("vmtap-test".to_string()), guest_ip: Some("10.0.128.2".to_string()), host_ip: Some("10.0.128.1".to_string()), @@ -1453,6 +1472,34 @@ mod tests { assert!(!cmdline.contains("GPU_ENABLED=")); } + #[test] + fn qemu_passthrough_args_emit_gpu_and_extra_vfio_devices() { + let mut config = qemu_config(); + config.gpu_bdf = Some("0000:01:00.0".to_string()); + config.pci_passthrough = vec!["0000:03:00.2".to_string(), "0000:03:00.3".to_string()]; + + let args = qemu_pci_passthrough_args(&config); + + assert!(args.contains(&"pcie-root-port,id=gpu_root,slot=2".to_string())); + assert!(args.contains(&"vfio-pci,host=0000:01:00.0,bus=gpu_root".to_string())); + assert!(args.contains(&"pcie-root-port,id=pt0_root,slot=4".to_string())); + assert!(args.contains(&"vfio-pci,host=0000:03:00.2,bus=pt0_root".to_string())); + assert!(args.contains(&"pcie-root-port,id=pt1_root,slot=5".to_string())); + assert!(args.contains(&"vfio-pci,host=0000:03:00.3,bus=pt1_root".to_string())); + } + + #[test] + fn 
qemu_passthrough_args_support_vf_only_without_gpu() { + let mut config = qemu_config(); + config.gpu_bdf = None; + config.pci_passthrough = vec!["0000:03:00.2".to_string()]; + + let args = qemu_pci_passthrough_args(&config); + + assert!(!args.iter().any(|a| a.contains("gpu_root"))); + assert!(args.contains(&"vfio-pci,host=0000:03:00.2,bus=pt0_root".to_string())); + } + #[test] fn qemu_disk_args_attach_base_readonly_and_overlay_readwrite() { let args = qemu_disk_args(&qemu_config()); From dff3044daec674f24a06c90106d710d24598c53f Mon Sep 17 00:00:00 2001 From: Patrick Riel Date: Fri, 12 Jun 2026 21:28:25 +0000 Subject: [PATCH 06/14] fix(bluefield): enable extension from driver binary Signed-off-by: Patrick Riel --- .../bf-driver/src/main.rs | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/crates/openshell-driver-bluefield/bf-driver/src/main.rs b/crates/openshell-driver-bluefield/bf-driver/src/main.rs index 353a26bd3..9e0d2c61c 100644 --- a/crates/openshell-driver-bluefield/bf-driver/src/main.rs +++ b/crates/openshell-driver-bluefield/bf-driver/src/main.rs @@ -207,10 +207,8 @@ async fn main() -> Result<()> { ); } - let bluefield_config = args - .bluefield - .to_driver_config(args.openshell_endpoint.clone()) - .map_err(|err| miette::miette!("{err}"))?; + let bluefield_config = + bluefield_config_from_args(&args).map_err(|err| miette::miette!("{err}"))?; // This stage runs the workload-side VM driver that binds a VF per // sandbox. The leader/control-plane role is layered on in a later stage. @@ -253,6 +251,14 @@ async fn build_vm_driver(args: &Args, bluefield: BluefieldDriverConfig) -> Resul .map_err(|err| miette!("{err}")) } +fn bluefield_config_from_args(args: &Args) -> std::result::Result { + let mut config = args + .bluefield + .to_driver_config(args.openshell_endpoint.clone())?; + config.enabled = true; + Ok(config) +} + /// Serve any `ComputeDriver` over the selected listener. 
Shared by every role /// so the leader and the workload driver are served identically. async fn serve_compute_driver(driver: T, listen_mode: ComputeDriverListenMode) -> Result<()> @@ -759,4 +765,24 @@ mod tests { } ); } + + #[test] + fn bluefield_binary_enables_bluefield_extension_without_flag() { + let args = Args::parse_from([ + "openshell-driver-bluefield", + "--allow-unauthenticated-tcp", + "--bind-address", + "127.0.0.1:50061", + "--openshell-endpoint", + "http://127.0.0.1:8080", + ]); + + let config = super::bluefield_config_from_args(&args).unwrap(); + + assert!(config.enabled); + assert_eq!( + config.openshell_endpoint.as_deref(), + Some("http://127.0.0.1:8080") + ); + } } From f1a9583f5dbf9abf9915eb09427d3518859c9939 Mon Sep 17 00:00:00 2001 From: Patrick Riel Date: Fri, 12 Jun 2026 21:30:01 +0000 Subject: [PATCH 07/14] feat(bluefield): resolve host PF automatically Signed-off-by: Patrick Riel --- .../bf-vm/src/extension.rs | 6 +- .../bf-vm/src/host_pf.rs | 216 ++++++++++++++++++ .../bf-vm/src/lib.rs | 1 + .../bf-vm/src/slots.rs | 10 +- 4 files changed, 224 insertions(+), 9 deletions(-) create mode 100644 crates/openshell-driver-bluefield/bf-vm/src/host_pf.rs diff --git a/crates/openshell-driver-bluefield/bf-vm/src/extension.rs b/crates/openshell-driver-bluefield/bf-vm/src/extension.rs index a3c14383c..a8a87117b 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/extension.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/extension.rs @@ -27,7 +27,7 @@ use bf_inventory::{VfPool, VfSlot}; use crate::config::{bluefield_kernel_from_config, guest_egress_from_config, reject_deferred_proxy}; use crate::guest_egress::{self, GuestEgress}; use crate::kernel::BluefieldKernel; -use crate::slots::{HostSlotConfig, prepare_host_slots, require_host_pf}; +use crate::slots::{HostSlotConfig, prepare_host_slots, resolve_host_pf_bdf}; use crate::state::{self, AttachmentRecord, EXTENSION_NAME}; use crate::vf::{HostReadiness, SysfsHostReadiness, SysfsVfBinder, 
VfBinder}; @@ -80,9 +80,9 @@ impl BluefieldExtension { } reject_deferred_proxy(config)?; - let host_pf = require_host_pf(config)?; + let host_pf = resolve_host_pf_bdf(config)?; let sysfs = SysfsRoot::system(); - let slots = prepare_host_slots(HostSlotConfig::from(config), &sysfs, host_pf)?; + let slots = prepare_host_slots(HostSlotConfig::from(config), &sysfs, &host_pf)?; let extension = Self::new(VfPool::new(slots)) .with_host_readiness(Arc::new(SysfsHostReadiness::new(sysfs.clone()))) diff --git a/crates/openshell-driver-bluefield/bf-vm/src/host_pf.rs b/crates/openshell-driver-bluefield/bf-vm/src/host_pf.rs new file mode 100644 index 000000000..6ba0bb90b --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/src/host_pf.rs @@ -0,0 +1,216 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::path::{Path, PathBuf}; + +const NVIDIA_VENDOR: &str = "0x15b3"; +const PCI_NETWORK_CLASS_PREFIX: &str = "0x02"; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum HostPfSource { + ConfiguredBdf, + ConfiguredNetdev, + AutoDiscovered, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct ResolvedHostPf { + pub(crate) bdf: String, + pub(crate) source: HostPfSource, +} + +pub(crate) fn resolve_host_pf( + configured: Option<&str>, + sysfs_root: &Path, +) -> Result { + if let Some(value) = configured.map(str::trim).filter(|value| !value.is_empty()) { + return resolve_configured_host_pf(value, sysfs_root); + } + auto_discover_host_pf(sysfs_root) +} + +fn resolve_configured_host_pf(value: &str, sysfs_root: &Path) -> Result { + let pci_path = sysfs_root.join("bus/pci/devices").join(value); + if pci_path.is_dir() { + return Ok(ResolvedHostPf { + bdf: value.to_string(), + source: HostPfSource::ConfiguredBdf, + }); + } + + let netdev_device = sysfs_root.join("class/net").join(value).join("device"); + let target = 
std::fs::read_link(&netdev_device).map_err(|err| { + format!( + "BlueField host PF {value:?} is neither a PCI BDF under {} nor a netdev under {}: {err}", + sysfs_root.join("bus/pci/devices").display(), + sysfs_root.join("class/net").display() + ) + })?; + let bdf = target + .file_name() + .and_then(|name| name.to_str()) + .ok_or_else(|| format!("BlueField netdev {value:?} device link has no PCI BDF"))? + .to_string(); + + Ok(ResolvedHostPf { + bdf, + source: HostPfSource::ConfiguredNetdev, + }) +} + +fn auto_discover_host_pf(sysfs_root: &Path) -> Result { + let mut candidates = discover_bluefield_pf_candidates(sysfs_root)?; + candidates.sort(); + match candidates.len() { + 0 => Err( + "no BlueField-capable PF with configured SR-IOV VFs found; set OPENSHELL_BLUEFIELD_HOST_PF to the PF netdev or PCI BDF" + .to_string(), + ), + 1 => Ok(ResolvedHostPf { + bdf: candidates.remove(0), + source: HostPfSource::AutoDiscovered, + }), + _ => Err(format!( + "multiple BlueField-capable PFs found: {}; set OPENSHELL_BLUEFIELD_HOST_PF to one PF netdev or PCI BDF", + candidates.join(", ") + )), + } +} + +fn discover_bluefield_pf_candidates(sysfs_root: &Path) -> Result, String> { + let devices = sysfs_root.join("bus/pci/devices"); + let entries = std::fs::read_dir(&devices) + .map_err(|err| format!("read PCI devices from {}: {err}", devices.display()))?; + let mut candidates = Vec::new(); + for entry in entries.filter_map(Result::ok) { + let path = entry.path(); + if !is_bluefield_network_pf(&path) { + continue; + } + let bdf = entry.file_name().to_string_lossy().into_owned(); + candidates.push(bdf); + } + Ok(candidates) +} + +fn is_bluefield_network_pf(path: &Path) -> bool { + let vendor = read_trimmed(path.join("vendor")); + let class = read_trimmed(path.join("class")); + let total_vfs = read_trimmed(path.join("sriov_totalvfs")) + .and_then(|value| value.parse::().ok()) + .unwrap_or(0); + vendor.as_deref() == Some(NVIDIA_VENDOR) + && class + .as_deref() + .is_some_and(|value| 
value.starts_with(PCI_NETWORK_CLASS_PREFIX)) + && total_vfs > 0 + && has_any_virtfn(path) +} + +fn has_any_virtfn(path: &Path) -> bool { + (0..256).any(|index| path.join(format!("virtfn{index}")).symlink_metadata().is_ok()) +} + +fn read_trimmed(path: PathBuf) -> Option { + std::fs::read_to_string(path) + .ok() + .map(|value| value.trim().to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::os::unix::fs::symlink; + use std::path::PathBuf; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn temp_sysfs_root(name: &str) -> PathBuf { + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!( + "openshell-bf-{name}-{}-{nonce}", + std::process::id() + )) + } + + fn write_device(root: &std::path::Path, bdf: &str, vendor: &str, class: &str, total_vfs: &str) { + let dir = root.join("bus/pci/devices").join(bdf); + std::fs::create_dir_all(&dir).unwrap(); + std::fs::write(dir.join("vendor"), vendor).unwrap(); + std::fs::write(dir.join("class"), class).unwrap(); + std::fs::write(dir.join("sriov_totalvfs"), total_vfs).unwrap(); + } + + #[test] + fn resolves_configured_bdf_to_pf_bdf() { + let root = temp_sysfs_root("bdf"); + write_device(&root, "0000:b1:00.0", "0x15b3\n", "0x020000\n", "30\n"); + + let resolved = resolve_host_pf(Some("0000:b1:00.0"), &root).unwrap(); + + assert_eq!(resolved.bdf, "0000:b1:00.0"); + assert_eq!(resolved.source, HostPfSource::ConfiguredBdf); + std::fs::remove_dir_all(root).unwrap(); + } + + #[test] + fn resolves_configured_netdev_to_pf_bdf() { + let root = temp_sysfs_root("netdev"); + write_device(&root, "0000:b1:00.0", "0x15b3\n", "0x020000\n", "30\n"); + std::fs::create_dir_all(root.join("class/net/enp177s0f0np0")).unwrap(); + symlink( + "../../../bus/pci/devices/0000:b1:00.0", + root.join("class/net/enp177s0f0np0/device"), + ) + .unwrap(); + + let resolved = resolve_host_pf(Some("enp177s0f0np0"), &root).unwrap(); + + assert_eq!(resolved.bdf, "0000:b1:00.0"); + 
assert_eq!(resolved.source, HostPfSource::ConfiguredNetdev); + std::fs::remove_dir_all(root).unwrap(); + } + + #[test] + fn auto_selects_single_bluefield_pf_with_vfs() { + let root = temp_sysfs_root("auto"); + write_device(&root, "0000:b1:00.0", "0x15b3\n", "0x020000\n", "30\n"); + std::fs::create_dir_all(root.join("bus/pci/devices/0000:b1:04.1")).unwrap(); + symlink( + "../0000:b1:04.1", + root.join("bus/pci/devices/0000:b1:00.0/virtfn29"), + ) + .unwrap(); + + let resolved = resolve_host_pf(None, &root).unwrap(); + + assert_eq!(resolved.bdf, "0000:b1:00.0"); + assert_eq!(resolved.source, HostPfSource::AutoDiscovered); + std::fs::remove_dir_all(root).unwrap(); + } + + #[test] + fn rejects_multiple_auto_candidates_with_specific_message() { + let root = temp_sysfs_root("multiple"); + for (bdf, vf) in [ + ("0000:b1:00.0", "0000:b1:04.1"), + ("0000:b2:00.0", "0000:b2:04.1"), + ] { + write_device(&root, bdf, "0x15b3\n", "0x020000\n", "8\n"); + std::fs::create_dir_all(root.join("bus/pci/devices").join(vf)).unwrap(); + symlink( + format!("../{vf}"), + root.join("bus/pci/devices").join(bdf).join("virtfn0"), + ) + .unwrap(); + } + + let err = resolve_host_pf(None, &root).unwrap_err(); + + assert!(err.contains("multiple BlueField-capable PFs found")); + assert!(err.contains("OPENSHELL_BLUEFIELD_HOST_PF")); + std::fs::remove_dir_all(root).unwrap(); + } +} diff --git a/crates/openshell-driver-bluefield/bf-vm/src/lib.rs b/crates/openshell-driver-bluefield/bf-vm/src/lib.rs index c1f8a6b34..c46c6d9ec 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/lib.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/lib.rs @@ -8,6 +8,7 @@ mod config; pub mod extension; pub mod extensions; pub mod guest_egress; +mod host_pf; pub mod kernel; mod slots; mod state; diff --git a/crates/openshell-driver-bluefield/bf-vm/src/slots.rs b/crates/openshell-driver-bluefield/bf-vm/src/slots.rs index 032666f2b..952973ed6 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/slots.rs +++ 
b/crates/openshell-driver-bluefield/bf-vm/src/slots.rs @@ -9,6 +9,7 @@ use bf_inventory::{SysfsVfInventory, VfInventory, VfSlot}; use openshell_vfio::SysfsRoot; use crate::config::BluefieldDriverConfig; +use crate::host_pf::resolve_host_pf; pub(crate) struct HostSlotConfig<'a> { reserved_vf_indexes: &'a [u32], @@ -26,12 +27,9 @@ impl<'a> From<&'a BluefieldDriverConfig> for HostSlotConfig<'a> { } } -pub(crate) fn require_host_pf(config: &BluefieldDriverConfig) -> Result<&str, String> { - config - .host_pf - .as_deref() - .filter(|value| !value.trim().is_empty()) - .ok_or_else(|| "BlueField is enabled but no host PF was configured".to_string()) +pub(crate) fn resolve_host_pf_bdf(config: &BluefieldDriverConfig) -> Result { + let resolved = resolve_host_pf(config.host_pf.as_deref(), std::path::Path::new("/sys"))?; + Ok(resolved.bdf) } /// Discover the local VF slots for `host_pf` and apply the operator's From 817d16c9e9618c6209ca7645e99231618d24127b Mon Sep 17 00:00:00 2001 From: Patrick Riel Date: Fri, 12 Jun 2026 21:36:41 +0000 Subject: [PATCH 08/14] feat(bluefield): resolve qemu kernel from runtime assets Signed-off-by: Patrick Riel --- .../bf-vm/src/extension.rs | 51 ++++++-- .../bf-vm/src/lib.rs | 1 + .../bf-vm/src/qemu_kernel_resolver.rs | 119 ++++++++++++++++++ 3 files changed, 159 insertions(+), 12 deletions(-) create mode 100644 crates/openshell-driver-bluefield/bf-vm/src/qemu_kernel_resolver.rs diff --git a/crates/openshell-driver-bluefield/bf-vm/src/extension.rs b/crates/openshell-driver-bluefield/bf-vm/src/extension.rs index a8a87117b..a990aabb7 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/extension.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/extension.rs @@ -24,9 +24,11 @@ use crate::lifecycle::{ use bf_inventory::{VfPool, VfSlot}; -use crate::config::{bluefield_kernel_from_config, guest_egress_from_config, reject_deferred_proxy}; +use crate::config::{ + bluefield_kernel_from_config, guest_egress_from_config, reject_deferred_proxy, +}; 
use crate::guest_egress::{self, GuestEgress}; -use crate::kernel::BluefieldKernel; +use crate::kernel::{BluefieldKernel, MELLANOX_VF_MODULES}; use crate::slots::{HostSlotConfig, prepare_host_slots, resolve_host_pf_bdf}; use crate::state::{self, AttachmentRecord, EXTENSION_NAME}; use crate::vf::{HostReadiness, SysfsHostReadiness, SysfsVfBinder, VfBinder}; @@ -43,6 +45,25 @@ fn deterministic_vf_mac(sandbox_id: &str) -> String { ) } +fn qemu_kernel_from_config(config: &BluefieldDriverConfig) -> Result { + let image = crate::qemu_kernel_resolver::resolve_qemu_kernel_image( + config.kernel_image.clone(), + &crate::qemu_kernel_resolver::default_runtime_roots(), + )?; + let mut kernel = bluefield_kernel_from_config(config) + .unwrap_or_else(|| BluefieldKernel::from_image(image.clone())); + if kernel.image.is_none() { + kernel.image = Some(image); + } + if kernel.required_modules.is_empty() { + kernel.required_modules = MELLANOX_VF_MODULES + .iter() + .map(|module| (*module).to_string()) + .collect(); + } + Ok(kernel) +} + /// BlueField lifecycle extension: claims a VF, binds it for passthrough, and /// wires optional guest egress into the launch plan. #[derive(Debug)] @@ -83,8 +104,10 @@ impl BluefieldExtension { let host_pf = resolve_host_pf_bdf(config)?; let sysfs = SysfsRoot::system(); let slots = prepare_host_slots(HostSlotConfig::from(config), &sysfs, &host_pf)?; + let kernel = qemu_kernel_from_config(config)?; let extension = Self::new(VfPool::new(slots)) + .with_kernel(kernel) .with_host_readiness(Arc::new(SysfsHostReadiness::new(sysfs.clone()))) .with_vf_binder(Arc::new(SysfsVfBinder::new(sysfs))); @@ -100,9 +123,6 @@ impl BluefieldExtension { } fn apply_runtime_options(mut self, config: &BluefieldDriverConfig) -> Result { - if let Some(kernel) = bluefield_kernel_from_config(config) { - self = self.with_kernel(kernel); - } if let Some(egress) = guest_egress_from_config(config)? 
{ self = self.with_guest_egress(egress); } @@ -182,8 +202,10 @@ impl LifecycleExtension for BluefieldExtension { fn descriptor(&self) -> ExtensionDescriptor { let mut descriptor = ExtensionDescriptor::new(EXTENSION_NAME); - descriptor.required_backend_features = - vec![BackendFeature::PciPassthrough, BackendFeature::GuestInitDropins]; + descriptor.required_backend_features = vec![ + BackendFeature::PciPassthrough, + BackendFeature::GuestInitDropins, + ]; descriptor } @@ -436,7 +458,7 @@ mod tests { #[tokio::test] async fn before_launch_claims_slot_records_bind_state_and_injects_egress_env() { let extension = ext(VfPool::new([ - VfSlot::new("vf0", "0000:03:00.2").with_representor("pf0vf0"), + VfSlot::new("vf0", "0000:03:00.2").with_representor("pf0vf0") ])) .with_guest_egress(GuestEgress { address_cidr: "10.0.120.10/22".to_string(), @@ -504,7 +526,10 @@ mod tests { .before_launch(&sandbox("sb-del"), &state, &mut plan) .await .unwrap(); - extension.after_delete(&sandbox("sb-del"), &state).await.unwrap(); + extension + .after_delete(&sandbox("sb-del"), &state) + .await + .unwrap(); assert!(extension.take_attachment("sb-del").is_none()); assert!(state::load_bind_state("sb-del", &state).is_err()); @@ -514,7 +539,9 @@ mod tests { #[tokio::test] async fn configure_launch_selects_kernel_and_declares_vf_passthrough() { let extension = BluefieldExtension::new(VfPool::new([VfSlot::new("vf0", "0000:03:00.2")])) - .with_kernel(BluefieldKernel::from_image("/opt/openshell/kernels/bf-vmlinux")); + .with_kernel(BluefieldKernel::from_image( + "/opt/openshell/kernels/bf-vmlinux", + )); let mut plan = sample_plan(); extension @@ -543,7 +570,7 @@ mod tests { async fn before_restore_reclaims_and_records() { let state = state_dir("restore"); let initial = ext(VfPool::new([ - VfSlot::new("vf0", "0000:03:00.2").with_representor("pf0vf0"), + VfSlot::new("vf0", "0000:03:00.2").with_representor("pf0vf0") ])); let mut plan = sample_plan(); initial @@ -552,7 +579,7 @@ mod tests { .unwrap(); let 
extension = ext(VfPool::new([ - VfSlot::new("vf0", "0000:03:00.2").with_representor("pf0vf0"), + VfSlot::new("vf0", "0000:03:00.2").with_representor("pf0vf0") ])); let ctx = RestoreContext { sandbox: sandbox("sb-restore"), diff --git a/crates/openshell-driver-bluefield/bf-vm/src/lib.rs b/crates/openshell-driver-bluefield/bf-vm/src/lib.rs index c46c6d9ec..7abfb1095 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/lib.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/lib.rs @@ -10,6 +10,7 @@ pub mod extensions; pub mod guest_egress; mod host_pf; pub mod kernel; +mod qemu_kernel_resolver; mod slots; mod state; pub mod vf; diff --git a/crates/openshell-driver-bluefield/bf-vm/src/qemu_kernel_resolver.rs b/crates/openshell-driver-bluefield/bf-vm/src/qemu_kernel_resolver.rs new file mode 100644 index 000000000..b6a44e534 --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/src/qemu_kernel_resolver.rs @@ -0,0 +1,119 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! VM/QEMU-only guest kernel discovery for BlueField VF passthrough. + +use std::path::{Path, PathBuf}; + +pub(crate) fn resolve_qemu_kernel_image( + explicit: Option, + runtime_roots: &[PathBuf], +) -> Result { + if let Some(path) = explicit { + if path.is_file() { + return Ok(path); + } + return Err(format!( + "configured BlueField QEMU kernel image does not exist: {}", + path.display() + )); + } + + for root in runtime_roots { + let candidate = root.join("vmlinux"); + if candidate.is_file() { + return Ok(candidate); + } + } + + Err(format!( + "BlueField QEMU kernel image not found; searched: {}. Set OPENSHELL_BLUEFIELD_KERNEL_IMAGE or place vmlinux in the OpenShell vm-runtime directory. 
Docker and Kubernetes BlueField runtimes do not use this QEMU kernel path.", + runtime_roots + .iter() + .map(|path| path.display().to_string()) + .collect::>() + .join(", ") + )) +} + +pub(crate) fn default_runtime_roots() -> Vec { + let mut roots = Vec::new(); + if let Some(path) = std::env::var_os(crate::VM_RUNTIME_DIR_ENV) { + push_unique(&mut roots, PathBuf::from(path)); + } + if let Ok(exe) = std::env::current_exe() + && let Some(bin_dir) = exe.parent() + { + push_unique(&mut roots, bin_dir.join("vm-runtime")); + push_unique(&mut roots, bin_dir.join("../vm-runtime")); + push_unique(&mut roots, bin_dir.join("../../vm-runtime")); + } + push_unique( + &mut roots, + Path::new("/opt/openshell/vm-runtime").to_path_buf(), + ); + roots +} + +fn push_unique(roots: &mut Vec, path: PathBuf) { + if !roots.iter().any(|existing| existing == &path) { + roots.push(path); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::PathBuf; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn temp_root(name: &str) -> PathBuf { + let nonce = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!( + "openshell-bf-qemu-kernel-{name}-{}-{nonce}", + std::process::id() + )) + } + + #[test] + fn explicit_kernel_wins() { + let root = temp_root("explicit"); + std::fs::create_dir_all(&root).unwrap(); + let explicit = root.join("custom-vmlinux"); + std::fs::write(&explicit, "kernel").unwrap(); + + let resolved = resolve_qemu_kernel_image(Some(explicit.clone()), &[]).unwrap(); + + assert_eq!(resolved, explicit); + std::fs::remove_dir_all(root).unwrap(); + } + + #[test] + fn finds_vmlinux_in_runtime_roots() { + let root = temp_root("runtime"); + let runtime = root.join("vm-runtime"); + std::fs::create_dir_all(&runtime).unwrap(); + let kernel = runtime.join("vmlinux"); + std::fs::write(&kernel, "kernel").unwrap(); + + let resolved = resolve_qemu_kernel_image(None, &[runtime]).unwrap(); + + assert_eq!(resolved, kernel); + 
std::fs::remove_dir_all(root).unwrap(); + } + + #[test] + fn reports_all_searched_roots_when_missing() { + let root = temp_root("missing"); + std::fs::create_dir_all(&root).unwrap(); + + let err = resolve_qemu_kernel_image(None, std::slice::from_ref(&root)).unwrap_err(); + + assert!(err.contains("BlueField QEMU kernel image not found")); + assert!(err.contains(root.to_string_lossy().as_ref())); + std::fs::remove_dir_all(root).unwrap(); + } +} From aa57ce942aa9ea227b685d79ae6643acfd056db4 Mon Sep 17 00:00:00 2001 From: Patrick Riel Date: Fri, 12 Jun 2026 21:38:44 +0000 Subject: [PATCH 09/14] feat(bluefield): preflight host qemu passthrough Signed-off-by: Patrick Riel --- .../bf-vm/src/extension.rs | 12 ++ .../bf-vm/src/lib.rs | 1 + .../bf-vm/src/preflight.rs | 184 ++++++++++++++++++ 3 files changed, 197 insertions(+) create mode 100644 crates/openshell-driver-bluefield/bf-vm/src/preflight.rs diff --git a/crates/openshell-driver-bluefield/bf-vm/src/extension.rs b/crates/openshell-driver-bluefield/bf-vm/src/extension.rs index a990aabb7..cc2020972 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/extension.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/extension.rs @@ -105,6 +105,18 @@ impl BluefieldExtension { let sysfs = SysfsRoot::system(); let slots = prepare_host_slots(HostSlotConfig::from(config), &sysfs, &host_pf)?; let kernel = qemu_kernel_from_config(config)?; + let qemu_kernel_image = kernel + .image + .clone() + .ok_or_else(|| "BlueField QEMU kernel image was not resolved".to_string())?; + crate::preflight::run_preflight( + &crate::preflight::RealHostProbe, + &crate::preflight::PreflightInput { + host_pf: host_pf.to_string(), + vf_bdfs: slots.iter().map(|slot| slot.host_bdf.clone()).collect(), + qemu_kernel_image, + }, + )?; let extension = Self::new(VfPool::new(slots)) .with_kernel(kernel) diff --git a/crates/openshell-driver-bluefield/bf-vm/src/lib.rs b/crates/openshell-driver-bluefield/bf-vm/src/lib.rs index 7abfb1095..7c1d7be89 100644 --- 
a/crates/openshell-driver-bluefield/bf-vm/src/lib.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/lib.rs @@ -10,6 +10,7 @@ pub mod extensions; pub mod guest_egress; mod host_pf; pub mod kernel; +mod preflight; mod qemu_kernel_resolver; mod slots; mod state; diff --git a/crates/openshell-driver-bluefield/bf-vm/src/preflight.rs b/crates/openshell-driver-bluefield/bf-vm/src/preflight.rs new file mode 100644 index 000000000..1001cfaca --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/src/preflight.rs @@ -0,0 +1,184 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Aggregated host preflight for the VM/QEMU BlueField passthrough path. + +use std::path::{Path, PathBuf}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct PreflightInput { + pub(crate) host_pf: String, + pub(crate) vf_bdfs: Vec, + pub(crate) qemu_kernel_image: PathBuf, +} + +pub(crate) trait HostProbe: std::fmt::Debug + Send + Sync { + fn command_exists(&self, name: &str) -> bool; + fn path_exists(&self, path: &Path) -> bool; + fn iommu_groups_populated(&self) -> bool; + fn vfio_pci_available(&self) -> bool; + fn check_passthrough(&self, bdf: &str) -> Result<(), String>; +} + +#[derive(Debug, Default)] +pub(crate) struct RealHostProbe; + +impl HostProbe for RealHostProbe { + fn command_exists(&self, name: &str) -> bool { + std::env::var_os("PATH") + .is_some_and(|paths| std::env::split_paths(&paths).any(|dir| dir.join(name).is_file())) + } + + fn path_exists(&self, path: &Path) -> bool { + path.exists() + } + + fn iommu_groups_populated(&self) -> bool { + std::fs::read_dir("/sys/kernel/iommu_groups") + .map(|mut entries| entries.next().is_some()) + .unwrap_or(false) + } + + fn vfio_pci_available(&self) -> bool { + Path::new("/sys/bus/pci/drivers/vfio-pci").exists() + || Path::new("/sys/module/vfio_pci").exists() + } + + fn check_passthrough(&self, bdf: &str) -> Result<(), String> 
{ + openshell_vfio::validate_pci_for_passthrough(&openshell_vfio::SysfsRoot::system(), bdf) + .map_err(|err| err.to_string()) + } +} + +pub(crate) fn run_preflight(probe: &dyn HostProbe, input: &PreflightInput) -> Result<(), String> { + let mut failures = Vec::new(); + + require_command(probe, "qemu-system-x86_64", &mut failures); + require_command(probe, "ip", &mut failures); + require_command(probe, "nft", &mut failures); + require_command(probe, "debugfs", &mut failures); + if !probe.command_exists("mkfs.ext4") && !probe.command_exists("mke2fs") { + failures.push("missing mkfs.ext4 or mke2fs; install e2fsprogs".to_string()); + } + if !probe.path_exists(Path::new("/dev/kvm")) { + failures.push("missing /dev/kvm; enable KVM virtualization on the host".to_string()); + } + if !probe.iommu_groups_populated() { + failures.push( + "IOMMU groups are not populated; boot with intel_iommu=on iommu=pt or amd_iommu=on iommu=pt" + .to_string(), + ); + } + if !probe.vfio_pci_available() { + failures.push("vfio-pci is not available; load it with modprobe vfio-pci".to_string()); + } + if input.vf_bdfs.is_empty() { + failures.push(format!( + "BlueField host PF {} has no usable VFs after reservations", + input.host_pf + )); + } + if !probe.path_exists(&input.qemu_kernel_image) { + failures.push(format!( + "BlueField QEMU kernel image does not exist: {}", + input.qemu_kernel_image.display() + )); + } + for vf in &input.vf_bdfs { + if let Err(reason) = probe.check_passthrough(vf) { + failures.push(format!("VF {vf} is not ready for passthrough: {reason}")); + } + } + + if failures.is_empty() { + return Ok(()); + } + + Err(format!( + "BlueField QEMU host preflight failed:\n- {}", + failures.join("\n- ") + )) +} + +fn require_command(probe: &dyn HostProbe, name: &str, failures: &mut Vec) { + if !probe.command_exists(name) { + failures.push(format!("missing {name} in PATH")); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashSet; + use std::path::PathBuf; + 
+ #[derive(Debug, Default)] + struct StubProbe { + commands: HashSet<&'static str>, + paths: HashSet<&'static str>, + iommu_groups: bool, + vfio_loaded: bool, + passthrough_ok: bool, + } + + impl HostProbe for StubProbe { + fn command_exists(&self, name: &str) -> bool { + self.commands.contains(name) + } + + fn path_exists(&self, path: &std::path::Path) -> bool { + self.paths.contains(path.to_string_lossy().as_ref()) + } + + fn iommu_groups_populated(&self) -> bool { + self.iommu_groups + } + + fn vfio_pci_available(&self) -> bool { + self.vfio_loaded + } + + fn check_passthrough(&self, _bdf: &str) -> Result<(), String> { + self.passthrough_ok + .then_some(()) + .ok_or_else(|| "IOMMU group conflict".to_string()) + } + } + + #[test] + fn preflight_reports_multiple_failures_together() { + let probe = StubProbe::default(); + let input = PreflightInput { + host_pf: "0000:b1:00.0".to_string(), + vf_bdfs: vec!["0000:b1:04.1".to_string()], + qemu_kernel_image: PathBuf::from("/missing/vmlinux"), + }; + + let err = run_preflight(&probe, &input).unwrap_err(); + + assert!(err.contains("qemu-system-x86_64")); + assert!(err.contains("/dev/kvm")); + assert!(err.contains("IOMMU")); + assert!(err.contains("vfio-pci")); + assert!(err.contains("IOMMU group conflict")); + assert!(err.contains("/missing/vmlinux")); + } + + #[test] + fn preflight_passes_when_all_required_inputs_are_present() { + let probe = StubProbe { + commands: HashSet::from(["qemu-system-x86_64", "ip", "nft", "debugfs", "mkfs.ext4"]), + paths: HashSet::from(["/dev/kvm", "/runtime/vmlinux"]), + iommu_groups: true, + vfio_loaded: true, + passthrough_ok: true, + }; + let input = PreflightInput { + host_pf: "0000:b1:00.0".to_string(), + vf_bdfs: vec!["0000:b1:04.1".to_string()], + qemu_kernel_image: PathBuf::from("/runtime/vmlinux"), + }; + + run_preflight(&probe, &input).unwrap(); + } +} From b05de0dc1e5ceffdc938e2c50ebceebc07cea12d Mon Sep 17 00:00:00 2001 From: Patrick Riel Date: Fri, 12 Jun 2026 21:39:31 +0000 
Subject: [PATCH 10/14] test(bluefield): cover driver-owned qemu launch details Signed-off-by: Patrick Riel --- .../bf-vm/src/extension.rs | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/crates/openshell-driver-bluefield/bf-vm/src/extension.rs b/crates/openshell-driver-bluefield/bf-vm/src/extension.rs index cc2020972..5c9b54154 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/extension.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/extension.rs @@ -578,6 +578,40 @@ mod tests { assert_eq!(passthrough_bdfs(&plan), vec!["0000:03:00.2"]); } + #[tokio::test] + async fn configure_launch_sets_qemu_requirements_without_manual_internal_args() { + let extension = ext(VfPool::new([ + VfSlot::new("vf29", "0000:b1:04.1").with_vf_index(29), + ])) + .with_kernel(BluefieldKernel::from_image("/runtime/vmlinux")); + + let mut plan = sample_plan(); + extension + .configure_launch(&sandbox("sandbox-bluefield"), &PathBuf::from("/tmp/s"), &mut plan) + .await + .unwrap(); + + assert!( + plan.required_backend_features + .contains(&BackendFeature::PciPassthrough) + ); + assert!( + plan.required_backend_features + .contains(&BackendFeature::GuestInitDropins) + ); + assert!( + plan.required_backend_features + .contains(&BackendFeature::ExternalKernelImage) + ); + assert_eq!(plan.kernel_image.as_deref(), Some(Path::new("/runtime/vmlinux"))); + assert!(plan.tap_device.is_none()); + assert!(plan.guest_ip.is_none()); + assert!(plan.host_ip.is_none()); + assert!(plan.vsock_cid.is_none()); + assert!(plan.guest_mac.is_none()); + assert!(plan.gateway_port.is_none()); + } + #[tokio::test] async fn before_restore_reclaims_and_records() { let state = state_dir("restore"); From 63ce99d3e2aa3780b7b928bafaa077bd85179486 Mon Sep 17 00:00:00 2001 From: Patrick Riel Date: Fri, 12 Jun 2026 21:40:25 +0000 Subject: [PATCH 11/14] fix(bluefield): preserve startup preflight errors Signed-off-by: Patrick Riel --- .../bf-driver/src/main.rs | 22 ++++++++++++++++--- 1 file changed, 
19 insertions(+), 3 deletions(-) diff --git a/crates/openshell-driver-bluefield/bf-driver/src/main.rs b/crates/openshell-driver-bluefield/bf-driver/src/main.rs index 9e0d2c61c..e7bee24f5 100644 --- a/crates/openshell-driver-bluefield/bf-driver/src/main.rs +++ b/crates/openshell-driver-bluefield/bf-driver/src/main.rs @@ -207,8 +207,7 @@ async fn main() -> Result<()> { ); } - let bluefield_config = - bluefield_config_from_args(&args).map_err(|err| miette::miette!("{err}"))?; + let bluefield_config = bluefield_config_from_args(&args).map_err(bluefield_driver_error)?; // This stage runs the workload-side VM driver that binds a VF per // sandbox. The leader/control-plane role is layered on in a later stage. @@ -224,7 +223,7 @@ async fn build_vm_driver(args: &Args, bluefield: BluefieldDriverConfig) -> Resul .ok_or_else(|| miette!("OPENSHELL_GRPC_ENDPOINT is required"))?; let extension_config = ExtensionRuntimeConfig { bluefield }; let lifecycle_extensions = - build_lifecycle_extensions(&extension_config).map_err(|err| miette!("{err}"))?; + build_lifecycle_extensions(&extension_config).map_err(bluefield_driver_error)?; VmDriver::new_with_extensions( VmDriverConfig { @@ -259,6 +258,10 @@ fn bluefield_config_from_args(args: &Args) -> std::result::Result miette::Report { + miette!("{message}") +} + /// Serve any `ComputeDriver` over the selected listener. Shared by every role /// so the leader and the workload driver are served identically. 
async fn serve_compute_driver(driver: T, listen_mode: ComputeDriverListenMode) -> Result<()> @@ -785,4 +788,17 @@ mod tests { Some("http://127.0.0.1:8080") ); } + + #[test] + fn build_driver_error_preserves_bluefield_preflight_text() { + let err = super::bluefield_driver_error( + "BlueField QEMU host preflight failed:\n- missing qemu-system-x86_64", + ); + + assert!( + err.to_string() + .contains("BlueField QEMU host preflight failed") + ); + assert!(err.to_string().contains("missing qemu-system-x86_64")); + } } From a08ebe11ed8c2e2f4c81d9efd3201f10ae3a1193 Mon Sep 17 00:00:00 2001 From: Patrick Riel Date: Fri, 12 Jun 2026 21:41:00 +0000 Subject: [PATCH 12/14] docs(bluefield): document simplified driver startup Signed-off-by: Patrick Riel --- crates/openshell-driver-bluefield/README.md | 51 +++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 crates/openshell-driver-bluefield/README.md diff --git a/crates/openshell-driver-bluefield/README.md b/crates/openshell-driver-bluefield/README.md new file mode 100644 index 000000000..685378088 --- /dev/null +++ b/crates/openshell-driver-bluefield/README.md @@ -0,0 +1,51 @@ +# openshell-driver-bluefield + +`openshell-driver-bluefield` is the BlueField compute driver for OpenShell. +The current backend wraps the VM compute driver with a BlueField lifecycle +extension that claims one host VF per sandbox, binds the VF to `vfio-pci`, +passes it into the QEMU guest, and configures the guest data-plane NIC. + +## Operator Contract + +Install QEMU and prepare the host once for the current VM backend: + +- KVM is available at `/dev/kvm`. +- IOMMU groups are populated. +- `vfio-pci` is loaded. +- The BlueField or ConnectX PF has SR-IOV VFs. +- `qemu-system-x86_64`, `ip`, `nft`, `debugfs`, and `mkfs.ext4` or `mke2fs` + are on `PATH`. +- A BlueField-capable `vmlinux` is present in the OpenShell `vm-runtime` + directory, or `OPENSHELL_BLUEFIELD_KERNEL_IMAGE` points to it. 
+ +Then run the driver as a normal OpenShell compute driver: + +```shell +OPENSHELL_COMPUTE_DRIVER_SOCKET=/run/openshell/bluefield.sock \ +OPENSHELL_GRPC_ENDPOINT=http://127.0.0.1:8080 \ +openshell-driver-bluefield +``` + +If the host has more than one usable PF, select one: + +```shell +OPENSHELL_BLUEFIELD_HOST_PF=enp177s0f0np0 +``` + +Reserve VFs that are owned by DRA, another service, or manual testing: + +```shell +OPENSHELL_BLUEFIELD_RESERVED_VF_INDEXES=0,1,2 +``` + +Do not call `--internal-run-vm` directly for normal operation. The driver +creates root and overlay disks, tap devices, guest IPs, guest MACs, vsock CIDs, +and QEMU passthrough arguments internally. + +Docker and Kubernetes BlueField runtimes will have their own prerequisites. +They should not inherit the VM/QEMU guest-kernel requirement. + +## Failure Model + +At startup the VM backend runs preflight and reports all missing host +prerequisites in one message. Fix the listed host issues and restart the driver. From 7604abf36e7e3eed4f42bb0e7e9f85a63467e51b Mon Sep 17 00:00:00 2001 From: Patrick Riel Date: Sat, 13 Jun 2026 20:06:23 +0000 Subject: [PATCH 13/14] refactor(bluefield): generalize VF handles into kind-aware network functions Rename the VF-specific inventory/handle types into runtime-neutral network-function types so SF and future container/Kubernetes adapters can reuse the discovery, allocation, and assignment contracts: - bf-core: VfRef -> NetFunction, VfSlot -> FunctionSlot (with a FunctionKind { Vf, Sf } discriminant and a generic `index`), drop VM-centric guest_* field names (guest_mac -> mac, guest_datapath_address -> datapath_address, AttachSpec guest_ip -> endpoint_ip). BluefieldAssignment carries `kind` and uses generalized label keys. - bf-inventory: VfInventory -> FunctionInventory, VfPool -> FunctionPool, StaticVfInventory -> StaticFunctionInventory, VfError/VfResult -> InventoryError/InventoryResult. Sysfs VF/representor impls keep their kind-specific names. 
- bf-vm: update all call sites; VFIO binding mechanism names retained. Also restructure the bluefield README into a package-marker overview that links the bf-vm implementation guide. --- Cargo.lock | 1 + crates/openshell-driver-bluefield/README.md | 122 ++++--- .../bf-core/src/assignment.rs | 60 ++-- .../bf-core/src/handles.rs | 108 +++++-- .../bf-core/src/lib.rs | 2 +- .../bf-inventory/src/inventory.rs | 88 ++--- .../bf-inventory/src/lib.rs | 7 +- .../bf-inventory/src/pool.rs | 58 ++-- .../bf-vm/Cargo.toml | 1 + .../bf-vm/README.md | 306 ++++++++++++++++++ .../bf-vm/scripts/guest-egress-dropin.sh | 23 ++ .../bf-vm/src/config.rs | 59 +++- .../bf-vm/src/extension.rs | 219 ++++++++++--- .../bf-vm/src/guest_egress.rs | 31 +- .../bf-vm/src/host_pf.rs | 6 +- .../bf-vm/src/slots.rs | 32 +- .../bf-vm/src/state.rs | 34 +- .../bf-vm/src/vf.rs | 23 +- .../scripts/openshell-vm-sandbox-init.sh | 16 +- crates/openshell-driver-vm/src/rootfs.rs | 10 + 20 files changed, 955 insertions(+), 251 deletions(-) create mode 100644 crates/openshell-driver-bluefield/bf-vm/README.md diff --git a/Cargo.lock b/Cargo.lock index 6c380e36b..b0a13cff1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -474,6 +474,7 @@ dependencies = [ "tokio", "tonic", "tracing", + "url", ] [[package]] diff --git a/crates/openshell-driver-bluefield/README.md b/crates/openshell-driver-bluefield/README.md index 685378088..553e7e60e 100644 --- a/crates/openshell-driver-bluefield/README.md +++ b/crates/openshell-driver-bluefield/README.md @@ -1,51 +1,95 @@ # openshell-driver-bluefield -`openshell-driver-bluefield` is the BlueField compute driver for OpenShell. -The current backend wraps the VM compute driver with a BlueField lifecycle -extension that claims one host VF per sandbox, binds the VF to `vfio-pci`, -passes it into the QEMU guest, and configures the guest data-plane NIC. - -## Operator Contract - -Install QEMU and prepare the host once for the current VM backend: - -- KVM is available at `/dev/kvm`. 
-- IOMMU groups are populated. -- `vfio-pci` is loaded. -- The BlueField or ConnectX PF has SR-IOV VFs. -- `qemu-system-x86_64`, `ip`, `nft`, `debugfs`, and `mkfs.ext4` or `mke2fs` - are on `PATH`. -- A BlueField-capable `vmlinux` is present in the OpenShell `vm-runtime` - directory, or `OPENSHELL_BLUEFIELD_KERNEL_IMAGE` points to it. - -Then run the driver as a normal OpenShell compute driver: - -```shell -OPENSHELL_COMPUTE_DRIVER_SOCKET=/run/openshell/bluefield.sock \ -OPENSHELL_GRPC_ENDPOINT=http://127.0.0.1:8080 \ -openshell-driver-bluefield -``` +> Status: Experimental. The current BlueField compute driver variant is +> `bf-vm`, which extends the VM compute driver with BlueField VF passthrough +> and VF-backed guest egress. + +Host-side BlueField compute driver for OpenShell. This crate is a package +marker — a workspace anchor for the private `bf-*` implementation crates that +intentionally re-exports nothing. The driver runs sandbox workloads on a worker +host while offloading egress to a BlueField DPU: each sandbox claims one host +VF, which is bound to `vfio-pci` and passed through to the runtime as the +sandbox's egress NIC. The agent workload still runs behind the normal OpenShell +veth-to-policy-proxy path and never sees the VF directly. + +## How it fits together -If the host has more than one usable PF, select one: +```mermaid +flowchart LR + subgraph host["Worker host"] + gateway["openshell-gateway"] + driver["openshell-driver-bluefield
(bf-driver binary)
├── bf-vm (lifecycle extension)
├── bf-inventory (VF discovery)
└── bf-core (contracts)"] + vf["Host VF
vfio-pci"] + gateway <-->|"gRPC over private UDS"| driver + driver -->|"claim / bind / pass through"| vf + end -```shell -OPENSHELL_BLUEFIELD_HOST_PF=enp177s0f0np0 + subgraph guest["Sandbox runtime (QEMU guest today)"] + proxy["OpenShell policy proxy"] + guestvf["BlueField VF NIC"] + agent["agent process"] + agent -->|"veth"| proxy --> guestvf + end + + subgraph dpu["BlueField / DPU"] + rep["VF representor"] + policy["DPU policy / proxy path"] + end + + driver -->|"launch + attach VF"| guest + gateway -.->|"sandbox callback"| proxy + vf -.-> guestvf + guestvf --> rep --> policy --> upstream["gateway / internet"] ``` -Reserve VFs that are owned by DRA, another service, or manual testing: +The gateway runs as a host process and spawns the BlueField driver as a +subprocess over a private Unix socket. For each sandbox the driver selects a +free host VF, binds it to `vfio-pci`, launches the sandbox runtime with the VF +attached, wires the VF as the guest egress NIC, and restores the VF to its +original binding on teardown. The resulting datapath is: -```shell -OPENSHELL_BLUEFIELD_RESERVED_VF_INDEXES=0,1,2 +```text +agent -> veth -> OpenShell policy proxy -> VF -> DPU representor -> gateway/internet ``` -Do not call `--internal-run-vm` directly for normal operation. The driver -creates root and overlay disks, tap devices, guest IPs, guest MACs, vsock CIDs, -and QEMU passthrough arguments internally. +## Crate Layout + +The implementation is split into private `bf-*` crates so the build and review +boundaries match responsibilities: + +| Crate | Role | +|---|---| +| `bf-core` | Shared contracts: network-function handles (`NetFunction`, `FunctionSlot`), DPU claims and network/storage modes, the `BluefieldLifecycleExtension` and `RuntimeAdapter` traits, sandbox state records, and error types. Holds no host I/O. | +| `bf-inventory` | VF discovery and allocation: sysfs-backed VF and representor inventories, a static inventory for tests, and the `FunctionPool` allocator that hands out slots.
| +| `bf-vm` | The current driver implementation. A BlueField lifecycle extension over the VM compute driver that handles preflight, VF binding, guest egress wiring, host PF resolution, and guest kernel selection. | +| `bf-driver` | The external driver binary (`openshell-driver-bluefield`) that the gateway spawns. Wires the chosen implementation to the gRPC driver transport. | + +## Implementations + +A BlueField implementation pairs a sandbox runtime with the VF passthrough and +egress contract above. Each implementation documents its own requirements, +configuration, and validation steps. + +| Implementation | Description | README | +|---|---|---| +| `bf-vm` | VM runtime adapter: BlueField VF passthrough and VF-backed guest egress on a QEMU-backed guest. The current driver variant. | [bf-vm/README.md](bf-vm/README.md) | + +## Driver Contract + +Regardless of implementation, the BlueField driver owns these +security-relevant behaviors: -Docker and Kubernetes BlueField runtimes will have their own prerequisites. -They should not inherit the VM/QEMU guest-kernel requirement. +| Behavior | Purpose | +|---|---| +| VF selection | Allocates VFs only from the configured PF, skipping reserved indexes and VFs owned by DRA, Kubernetes, or other services. | +| `vfio-pci` binding | Rebinds the selected VF to `vfio-pci` for passthrough and restores the original binding on sandbox teardown. | +| Egress placement | Configures the VF as the guest root-namespace egress NIC; the agent keeps using the veth-to-policy-proxy path and never receives the VF directly. | +| Preflight gating | Refuses to start unless host prerequisites (IOMMU, `/dev/kvm`, `vfio-pci`, required tools, a BlueField-capable guest kernel) are satisfied. | +| Lifecycle ownership | The gateway owns the driver subprocess; the driver owns VF claim, runtime launch, and cleanup so leaked VFs are returned to the host. 
| -## Failure Model +## Build and Deploy -At startup the VM backend runs preflight and reports all missing host -prerequisites in one message. Fix the listed host issues and restart the driver. +The driver binary is built from `bf-driver` and installed where the gateway's +VM driver path expects it. Build commands, install layout, gateway and driver +configuration, sandbox lifecycle, network verification, and troubleshooting +live with the implementation in [`bf-vm/README.md`](bf-vm/README.md). diff --git a/crates/openshell-driver-bluefield/bf-core/src/assignment.rs b/crates/openshell-driver-bluefield/bf-core/src/assignment.rs index 3a36f27f5..7f1ce9ffb 100644 --- a/crates/openshell-driver-bluefield/bf-core/src/assignment.rs +++ b/crates/openshell-driver-bluefield/bf-core/src/assignment.rs @@ -1,8 +1,9 @@ -//! The VF assignment the control-plane leader hands to a compute node. +//! The network-function assignment the control-plane leader hands to a compute +//! node. //! -//! The leader allocates a VF, programs OVS via the DPU controller, then stamps -//! the resulting assignment into the sandbox's `template.labels`. The -//! compute-node role reads it back and binds exactly that VF. Carrying the +//! The leader allocates a function, programs OVS via the DPU controller, then +//! stamps the resulting assignment into the sandbox's `template.labels`. The +//! compute-node role reads it back and binds exactly that function. Carrying the //! assignment as labels keeps it on the existing `ComputeDriver` contract with //! no new proto, and makes it policy-stamped (a guest cannot forge it). @@ -10,31 +11,37 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; +use crate::FunctionKind; + /// Label key prefix for all BlueField assignment fields. 
pub const LABEL_PREFIX: &str = "openshell.io/bluefield."; pub const LABEL_HOST_BDF: &str = "openshell.io/bluefield.host-bdf"; pub const LABEL_LEASE_GENERATION: &str = "openshell.io/bluefield.lease-generation"; -pub const LABEL_GUEST_MAC: &str = "openshell.io/bluefield.guest-mac"; +pub const LABEL_MAC: &str = "openshell.io/bluefield.mac"; pub const LABEL_ATTACHMENT_ID: &str = "openshell.io/bluefield.attachment-id"; +pub const LABEL_KIND: &str = "openshell.io/bluefield.kind"; pub const LABEL_PF: &str = "openshell.io/bluefield.pf"; -pub const LABEL_VF_INDEX: &str = "openshell.io/bluefield.vf-index"; +pub const LABEL_INDEX: &str = "openshell.io/bluefield.index"; -/// A leader-decided VF assignment for one sandbox. +/// A leader-decided network-function assignment for one sandbox. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct BluefieldAssignment { - /// Host PCI BDF of the VF the compute node must bind. + /// Host PCI BDF of the function the compute node must bind. pub host_bdf: String, /// Controller lease generation. Carried for correlation/fencing; the /// compute node never detaches, so it does not act on this directly. pub lease_generation: u64, - /// Guest-visible VF MAC (the leader derives this deterministically). - pub guest_mac: String, + /// Guest-visible function MAC (the leader derives this deterministically). + pub mac: String, /// Controller attachment id, for logging/correlation. pub attachment_id: String, + /// Kind of network function the leader allocated. Defaults to `Vf`. + pub kind: FunctionKind, /// Optional cross-host coordinate; not required to bind. pub pf: Option, - pub vf_index: Option, + /// Identity index within the parent PF (`vf_index`, `sf_num`, ...). 
+ pub index: Option, } impl BluefieldAssignment { @@ -54,14 +61,15 @@ impl BluefieldAssignment { LABEL_LEASE_GENERATION.to_string(), self.lease_generation.to_string(), ), - (LABEL_GUEST_MAC.to_string(), self.guest_mac.clone()), + (LABEL_MAC.to_string(), self.mac.clone()), (LABEL_ATTACHMENT_ID.to_string(), self.attachment_id.clone()), + (LABEL_KIND.to_string(), self.kind.as_str().to_string()), ]; if let Some(pf) = &self.pf { out.push((LABEL_PF.to_string(), pf.clone())); } - if let Some(vf_index) = self.vf_index { - out.push((LABEL_VF_INDEX.to_string(), vf_index.to_string())); + if let Some(index) = self.index { + out.push((LABEL_INDEX.to_string(), index.to_string())); } out } @@ -85,20 +93,26 @@ impl BluefieldAssignment { }; let host_bdf = required(LABEL_HOST_BDF)?; - let guest_mac = required(LABEL_GUEST_MAC)?; + let mac = required(LABEL_MAC)?; let attachment_id = required(LABEL_ATTACHMENT_ID)?; let lease_generation = required(LABEL_LEASE_GENERATION)? .parse::() .map_err(|err| format!("invalid {LABEL_LEASE_GENERATION}: {err}"))?; + let kind = match labels.get(LABEL_KIND).map(|v| v.trim()) { + Some(v) if !v.is_empty() => { + FunctionKind::parse(v).ok_or_else(|| format!("invalid {LABEL_KIND}: {v}"))? 
+ } + _ => FunctionKind::Vf, + }; let pf = labels .get(LABEL_PF) .map(|v| v.trim().to_string()) .filter(|v| !v.is_empty()); - let vf_index = match labels.get(LABEL_VF_INDEX).map(|v| v.trim()) { + let index = match labels.get(LABEL_INDEX).map(|v| v.trim()) { Some(v) if !v.is_empty() => Some( v.parse::() - .map_err(|err| format!("invalid {LABEL_VF_INDEX}: {err}"))?, + .map_err(|err| format!("invalid {LABEL_INDEX}: {err}"))?, ), _ => None, }; @@ -106,10 +120,11 @@ impl BluefieldAssignment { Ok(Self { host_bdf, lease_generation, - guest_mac, + mac, attachment_id, + kind, pf, - vf_index, + index, }) } } @@ -122,10 +137,11 @@ mod tests { BluefieldAssignment { host_bdf: "0000:03:00.2".to_string(), lease_generation: 42, - guest_mac: "02:00:00:00:00:01".to_string(), + mac: "02:00:00:00:00:01".to_string(), attachment_id: "bf-sb-1".to_string(), + kind: FunctionKind::Vf, pf: Some("0".to_string()), - vf_index: Some(3), + index: Some(3), } } @@ -145,7 +161,7 @@ mod tests { fn round_trips_without_optional_coordinate() { let assignment = BluefieldAssignment { pf: None, - vf_index: None, + index: None, ..sample() }; let mut labels = HashMap::new(); diff --git a/crates/openshell-driver-bluefield/bf-core/src/handles.rs b/crates/openshell-driver-bluefield/bf-core/src/handles.rs index 2cdb6a1f0..2306f8dcf 100644 --- a/crates/openshell-driver-bluefield/bf-core/src/handles.rs +++ b/crates/openshell-driver-bluefield/bf-core/src/handles.rs @@ -1,32 +1,81 @@ //! Shared BlueField handles that cross the driver, host, and DPU seam. +use serde::{Deserialize, Serialize}; + +/// The kind of network function backing a sandbox. +/// +/// BlueField can hand a sandbox different function types depending on the +/// runtime and fabric configuration. The discovery and allocation layers are +/// kind-agnostic; this discriminant lets a consumer (and the attach mechanism) +/// know which kind a slot represents. 
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum FunctionKind { + /// SR-IOV virtual function (the `bf-vm` passthrough path). + #[default] + Vf, + /// Scalable Function (e.g. container/Kubernetes adapters via `mlnx-sf`). + Sf, +} + +impl FunctionKind { + /// Stable wire/label string for this kind. + #[must_use] + pub fn as_str(self) -> &'static str { + match self { + Self::Vf => "vf", + Self::Sf => "sf", + } + } + + /// Parse a [`FunctionKind`] from its wire/label string. + #[must_use] + pub fn parse(s: &str) -> Option { + match s.trim() { + "vf" => Some(Self::Vf), + "sf" => Some(Self::Sf), + _ => None, + } + } +} + #[derive(Debug, Clone, PartialEq, Eq)] -pub struct VfSlot { +pub struct FunctionSlot { pub id: String, pub host_bdf: String, + pub kind: FunctionKind, pub pf: Option, - pub vf_index: Option, + /// Identity index within the parent PF. `vf_index` for a VF, `sf_num` for + /// an SF, function index for a virtio-net device. 
+ pub index: Option, pub representor: Option, pub ovs_port: Option, - pub guest_datapath_address: Option, - pub guest_mac: Option, + pub datapath_address: Option, + pub mac: Option, } -impl VfSlot { +impl FunctionSlot { #[must_use] pub fn new(id: impl Into, host_bdf: impl Into) -> Self { Self { id: id.into(), host_bdf: host_bdf.into(), + kind: FunctionKind::Vf, pf: None, - vf_index: None, + index: None, representor: None, ovs_port: None, - guest_datapath_address: None, - guest_mac: None, + datapath_address: None, + mac: None, } } + #[must_use] + pub fn with_kind(mut self, kind: FunctionKind) -> Self { + self.kind = kind; + self + } + #[must_use] pub fn with_pf(mut self, pf: impl Into) -> Self { self.pf = Some(pf.into()); @@ -34,8 +83,8 @@ impl VfSlot { } #[must_use] - pub fn with_vf_index(mut self, vf_index: u32) -> Self { - self.vf_index = Some(vf_index); + pub fn with_index(mut self, index: u32) -> Self { + self.index = Some(index); self } @@ -52,40 +101,49 @@ impl VfSlot { } #[must_use] - pub fn with_guest_datapath_address(mut self, address: impl Into) -> Self { - self.guest_datapath_address = Some(address.into()); + pub fn with_datapath_address(mut self, address: impl Into) -> Self { + self.datapath_address = Some(address.into()); self } #[must_use] - pub fn with_guest_mac(mut self, mac: impl Into) -> Self { - self.guest_mac = Some(mac.into()); + pub fn with_mac(mut self, mac: impl Into) -> Self { + self.mac = Some(mac.into()); self } #[must_use] - pub fn vf_ref(&self) -> Option { - match (&self.pf, self.vf_index) { - (Some(pf), Some(idx)) => Some(VfRef::new(pf.clone(), idx)), + pub fn net_function(&self) -> Option { + match (&self.pf, self.index) { + (Some(pf), Some(idx)) => Some(NetFunction::new(pf.clone(), idx).with_kind(self.kind)), _ => None, } } } +/// A reference to a single network function: `(kind, pf, index)`. 
#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct VfRef { +pub struct NetFunction { + pub kind: FunctionKind, pub pf: String, - pub vf_index: u32, + pub index: u32, } -impl VfRef { +impl NetFunction { #[must_use] - pub fn new(pf: impl Into, vf_index: u32) -> Self { + pub fn new(pf: impl Into, index: u32) -> Self { Self { + kind: FunctionKind::Vf, pf: pf.into(), - vf_index, + index, } } + + #[must_use] + pub fn with_kind(mut self, kind: FunctionKind) -> Self { + self.kind = kind; + self + } } #[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] @@ -98,11 +156,11 @@ pub enum ProxyPlacement { #[derive(Debug, Clone, PartialEq, Eq)] pub struct AttachSpec { pub sandbox_id: String, - pub vf: VfRef, + pub function: NetFunction, pub host_bdf: String, pub representor: Option, - pub guest_ip: Option, - pub guest_mac: Option, + pub endpoint_ip: Option, + pub mac: Option, pub openshell_endpoint: Option, pub sandbox_token: Option, } diff --git a/crates/openshell-driver-bluefield/bf-core/src/lib.rs b/crates/openshell-driver-bluefield/bf-core/src/lib.rs index ce31f5444..a29a1ae3b 100644 --- a/crates/openshell-driver-bluefield/bf-core/src/lib.rs +++ b/crates/openshell-driver-bluefield/bf-core/src/lib.rs @@ -12,7 +12,7 @@ pub mod state; pub use assignment::BluefieldAssignment; pub use claim::{DpuClaim, NetworkMode, StorageMode}; pub use error::{BluefieldError, Result}; -pub use handles::{AttachSpec, ProxyPlacement, VfRef, VfSlot}; +pub use handles::{AttachSpec, FunctionKind, FunctionSlot, NetFunction, ProxyPlacement}; pub use lifecycle::{ BluefieldLifecycleExtension, LaunchAbortReason, LifecycleActivation, LifecycleContext, LifecycleRegistry, RestoreContext, RuntimePlan, SandboxIdentity, diff --git a/crates/openshell-driver-bluefield/bf-inventory/src/inventory.rs b/crates/openshell-driver-bluefield/bf-inventory/src/inventory.rs index 8df572872..fa4fd57a8 100644 --- a/crates/openshell-driver-bluefield/bf-inventory/src/inventory.rs +++ 
b/crates/openshell-driver-bluefield/bf-inventory/src/inventory.rs @@ -1,81 +1,87 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! VF inventory discovery. +//! Network-function inventory discovery. //! -//! Replaces hand-fed VF slots with discovery behind a trait, so the same +//! Replaces hand-fed function slots with discovery behind a trait, so the same //! extension works in both topologies and is unit-testable against a mock //! `/sys` (no hardware): //! -//! - [`StaticVfInventory`] — an explicit slot list (pinned setups, tests). +//! - [`StaticFunctionInventory`] — an explicit slot list (pinned setups, tests). //! - [`SysfsVfInventory`] — **host** side: enumerates a PF's VFs from //! `/sys/bus/pci/devices//virtfn` to get each VF's BDF + index. //! - [`SysfsRepresentorInventory`] — **DPU** side: enumerates switchdev //! representor netdevs and reads `phys_port_name` (`pfXvfY`) to map a VF //! coordinate to its representor / OVS port. //! -//! The two sides agree on a [`VfRef`] = `(pf, vf_index)`. The host uses the -//! PF's PCI BDF as the `pf` key; the DPU uses the e-switch PF index. Mapping +//! Other function kinds (SF, virtio-net) plug in as additional implementations +//! of [`FunctionInventory`] without changing the allocation layer. +//! +//! The two sides agree on a [`NetFunction`] = `(kind, pf, index)`. The host uses +//! the PF's PCI BDF as the `pf` key; the DPU uses the e-switch PF index. Mapping //! one to the other on a given deployment is a config concern (the host PF //! BDF that backs `pf0`), kept out of this mechanical discovery layer. use std::path::PathBuf; -use bf_core::{VfRef, VfSlot}; +use bf_core::{FunctionSlot, NetFunction}; use openshell_vfio::SysfsRoot; /// Error surface for inventory discovery. 
#[derive(Debug, Clone)] -pub enum VfError { +pub enum InventoryError { Discovery(String), } -impl core::fmt::Display for VfError { +impl core::fmt::Display for InventoryError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { - Self::Discovery(m) => write!(f, "vf discovery failed: {m}"), + Self::Discovery(m) => write!(f, "function discovery failed: {m}"), } } } -impl std::error::Error for VfError {} +impl std::error::Error for InventoryError {} -pub type VfResult = Result; +pub type InventoryResult = Result; -/// Source of the VF slots a [`super::pool::VfPool`] hands to sandboxes. -pub trait VfInventory: core::fmt::Debug + Send + Sync { - /// Enumerate all VF slots this inventory knows about. - fn discover(&self) -> VfResult>; +/// Source of the function slots a [`super::pool::FunctionPool`] hands to +/// sandboxes. +pub trait FunctionInventory: core::fmt::Debug + Send + Sync { + /// Enumerate all function slots this inventory knows about. + fn discover(&self) -> InventoryResult>; - /// Resolve the representor for a VF coordinate. Defaults to a scan of + /// Resolve the representor for a function coordinate. Defaults to a scan of /// [`discover`](Self::discover); sysfs impls may override for efficiency. - fn resolve_representor(&self, vf: &VfRef) -> VfResult> { + fn resolve_representor(&self, function: &NetFunction) -> InventoryResult> { Ok(self .discover()? .into_iter() - .find(|s| s.pf.as_deref() == Some(vf.pf.as_str()) && s.vf_index == Some(vf.vf_index)) + .find(|s| { + s.pf.as_deref() == Some(function.pf.as_str()) && s.index == Some(function.index) + }) .and_then(|s| s.representor)) } } -/// Explicit, hand-fed inventory. Equivalent to the original `VfPool::new` +/// Explicit, hand-fed inventory. Equivalent to the original `FunctionPool::new` /// behavior; ideal for tests and pinned deployments. 
#[derive(Debug, Default, Clone)] -pub struct StaticVfInventory { - slots: Vec, +pub struct StaticFunctionInventory { + slots: Vec, } -impl StaticVfInventory { +impl StaticFunctionInventory { #[must_use] - pub fn new(slots: impl IntoIterator) -> Self { + pub fn new(slots: impl IntoIterator) -> Self { Self { slots: slots.into_iter().collect(), } } } -impl VfInventory for StaticVfInventory { - fn discover(&self) -> VfResult> { +impl FunctionInventory for StaticFunctionInventory { + fn discover(&self) -> InventoryResult> { Ok(self.slots.clone()) } } @@ -101,8 +107,8 @@ impl SysfsVfInventory { } } -impl VfInventory for SysfsVfInventory { - fn discover(&self) -> VfResult> { +impl FunctionInventory for SysfsVfInventory { + fn discover(&self) -> InventoryResult> { let mut slots = Vec::new(); for pf in &self.pfs { let pf_dir = self.sysfs.pci_device(pf); @@ -114,23 +120,23 @@ impl VfInventory for SysfsVfInventory { break; } let target = std::fs::read_link(&link).map_err(|e| { - VfError::Discovery(format!("read_link {}: {e}", link.display())) + InventoryError::Discovery(format!("read_link {}: {e}", link.display())) })?; let vf_bdf = target .file_name() .and_then(|n| n.to_str()) .ok_or_else(|| { - VfError::Discovery(format!( + InventoryError::Discovery(format!( "virtfn target has no bdf: {}", target.display() )) })? 
.to_string(); - let mut slot = VfSlot::new(vf_bdf.clone(), vf_bdf.clone()) + let mut slot = FunctionSlot::new(vf_bdf.clone(), vf_bdf.clone()) .with_pf(pf.clone()) - .with_vf_index(index); + .with_index(index); if let Some(mac) = read_vf_mac(&self.sysfs, &vf_bdf) { - slot = slot.with_guest_mac(mac); + slot = slot.with_mac(mac); } slots.push(slot); } @@ -196,14 +202,14 @@ fn parse_phys_port_name(s: &str) -> Option<(u32, u32)> { Some((pf_num, vf_num)) } -impl VfInventory for SysfsRepresentorInventory { - fn discover(&self) -> VfResult> { +impl FunctionInventory for SysfsRepresentorInventory { + fn discover(&self) -> InventoryResult> { let mut slots = Vec::new(); let entries = std::fs::read_dir(&self.net_sysfs).map_err(|e| { - VfError::Discovery(format!("read_dir {}: {e}", self.net_sysfs.display())) + InventoryError::Discovery(format!("read_dir {}: {e}", self.net_sysfs.display())) })?; for entry in entries { - let entry = entry.map_err(|e| VfError::Discovery(e.to_string()))?; + let entry = entry.map_err(|e| InventoryError::Discovery(e.to_string()))?; let ifname = entry.file_name().to_string_lossy().into_owned(); let ppn_path = entry.path().join("phys_port_name"); let Ok(ppn) = std::fs::read_to_string(&ppn_path) else { @@ -213,9 +219,9 @@ impl VfInventory for SysfsRepresentorInventory { continue; }; slots.push( - VfSlot::new(ifname.clone(), String::new()) + FunctionSlot::new(ifname.clone(), String::new()) .with_pf(pf_index.to_string()) - .with_vf_index(vf_index) + .with_index(vf_index) .with_representor(ifname.clone()) .with_ovs_port(ifname), ); @@ -239,7 +245,7 @@ mod tests { } #[test] - fn sysfs_vf_inventory_reads_guest_mac_from_vf_netdev() { + fn sysfs_vf_inventory_reads_mac_from_vf_netdev() { let root = temp_sysfs_root("vf-mac"); let devices = root.join("bus/pci/devices"); let pf = devices.join("0000:03:00.0"); @@ -254,8 +260,8 @@ mod tests { assert_eq!(slots.len(), 1); assert_eq!(slots[0].host_bdf, "0000:03:00.2"); - assert_eq!(slots[0].vf_index, Some(0)); - 
assert_eq!(slots[0].guest_mac.as_deref(), Some("86:7f:6e:5b:e0:7b")); + assert_eq!(slots[0].index, Some(0)); + assert_eq!(slots[0].mac.as_deref(), Some("86:7f:6e:5b:e0:7b")); std::fs::remove_dir_all(root).unwrap(); } diff --git a/crates/openshell-driver-bluefield/bf-inventory/src/lib.rs b/crates/openshell-driver-bluefield/bf-inventory/src/lib.rs index 875e19740..dd2d74f72 100644 --- a/crates/openshell-driver-bluefield/bf-inventory/src/lib.rs +++ b/crates/openshell-driver-bluefield/bf-inventory/src/lib.rs @@ -3,12 +3,13 @@ //! BlueField function inventory, discovery, and allocation. -pub use bf_core::{VfRef, VfSlot}; +pub use bf_core::{FunctionKind, FunctionSlot, NetFunction}; pub mod inventory; pub mod pool; pub use inventory::{ - StaticVfInventory, SysfsRepresentorInventory, SysfsVfInventory, VfError, VfInventory, VfResult, + FunctionInventory, InventoryError, InventoryResult, StaticFunctionInventory, + SysfsRepresentorInventory, SysfsVfInventory, }; -pub use pool::VfPool; +pub use pool::FunctionPool; diff --git a/crates/openshell-driver-bluefield/bf-inventory/src/pool.rs b/crates/openshell-driver-bluefield/bf-inventory/src/pool.rs index 8fcbc892a..84d7f1bd9 100644 --- a/crates/openshell-driver-bluefield/bf-inventory/src/pool.rs +++ b/crates/openshell-driver-bluefield/bf-inventory/src/pool.rs @@ -1,42 +1,47 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! VF slot pool. In-memory claim/release of BlueField VFs to sandboxes. +//! Function slot pool. In-memory claim/release of BlueField functions to +//! sandboxes. use std::collections::HashMap; use std::sync::Mutex; -pub use bf_core::VfSlot; +pub use bf_core::FunctionSlot; -/// Inventory of VF slots with per-sandbox claim tracking. +/// Inventory of function slots with per-sandbox claim tracking. 
#[derive(Debug, Default)] -pub struct VfPool { - slots: Vec, +pub struct FunctionPool { + slots: Vec, /// sandbox_id -> slot index. claims: Mutex>, } -impl VfPool { +impl FunctionPool { #[must_use] - pub fn new(slots: impl IntoIterator) -> Self { + pub fn new(slots: impl IntoIterator) -> Self { Self { slots: slots.into_iter().collect(), claims: Mutex::new(HashMap::new()), } } - /// Build a pool from a [`VfInventory`](super::inventory::VfInventory), - /// discovering the available slots at startup instead of hand-feeding them. + /// Build a pool from a + /// [`FunctionInventory`](super::inventory::FunctionInventory), discovering + /// the available slots at startup instead of hand-feeding them. pub fn from_inventory( - inventory: &dyn crate::inventory::VfInventory, - ) -> Result { + inventory: &dyn crate::inventory::FunctionInventory, + ) -> Result { Ok(Self::new(inventory.discover()?)) } /// Claim a free slot for `sandbox_id`. Idempotent: a sandbox that already /// holds a slot gets the same one back. Returns `None` when exhausted. - pub fn claim(&self, sandbox_id: &str) -> Option { - let mut claims = self.claims.lock().expect("vf pool claims lock poisoned"); + pub fn claim(&self, sandbox_id: &str) -> Option { + let mut claims = self + .claims + .lock() + .expect("function pool claims lock poisoned"); if let Some(&idx) = claims.get(sandbox_id) { return self.slots.get(idx).cloned(); } @@ -48,8 +53,11 @@ impl VfPool { /// Claim a specific host BDF for `sandbox_id`. Idempotent for an existing /// matching claim and fails if another sandbox owns the slot. 
- pub fn claim_by_host_bdf(&self, sandbox_id: &str, host_bdf: &str) -> Option { - let mut claims = self.claims.lock().expect("vf pool claims lock poisoned"); + pub fn claim_by_host_bdf(&self, sandbox_id: &str, host_bdf: &str) -> Option { + let mut claims = self + .claims + .lock() + .expect("function pool claims lock poisoned"); let idx = self .slots .iter() @@ -68,7 +76,7 @@ impl VfPool { /// Return the slot with the given host BDF. #[must_use] - pub fn slot_by_host_bdf(&self, host_bdf: &str) -> Option { + pub fn slot_by_host_bdf(&self, host_bdf: &str) -> Option { self.slots .iter() .find(|slot| slot.host_bdf == host_bdf) @@ -79,18 +87,18 @@ impl VfPool { pub fn release(&self, sandbox_id: &str) { self.claims .lock() - .expect("vf pool claims lock poisoned") + .expect("function pool claims lock poisoned") .remove(sandbox_id); } } #[cfg(test)] mod tests { - use super::{VfPool, VfSlot}; + use super::{FunctionPool, FunctionSlot}; #[test] fn claim_is_idempotent_per_sandbox() { - let pool = VfPool::new([VfSlot::new("vf0", "0000:03:00.2")]); + let pool = FunctionPool::new([FunctionSlot::new("vf0", "0000:03:00.2")]); let a = pool.claim("sandbox-1").unwrap(); let b = pool.claim("sandbox-1").unwrap(); assert_eq!(a, b); @@ -98,9 +106,9 @@ mod tests { #[test] fn distinct_sandboxes_get_distinct_slots_and_release_frees() { - let pool = VfPool::new([ - VfSlot::new("vf0", "0000:03:00.2"), - VfSlot::new("vf1", "0000:03:00.3"), + let pool = FunctionPool::new([ + FunctionSlot::new("vf0", "0000:03:00.2"), + FunctionSlot::new("vf1", "0000:03:00.3"), ]); let s1 = pool.claim("sandbox-1").unwrap(); let s2 = pool.claim("sandbox-2").unwrap(); @@ -114,9 +122,9 @@ mod tests { #[test] fn claim_by_host_bdf_reuses_matching_restore_slot() { - let pool = VfPool::new([ - VfSlot::new("vf0", "0000:03:00.2"), - VfSlot::new("vf1", "0000:03:00.3"), + let pool = FunctionPool::new([ + FunctionSlot::new("vf0", "0000:03:00.2"), + FunctionSlot::new("vf1", "0000:03:00.3"), ]); let restored = 
pool.claim_by_host_bdf("sandbox-1", "0000:03:00.3").unwrap(); diff --git a/crates/openshell-driver-bluefield/bf-vm/Cargo.toml b/crates/openshell-driver-bluefield/bf-vm/Cargo.toml index 463f1d1a2..82671a3fd 100644 --- a/crates/openshell-driver-bluefield/bf-vm/Cargo.toml +++ b/crates/openshell-driver-bluefield/bf-vm/Cargo.toml @@ -23,6 +23,7 @@ sha2 = { workspace = true } tokio = { workspace = true } tonic = { workspace = true } tracing = { workspace = true } +url = { workspace = true } [dev-dependencies] tempfile = "3" diff --git a/crates/openshell-driver-bluefield/bf-vm/README.md b/crates/openshell-driver-bluefield/bf-vm/README.md new file mode 100644 index 000000000..a4554e926 --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-vm/README.md @@ -0,0 +1,306 @@ +# bf-vm + +> Status: Experimental. `bf-vm` is the current BlueField compute driver +> variant. It extends the VM compute driver with BlueField VF passthrough and +> VF-backed guest egress. + +`bf-vm` is the bare-metal VM runtime adapter for the OpenShell BlueField +driver. It wraps the VM compute driver with a BlueField lifecycle extension +that claims one host VF per sandbox, binds it to `vfio-pci`, passes it into the +QEMU guest, and configures the guest root namespace to use that VF as the +egress NIC. + +The agent workload still runs behind the normal OpenShell sandbox +veth-to-policy-proxy path. The agent does not see the VF directly. + +## Runtime Model + +```mermaid +flowchart LR + subgraph host["Worker host"] + gateway["openshell-gateway"] + driver["openshell-driver-bluefield
(installed as openshell-driver-vm today)"] + vf["Host VF
vfio-pci"] + gateway <-->|"gRPC over private UDS"| driver + driver -->|"bind / pass through"| vf + end + + subgraph guest["Sandbox VM"] + rootns["VM root namespace"] + agentns["Agent namespace"] + proxy["OpenShell policy proxy"] + guestvf["BlueField VF NIC"] + veth["veth pair"] + agent["agent process"] + + agent --> veth --> proxy --> guestvf + rootns --- proxy + rootns --- guestvf + agentns --- agent + end + + subgraph dpu["BlueField / DPU"] + rep["VF representor"] + policy["DPU policy / proxy path"] + end + + gateway -.->|"sandbox callback"| proxy + vf -.-> guestvf + guestvf --> rep --> policy --> upstream["gateway / internet"] +``` + +The current path is: + +```text +agent -> veth -> OpenShell policy proxy -> VF -> DPU representor -> gateway/internet +``` + +## bf-vm Contract + +The `bf-vm` variant is QEMU-backed. It has VM-specific requirements: + +- `/dev/kvm` exists. +- IOMMU is enabled, for example `intel_iommu=on iommu=pt`. +- IOMMU groups are populated. +- `vfio-pci` is loaded. +- `qemu-system-x86_64`, `ip`, `nft`, `debugfs`, and `mkfs.ext4` or `mke2fs` + are on `PATH`. +- The BlueField or ConnectX PF has SR-IOV VFs. +- At least one VF is not owned by DRA, Kubernetes, or another service. +- A BlueField-capable `vmlinux` is staged with the VM runtime assets. + +The `vmlinux` is the VM guest kernel, not the host kernel. It must boot the +OpenShell root disk without an initrd, so the root block device and filesystem +support needed by the VM image must be built in. 
+ +## Build + +From the OpenShell repo root: + +```shell +cargo build --release \ + -p openshell-cli \ + -p openshell-server \ + -p bf-driver +``` + +If package builds need bundled Z3 for the CLI and gateway, enable the +package-qualified features: + +```shell +cargo build --release \ + -p openshell-cli \ + -p openshell-server \ + -p bf-driver \ + --features openshell-cli/bundled-z3,openshell-server/bundled-z3 +``` + +The BlueField driver binary is: + +```text +target/release/openshell-driver-bluefield +``` + +## Install Layout + +Stage the gateway, CLI, BlueField driver, and VM runtime assets on the worker: + +```text +/opt/openshell/bin/openshell +/opt/openshell/bin/openshell-gateway +/opt/openshell/libexec/openshell/openshell-driver-vm +/opt/openshell/vm-runtime/vmlinux +/opt/openshell/vm-runtime/gvproxy +/opt/openshell/vm-runtime/libkrun.so +/opt/openshell/vm-runtime/umoci +``` + +Current integration note: the gateway VM driver path resolves a binary named +`openshell-driver-vm` from `[openshell.drivers.vm].driver_dir`. 
Until the +gateway has a first-class BlueField driver selector, install or symlink the +BlueField binary at that name: + +```shell +sudo install -d -m 0755 /opt/openshell/bin /opt/openshell/libexec/openshell +sudo install -m 0755 target/release/openshell /opt/openshell/bin/openshell +sudo install -m 0755 target/release/openshell-gateway /opt/openshell/bin/openshell-gateway +sudo install -m 0755 target/release/openshell-driver-bluefield \ + /opt/openshell/libexec/openshell/openshell-driver-vm +``` + +## Gateway Configuration + +Configure the gateway to use the VM driver path and point `driver_dir` at the +staged BlueField binary: + +```toml +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:18083" +disable_tls = true +compute_drivers = ["vm"] +default_image = "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" + +[openshell.drivers.vm] +grpc_endpoint = "http://10.0.110.4:18083" +driver_dir = "/opt/openshell/libexec/openshell" +state_dir = "/var/lib/openshell/bluefield-vm-driver" +default_image = "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" +bootstrap_image = "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" +``` + +`grpc_endpoint` must be reachable from inside the VM guest. For a split host +deployment, use the gateway's real host address, not `127.0.0.1`. 
+ +## Driver Configuration + +Set these in the gateway service environment so the gateway-spawned driver +inherits them: + +```shell +export OPENSHELL_BLUEFIELD=1 +export OPENSHELL_BLUEFIELD_HOST_PF=enp177s0f0np0 +export OPENSHELL_BLUEFIELD_EGRESS_CIDR=100.64.3.30/24 +export OPENSHELL_BLUEFIELD_EGRESS_GATEWAY=100.64.3.1 +export OPENSHELL_BLUEFIELD_KERNEL_IMAGE=/opt/openshell/vm-runtime/vmlinux +``` + +Reserve VF indexes that should not be allocated: + +```shell +export OPENSHELL_BLUEFIELD_RESERVED_VF_INDEXES=0,1,2,3,4 +``` + +Use one static address for a single-sandbox validation: + +```shell +export OPENSHELL_BLUEFIELD_EGRESS_CIDR=100.64.3.30/24 +``` + +Use a pool when more than one VF can run sandboxes: + +```shell +export OPENSHELL_BLUEFIELD_EGRESS_CIDR_POOL=100.64.3.30/24,100.64.3.31/24 +``` + +| Variable | Purpose | +|---|---| +| `OPENSHELL_BLUEFIELD_HOST_PF` | Host PF netdev or BDF used for VF discovery. | +| `OPENSHELL_BLUEFIELD_RESERVED_VF_INDEXES` | Comma-separated VF indexes excluded from allocation. | +| `OPENSHELL_BLUEFIELD_EGRESS_CIDR` | Static guest VF address for single-sandbox validation. | +| `OPENSHELL_BLUEFIELD_EGRESS_CIDR_POOL` | Per-VF guest address pool for multiple usable VFs. | +| `OPENSHELL_BLUEFIELD_EGRESS_GATEWAY` | Gateway reachable through the passed-through VF. | +| `OPENSHELL_BLUEFIELD_KERNEL_IMAGE` | BlueField-capable guest kernel image. | + +## Starting The Gateway + +Run the gateway with the config and environment above: + +```shell +sudo -E /opt/openshell/bin/openshell-gateway \ + --config /opt/openshell/etc/gateway.toml \ + --db-url 'sqlite:/var/lib/openshell/gateway/openshell.db?mode=rwc' +``` + +For a persistent deployment, put the same command and environment into a +systemd unit. The gateway owns the driver subprocess and passes the expected +gateway PID to the driver's Unix socket listener. 
+ +## Sandbox Lifecycle + +Register the gateway: + +```shell +/opt/openshell/bin/openshell gateway add \ + http://10.0.110.4:18083 \ + --local \ + --name worker3-bf + +/opt/openshell/bin/openshell --gateway worker3-bf status +``` + +Create a sandbox: + +```shell +/opt/openshell/bin/openshell --gateway worker3-bf sandbox create \ + --name bf-vf-egress \ + --from ghcr.io/nvidia/openshell-community/sandboxes/base:latest \ + -- sleep infinity +``` + +Inspect or connect: + +```shell +/opt/openshell/bin/openshell --gateway worker3-bf sandbox list +/opt/openshell/bin/openshell --gateway worker3-bf sandbox connect bf-vf-egress +``` + +Delete the sandbox when finished: + +```shell +/opt/openshell/bin/openshell --gateway worker3-bf sandbox delete bf-vf-egress +``` + +## Network Verification + +The sandbox process should see the normal OpenShell sandbox network path, not +the BlueField VF: + +```shell +/opt/openshell/bin/openshell --gateway worker3-bf sandbox exec \ + --name bf-vf-egress -- ip link +``` + +Verify internet egress through the policy path: + +```shell +/opt/openshell/bin/openshell --gateway worker3-bf sandbox exec \ + --name bf-vf-egress -- curl -I https://example.com +``` + +On the host, the selected VF should be bound to `vfio-pci` while the sandbox is +running and restored when the sandbox is deleted. + +## Worker3 Validation Values + +The worker3 validation used: + +```text +gateway: 10.0.110.4:18083 +host: worker3 / 10.0.110.23 +PF: enp177s0f0np0 / 0000:b1:00.0 +VF: enp177s0f0v29 / 0000:b1:04.1 +DPU representor: pf0vf29 +guest VF address: 100.64.3.30/24 +guest VF gateway: 100.64.3.1 +``` + +## Troubleshooting + +If the driver fails at startup, fix every preflight item it reports. Common +causes are missing `qemu-system-x86_64`, missing `/dev/kvm`, missing +`vfio-pci`, no isolated IOMMU group for the VF, or a missing BlueField guest +kernel. 
+ +If the gateway cannot find the driver, confirm: + +```shell +ls -l /opt/openshell/libexec/openshell/openshell-driver-vm +``` + +If sandbox creation starts but the VM does not boot, inspect the VM driver +state directory and console logs: + +```shell +sudo find /var/lib/openshell/bluefield-vm-driver -maxdepth 3 -type f | sort +``` + +If egress fails, check: + +- the selected VF is not allocated elsewhere, +- the guest egress CIDR is free, +- the guest egress gateway is reachable through the VF, +- the DPU-side representor and policy path are already provisioned, and +- the gateway endpoint is allowlisted by policy when it is on a private IP. diff --git a/crates/openshell-driver-bluefield/bf-vm/scripts/guest-egress-dropin.sh b/crates/openshell-driver-bluefield/bf-vm/scripts/guest-egress-dropin.sh index 1e891d8a5..4818ce694 100644 --- a/crates/openshell-driver-bluefield/bf-vm/scripts/guest-egress-dropin.sh +++ b/crates/openshell-driver-bluefield/bf-vm/scripts/guest-egress-dropin.sh @@ -67,13 +67,36 @@ set_optional_mac() { fi } +remove_inherited_default_routes() { + while ip route show default 2>/dev/null | grep -q '^default '; do + ip route del default + done +} + +verify_vf_default_route() { + local vf_nic="$1" + local route + + route="$(ip route get "${OPENSHELL_VM_DATA_GW}" 2>/dev/null || true)" + case "${route}" in + *" dev ${vf_nic} "*|*" dev ${vf_nic}") + return 0 + ;; + esac + + echo "openshell: bluefield VF egress route check failed: gateway ${OPENSHELL_VM_DATA_GW} route was ${route}" >&2 + return 1 +} + configure_static_ip() { local vf_nic="$1" ip link set "${vf_nic}" up ip addr flush dev "${vf_nic}" 2>/dev/null || true ip addr add "${OPENSHELL_VM_DATA_IP}" dev "${vf_nic}" + remove_inherited_default_routes ip route replace default via "${OPENSHELL_VM_DATA_GW}" dev "${vf_nic}" + verify_vf_default_route "${vf_nic}" } configure_resolv_conf() { diff --git a/crates/openshell-driver-bluefield/bf-vm/src/config.rs 
b/crates/openshell-driver-bluefield/bf-vm/src/config.rs index 7d6d5874a..e80de0dce 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/config.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/config.rs @@ -126,7 +126,7 @@ pub(crate) fn guest_egress_from_config( address_cidr, gateway: gateway.clone(), })), - (None, None) => Ok(None), + (None, None) if !config.enabled => Ok(None), _ => Err( "BlueField guest egress requires OPENSHELL_BLUEFIELD_EGRESS_GATEWAY with OPENSHELL_BLUEFIELD_EGRESS_CIDR or OPENSHELL_BLUEFIELD_EGRESS_CIDR_POOL" .to_string(), @@ -136,7 +136,7 @@ pub(crate) fn guest_egress_from_config( #[cfg(test)] mod tests { - use super::{BluefieldDriverConfig, reject_deferred_proxy}; + use super::{BluefieldDriverConfig, guest_egress_from_config, reject_deferred_proxy}; use bf_core::ProxyPlacement; #[test] @@ -164,4 +164,59 @@ mod tests { assert!(err.contains("explicit proxy URL is deferred")); } + + #[test] + fn guest_egress_from_config_accepts_cidr_and_gateway() { + let config = BluefieldDriverConfig { + enabled: true, + egress_cidr: Some("10.0.120.10/22".to_string()), + egress_gateway: Some("10.0.120.254".to_string()), + ..Default::default() + }; + + let egress = guest_egress_from_config(&config) + .unwrap() + .expect("egress config"); + + assert_eq!(egress.address_cidr, "10.0.120.10/22"); + assert_eq!(egress.gateway, "10.0.120.254"); + } + + #[test] + fn guest_egress_requires_cidr_and_gateway_when_bluefield_enabled() { + let config = BluefieldDriverConfig { + enabled: true, + ..Default::default() + }; + + let err = guest_egress_from_config(&config).unwrap_err(); + + assert!(err.contains("BlueField guest egress requires")); + } + + #[test] + fn guest_egress_requires_gateway_with_cidr() { + let config = BluefieldDriverConfig { + enabled: true, + egress_cidr: Some("10.0.120.10/22".to_string()), + ..Default::default() + }; + + let err = guest_egress_from_config(&config).unwrap_err(); + + assert!(err.contains("BlueField guest egress requires")); + } + + 
#[test] + fn guest_egress_requires_cidr_with_gateway() { + let config = BluefieldDriverConfig { + enabled: true, + egress_gateway: Some("10.0.120.254".to_string()), + ..Default::default() + }; + + let err = guest_egress_from_config(&config).unwrap_err(); + + assert!(err.contains("BlueField guest egress requires")); + } } diff --git a/crates/openshell-driver-bluefield/bf-vm/src/extension.rs b/crates/openshell-driver-bluefield/bf-vm/src/extension.rs index 5c9b54154..84c59085d 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/extension.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/extension.rs @@ -9,11 +9,13 @@ //! is layered on in later stages. use std::collections::HashMap; +use std::net::IpAddr; use std::path::Path; use std::sync::{Arc, Mutex}; use openshell_core::proto::compute::v1::DriverSandbox as Sandbox; use openshell_vfio::SysfsRoot; +use url::{Host, Url}; use crate::gpu::mac_from_sandbox_id; use crate::lifecycle::{ @@ -22,7 +24,7 @@ use crate::lifecycle::{ RestoreContext, }; -use bf_inventory::{VfPool, VfSlot}; +use bf_inventory::{FunctionPool, FunctionSlot}; use crate::config::{ bluefield_kernel_from_config, guest_egress_from_config, reject_deferred_proxy, @@ -68,9 +70,10 @@ fn qemu_kernel_from_config(config: &BluefieldDriverConfig) -> Result, kernel: Option, + openshell_endpoint: Option, readiness: Arc, binder: Arc, attachments: Mutex>, @@ -78,11 +81,12 @@ pub struct BluefieldExtension { impl BluefieldExtension { #[must_use] - pub fn new(pool: VfPool) -> Self { + pub fn new(pool: FunctionPool) -> Self { Self { pool, egress: None, kernel: None, + openshell_endpoint: None, readiness: Arc::new(SysfsHostReadiness::default()), binder: Arc::new(SysfsVfBinder::default()), attachments: Mutex::new(HashMap::new()), @@ -118,7 +122,7 @@ impl BluefieldExtension { }, )?; - let extension = Self::new(VfPool::new(slots)) + let extension = Self::new(FunctionPool::new(slots)) .with_kernel(kernel) 
.with_host_readiness(Arc::new(SysfsHostReadiness::new(sysfs.clone()))) .with_vf_binder(Arc::new(SysfsVfBinder::new(sysfs))); @@ -135,6 +139,7 @@ impl BluefieldExtension { } fn apply_runtime_options(mut self, config: &BluefieldDriverConfig) -> Result { + self.openshell_endpoint = config.openshell_endpoint.clone(); if let Some(egress) = guest_egress_from_config(config)? { self = self.with_guest_egress(egress); } @@ -147,6 +152,16 @@ impl BluefieldExtension { self } + /// Preserve a non-loopback gateway endpoint for split deployments where + /// the gateway runs somewhere other than the compute host. The generic VM + /// QEMU TAP path rewrites the endpoint to the TAP host IP; this override is + /// appended later and lets the guest route to the real gateway via TAP NAT. + #[must_use] + pub fn with_openshell_endpoint(mut self, endpoint: impl Into) -> Self { + self.openshell_endpoint = Some(endpoint.into()); + self + } + /// Select the BlueField guest kernel (image or profile) and the VF driver /// modules to load in guest-init. 
#[must_use] @@ -182,21 +197,25 @@ impl BluefieldExtension { .remove(sandbox_id) } - fn release_binding(&self, sandbox_state_dir: &Path, slot: &VfSlot) -> LifecycleResult<()> { + fn release_binding( + &self, + sandbox_state_dir: &Path, + slot: &FunctionSlot, + ) -> LifecycleResult<()> { self.binder.release_slot(slot).map_err(|err| { LifecycleError::new(format!("bluefield: release VF {}: {err}", slot.host_bdf)) })?; state::remove_bind_state(sandbox_state_dir) } - fn claim_slot(&self, sandbox_id: &str) -> LifecycleResult { + fn claim_slot(&self, sandbox_id: &str) -> LifecycleResult { let mut slot = self.pool.claim(sandbox_id).ok_or_else(|| { LifecycleError::resource_exhausted(format!( "bluefield: no free VF for sandbox {sandbox_id}" )) })?; - if slot.guest_mac.is_none() { - slot.guest_mac = Some(deterministic_vf_mac(sandbox_id)); + if slot.mac.is_none() { + slot.mac = Some(deterministic_vf_mac(sandbox_id)); } Ok(slot) } @@ -247,6 +266,12 @@ impl LifecycleExtension for BluefieldExtension { if let Some(kernel) = &self.kernel { kernel.apply(plan)?; } + if let Some(endpoint) = self.guest_routable_openshell_endpoint() { + plan.env.push(format!( + "{}={endpoint}", + openshell_core::sandbox_env::ENDPOINT + )); + } Ok(()) } @@ -355,9 +380,9 @@ impl LifecycleExtension for BluefieldExtension { bind_state.host_bdf, ctx.sandbox.id )) })?; - if slot.guest_mac.is_none() { - slot.guest_mac = bind_state - .guest_mac + if slot.mac.is_none() { + slot.mac = bind_state + .mac .clone() .or_else(|| Some(deterministic_vf_mac(&ctx.sandbox.id))); } @@ -373,6 +398,33 @@ impl LifecycleExtension for BluefieldExtension { } } +impl BluefieldExtension { + fn guest_routable_openshell_endpoint(&self) -> Option<&str> { + let endpoint = self.openshell_endpoint.as_deref()?; + if endpoint_host_is_guest_routable(endpoint) { + Some(endpoint) + } else { + None + } + } +} + +fn endpoint_host_is_guest_routable(endpoint: &str) -> bool { + let Ok(url) = Url::parse(endpoint) else { + return false; + }; + match 
url.host() { + Some(Host::Ipv4(ip)) => ip_is_guest_routable(IpAddr::V4(ip)), + Some(Host::Ipv6(ip)) => ip_is_guest_routable(IpAddr::V6(ip)), + Some(Host::Domain(host)) => !host.eq_ignore_ascii_case("localhost"), + None => false, + } +} + +fn ip_is_guest_routable(ip: IpAddr) -> bool { + !ip.is_loopback() && !ip.is_unspecified() +} + #[cfg(test)] mod tests { use super::*; @@ -403,13 +455,16 @@ mod tests { #[derive(Debug)] struct TestVfBinder; impl VfBinder for TestVfBinder { - fn bind_slot(&self, _slot: &VfSlot) -> Result, String> { + fn bind_slot(&self, _slot: &FunctionSlot) -> Result, String> { Ok(Box::new(TestVfBinding)) } - fn adopt_slot(&self, _slot: &VfSlot) -> Result, String> { + fn adopt_slot( + &self, + _slot: &FunctionSlot, + ) -> Result, String> { Ok(Box::new(TestVfBinding)) } - fn release_slot(&self, _slot: &VfSlot) -> Result<(), String> { + fn release_slot(&self, _slot: &FunctionSlot) -> Result<(), String> { Ok(()) } } @@ -461,7 +516,7 @@ mod tests { } } - fn ext(pool: VfPool) -> BluefieldExtension { + fn ext(pool: FunctionPool) -> BluefieldExtension { BluefieldExtension::new(pool) .with_host_readiness(Arc::new(AlwaysReady)) .with_vf_binder(Arc::new(TestVfBinder)) @@ -469,9 +524,11 @@ mod tests { #[tokio::test] async fn before_launch_claims_slot_records_bind_state_and_injects_egress_env() { - let extension = ext(VfPool::new([ - VfSlot::new("vf0", "0000:03:00.2").with_representor("pf0vf0") - ])) + let extension = ext(FunctionPool::new([FunctionSlot::new( + "vf0", + "0000:03:00.2", + ) + .with_representor("pf0vf0")])) .with_guest_egress(GuestEgress { address_cidr: "10.0.120.10/22".to_string(), gateway: "10.0.120.254".to_string(), @@ -489,6 +546,17 @@ mod tests { .iter() .any(|e| e == "OPENSHELL_VM_DATA_IP=10.0.120.10/22") ); + assert!( + plan.env + .iter() + .any(|e| e == "OPENSHELL_VM_DATA_EGRESS=external-vf") + ); + assert!( + plan.env + .iter() + .any(|e| e == "OPENSHELL_VM_DATA_GW=10.0.120.254") + ); + assert!(!plan.env.iter().any(|e| 
e.contains("veth"))); assert_eq!(passthrough_bdfs(&plan), vec!["0000:03:00.2"]); let bind_state = state::load_bind_state("sandbox-1", &state).unwrap(); @@ -503,7 +571,7 @@ mod tests { #[tokio::test] async fn before_launch_fails_closed_when_pool_exhausted() { - let extension = ext(VfPool::new([])); + let extension = ext(FunctionPool::new([])); let mut plan = sample_plan(); let err = extension .before_launch(&sandbox("sandbox-1"), &PathBuf::from("/tmp/s"), &mut plan) @@ -514,9 +582,12 @@ mod tests { #[tokio::test] async fn before_launch_fails_closed_when_host_not_vfio_ready() { - let extension = BluefieldExtension::new(VfPool::new([VfSlot::new("vf0", "0000:03:00.2")])) - .with_host_readiness(Arc::new(NeverReady)) - .with_vf_binder(Arc::new(TestVfBinder)); + let extension = BluefieldExtension::new(FunctionPool::new([FunctionSlot::new( + "vf0", + "0000:03:00.2", + )])) + .with_host_readiness(Arc::new(NeverReady)) + .with_vf_binder(Arc::new(TestVfBinder)); let mut plan = sample_plan(); let err = extension @@ -531,7 +602,10 @@ mod tests { #[tokio::test] async fn after_delete_releases_slot_and_state() { - let extension = ext(VfPool::new([VfSlot::new("vf0", "0000:03:00.2")])); + let extension = ext(FunctionPool::new([FunctionSlot::new( + "vf0", + "0000:03:00.2", + )])); let state = state_dir("delete"); let mut plan = sample_plan(); extension @@ -550,10 +624,13 @@ mod tests { #[tokio::test] async fn configure_launch_selects_kernel_and_declares_vf_passthrough() { - let extension = BluefieldExtension::new(VfPool::new([VfSlot::new("vf0", "0000:03:00.2")])) - .with_kernel(BluefieldKernel::from_image( - "/opt/openshell/kernels/bf-vmlinux", - )); + let extension = BluefieldExtension::new(FunctionPool::new([FunctionSlot::new( + "vf0", + "0000:03:00.2", + )])) + .with_kernel(BluefieldKernel::from_image( + "/opt/openshell/kernels/bf-vmlinux", + )); let mut plan = sample_plan(); extension @@ -580,14 +657,20 @@ mod tests { #[tokio::test] async fn 
configure_launch_sets_qemu_requirements_without_manual_internal_args() { - let extension = ext(VfPool::new([ - VfSlot::new("vf29", "0000:b1:04.1").with_vf_index(29), - ])) + let extension = ext(FunctionPool::new([FunctionSlot::new( + "vf29", + "0000:b1:04.1", + ) + .with_index(29)])) .with_kernel(BluefieldKernel::from_image("/runtime/vmlinux")); let mut plan = sample_plan(); extension - .configure_launch(&sandbox("sandbox-bluefield"), &PathBuf::from("/tmp/s"), &mut plan) + .configure_launch( + &sandbox("sandbox-bluefield"), + &PathBuf::from("/tmp/s"), + &mut plan, + ) .await .unwrap(); @@ -603,7 +686,10 @@ mod tests { plan.required_backend_features .contains(&BackendFeature::ExternalKernelImage) ); - assert_eq!(plan.kernel_image.as_deref(), Some(Path::new("/runtime/vmlinux"))); + assert_eq!( + plan.kernel_image.as_deref(), + Some(Path::new("/runtime/vmlinux")) + ); assert!(plan.tap_device.is_none()); assert!(plan.guest_ip.is_none()); assert!(plan.host_ip.is_none()); @@ -612,21 +698,78 @@ mod tests { assert!(plan.gateway_port.is_none()); } + #[tokio::test] + async fn configure_launch_preserves_non_loopback_gateway_endpoint() { + let extension = ext(FunctionPool::new([FunctionSlot::new( + "vf29", + "0000:b1:04.1", + ) + .with_index(29)])) + .with_openshell_endpoint("http://10.0.110.4:18091/"); + + let mut plan = sample_plan(); + extension + .configure_launch( + &sandbox("sandbox-bluefield"), + &PathBuf::from("/tmp/s"), + &mut plan, + ) + .await + .unwrap(); + + assert!( + plan.env + .iter() + .any(|env| env == "OPENSHELL_ENDPOINT=http://10.0.110.4:18091/") + ); + } + + #[tokio::test] + async fn configure_launch_does_not_preserve_loopback_gateway_endpoint() { + let extension = ext(FunctionPool::new([FunctionSlot::new( + "vf29", + "0000:b1:04.1", + ) + .with_index(29)])) + .with_openshell_endpoint("http://127.0.0.1:18091/"); + + let mut plan = sample_plan(); + extension + .configure_launch( + &sandbox("sandbox-bluefield"), + &PathBuf::from("/tmp/s"), + &mut plan, + ) + 
.await + .unwrap(); + + assert!( + !plan + .env + .iter() + .any(|env| env.starts_with("OPENSHELL_ENDPOINT=")) + ); + } + #[tokio::test] async fn before_restore_reclaims_and_records() { let state = state_dir("restore"); - let initial = ext(VfPool::new([ - VfSlot::new("vf0", "0000:03:00.2").with_representor("pf0vf0") - ])); + let initial = ext(FunctionPool::new([FunctionSlot::new( + "vf0", + "0000:03:00.2", + ) + .with_representor("pf0vf0")])); let mut plan = sample_plan(); initial .before_launch(&sandbox("sb-restore"), &state, &mut plan) .await .unwrap(); - let extension = ext(VfPool::new([ - VfSlot::new("vf0", "0000:03:00.2").with_representor("pf0vf0") - ])); + let extension = ext(FunctionPool::new([FunctionSlot::new( + "vf0", + "0000:03:00.2", + ) + .with_representor("pf0vf0")])); let ctx = RestoreContext { sandbox: sandbox("sb-restore"), state_dir: state.clone(), diff --git a/crates/openshell-driver-bluefield/bf-vm/src/guest_egress.rs b/crates/openshell-driver-bluefield/bf-vm/src/guest_egress.rs index 2cfbc426b..e008e27ef 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/guest_egress.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/guest_egress.rs @@ -6,7 +6,7 @@ use crate::lifecycle::GuestInitDropin; -use bf_inventory::VfSlot; +use bf_inventory::FunctionSlot; const ENV_EGRESS: &str = "OPENSHELL_VM_DATA_EGRESS"; const ENV_IP_MODE: &str = "OPENSHELL_VM_DATA_IP_MODE"; @@ -26,32 +26,32 @@ pub struct GuestEgress { impl GuestEgress { /// Build the `OPENSHELL_VM_DATA_*` env vars the guest-init drop-in reads. - /// A per-slot `guest_datapath_address` overrides `address_cidr`. + /// A per-slot `datapath_address` overrides `address_cidr`. #[must_use] - pub fn env(&self, slot: &VfSlot) -> Vec { + pub fn env(&self, slot: &FunctionSlot) -> Vec { GuestEgressEnv::for_slot(self, slot).to_env() } } -/// Concrete guest-init environment for one sandbox VF. +/// Concrete guest-init environment for one sandbox function. 
#[derive(Debug, Clone, PartialEq, Eq)] struct GuestEgressEnv { address_cidr: String, gateway: String, - guest_mac: Option, + mac: Option, } impl GuestEgressEnv { #[must_use] - fn for_slot(egress: &GuestEgress, slot: &VfSlot) -> Self { + fn for_slot(egress: &GuestEgress, slot: &FunctionSlot) -> Self { let address = slot - .guest_datapath_address + .datapath_address .as_deref() .unwrap_or(&egress.address_cidr); Self { address_cidr: address.to_string(), gateway: egress.gateway.clone(), - guest_mac: slot.guest_mac.clone(), + mac: slot.mac.clone(), } } @@ -63,7 +63,7 @@ impl GuestEgressEnv { format!("{}={}", ENV_IP, self.address_cidr), format!("{}={}", ENV_GATEWAY, self.gateway), ]; - if let Some(mac) = self.guest_mac.as_deref() { + if let Some(mac) = self.mac.as_deref() { env.push(format!("{ENV_MAC}={mac}")); } env @@ -85,7 +85,7 @@ pub fn dropin() -> GuestInitDropin { #[cfg(test)] mod tests { use super::GuestEgress; - use bf_inventory::VfSlot; + use bf_inventory::FunctionSlot; #[test] fn env_contract_uses_default_address_without_dns_or_mac() { @@ -93,7 +93,7 @@ mod tests { address_cidr: "10.0.120.10/22".to_string(), gateway: "10.0.120.254".to_string(), }; - let slot = VfSlot::new("vf0", "0000:03:00.2"); + let slot = FunctionSlot::new("vf0", "0000:03:00.2"); let env = egress.env(&slot); assert_eq!( env, @@ -112,8 +112,8 @@ mod tests { address_cidr: "10.0.120.10/22".to_string(), gateway: "10.0.120.254".to_string(), }; - let slot = VfSlot::new("vf0", "0000:03:00.2").with_guest_datapath_address("10.0.120.61/22"); - let slot = slot.with_guest_mac("02:bf:64:04:00:10"); + let slot = FunctionSlot::new("vf0", "0000:03:00.2").with_datapath_address("10.0.120.61/22"); + let slot = slot.with_mac("02:bf:64:04:00:10"); let env = egress.env(&slot); assert_eq!( env, @@ -138,10 +138,15 @@ mod tests { assert!(script.contains("find_bluefield_vf()")); assert!(script.contains("configure_static_ip()")); assert!(script.contains("configure_resolv_conf()")); + 
assert!(script.contains("remove_inherited_default_routes()")); + assert!(script.contains("verify_vf_default_route()")); assert!(script.contains("main \"$@\"")); assert!(script.contains("ip link set dev \"${vf_nic}\" address")); assert!(script.contains("ip addr add")); + assert!(script.contains("ip route del default")); assert!(script.contains("ip route replace default")); + assert!(script.contains("ip route get \"${OPENSHELL_VM_DATA_GW}\"")); + assert!(script.contains("dev ${vf_nic}")); assert!(!script.contains("OPENSHELL_VM_DATA_DNS")); assert!(script.contains("resolv.conf")); assert!(script.contains("DPU-side policy")); diff --git a/crates/openshell-driver-bluefield/bf-vm/src/host_pf.rs b/crates/openshell-driver-bluefield/bf-vm/src/host_pf.rs index 6ba0bb90b..72ae94048 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/host_pf.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/host_pf.rs @@ -108,7 +108,11 @@ fn is_bluefield_network_pf(path: &Path) -> bool { } fn has_any_virtfn(path: &Path) -> bool { - (0..256).any(|index| path.join(format!("virtfn{index}")).symlink_metadata().is_ok()) + (0..256).any(|index| { + path.join(format!("virtfn{index}")) + .symlink_metadata() + .is_ok() + }) } fn read_trimmed(path: PathBuf) -> Option { diff --git a/crates/openshell-driver-bluefield/bf-vm/src/slots.rs b/crates/openshell-driver-bluefield/bf-vm/src/slots.rs index 952973ed6..6e87fb3a5 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/slots.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/slots.rs @@ -5,7 +5,7 @@ use std::collections::HashSet; -use bf_inventory::{SysfsVfInventory, VfInventory, VfSlot}; +use bf_inventory::{FunctionInventory, FunctionSlot, SysfsVfInventory}; use openshell_vfio::SysfsRoot; use crate::config::BluefieldDriverConfig; @@ -39,7 +39,7 @@ pub(crate) fn prepare_host_slots( config: HostSlotConfig<'_>, sysfs: &SysfsRoot, host_pf: &str, -) -> Result, String> { +) -> Result, String> { let inventory = SysfsVfInventory::new(sysfs.clone(), 
[host_pf.to_string()]); let mut slots = inventory .discover() @@ -51,10 +51,13 @@ pub(crate) fn prepare_host_slots( Ok(slots) } -fn apply_slot_config(config: &HostSlotConfig<'_>, slots: &mut Vec) -> Result<(), String> { +fn apply_slot_config( + config: &HostSlotConfig<'_>, + slots: &mut Vec, +) -> Result<(), String> { if !config.reserved_vf_indexes.is_empty() { let reserved: HashSet = config.reserved_vf_indexes.iter().copied().collect(); - slots.retain(|slot| match slot.vf_index { + slots.retain(|slot| match slot.index { Some(index) => !reserved.contains(&index), None => true, }); @@ -73,7 +76,7 @@ fn apply_slot_config(config: &HostSlotConfig<'_>, slots: &mut Vec) -> Re )); } for (slot, address) in slots.iter_mut().zip(config.egress_cidr_pool.iter()) { - slot.guest_datapath_address = Some(address.clone()); + slot.datapath_address = Some(address.clone()); } } Ok(()) @@ -82,17 +85,17 @@ fn apply_slot_config(config: &HostSlotConfig<'_>, slots: &mut Vec) -> Re #[cfg(test)] mod tests { use super::{HostSlotConfig, apply_slot_config}; - use bf_inventory::VfSlot; + use bf_inventory::FunctionSlot; #[test] fn applies_reserved_indexes_pf_key_and_egress_pool() { let mut slots = vec![ - VfSlot::new("vf0", "0000:03:00.2") + FunctionSlot::new("vf0", "0000:03:00.2") .with_pf("p0") - .with_vf_index(0), - VfSlot::new("vf1", "0000:03:00.3") + .with_index(0), + FunctionSlot::new("vf1", "0000:03:00.3") .with_pf("p0") - .with_vf_index(1), + .with_index(1), ]; let egress_pool = vec!["10.0.120.61/22".to_string()]; let config = HostSlotConfig { @@ -106,17 +109,14 @@ mod tests { assert_eq!(slots.len(), 1); assert_eq!(slots[0].host_bdf, "0000:03:00.3"); assert_eq!(slots[0].pf.as_deref(), Some("bf-a")); - assert_eq!( - slots[0].guest_datapath_address.as_deref(), - Some("10.0.120.61/22") - ); + assert_eq!(slots[0].datapath_address.as_deref(), Some("10.0.120.61/22")); } #[test] fn rejects_egress_pool_shorter_than_usable_slots() { let mut slots = vec![ - VfSlot::new("vf0", 
"0000:03:00.2").with_vf_index(0), - VfSlot::new("vf1", "0000:03:00.3").with_vf_index(1), + FunctionSlot::new("vf0", "0000:03:00.2").with_index(0), + FunctionSlot::new("vf1", "0000:03:00.3").with_index(1), ]; let egress_pool = vec!["10.0.120.61/22".to_string()]; let config = HostSlotConfig { diff --git a/crates/openshell-driver-bluefield/bf-vm/src/state.rs b/crates/openshell-driver-bluefield/bf-vm/src/state.rs index eee5ab86f..057a87819 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/state.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/state.rs @@ -6,7 +6,7 @@ use std::path::{Path, PathBuf}; use std::time::{SystemTime, UNIX_EPOCH}; -use bf_inventory::VfSlot; +use bf_inventory::FunctionSlot; use serde::{Deserialize, Serialize}; use crate::lifecycle::{LifecycleError, LifecycleResult, extension_state_dir}; @@ -18,7 +18,7 @@ const PCI_BIND_STATE_FILE: &str = "pci-bind-state.json"; /// Per-sandbox bookkeeping for reverse-order teardown. #[derive(Debug, Clone)] pub(crate) struct AttachmentRecord { - pub(crate) slot: VfSlot, + pub(crate) slot: FunctionSlot, } /// Persisted record of the VF bound to a sandbox, for crash recovery. 
@@ -27,14 +27,14 @@ pub(crate) struct BluefieldPciBindState { pub(crate) host_bdf: String, pub(crate) sandbox_id: String, #[serde(default)] - pub(crate) guest_mac: Option, + pub(crate) mac: Option, pub(crate) bound_at_ms: u128, } pub(crate) fn persist_bind_state( sandbox_id: &str, sandbox_state_dir: &Path, - slot: &VfSlot, + slot: &FunctionSlot, ) -> LifecycleResult<()> { let path = bind_state_path(sandbox_state_dir)?; if let Some(parent) = path.parent() { @@ -48,14 +48,18 @@ pub(crate) fn persist_bind_state( let state = BluefieldPciBindState { host_bdf: slot.host_bdf.clone(), sandbox_id: sandbox_id.to_string(), - guest_mac: slot.guest_mac.clone(), + mac: slot.mac.clone(), bound_at_ms: now_millis(), }; let data = serde_json::to_string_pretty(&state) .map_err(|err| LifecycleError::new(format!("serialize bluefield bind state: {err}")))?; let tmp = path.with_extension("tmp"); - std::fs::write(&tmp, data) - .map_err(|err| LifecycleError::new(format!("write bluefield bind state {}: {err}", tmp.display())))?; + std::fs::write(&tmp, data).map_err(|err| { + LifecycleError::new(format!( + "write bluefield bind state {}: {err}", + tmp.display() + )) + })?; std::fs::rename(&tmp, &path).map_err(|err| { LifecycleError::new(format!( "commit bluefield bind state {}: {err}", @@ -69,10 +73,18 @@ pub(crate) fn load_bind_state( sandbox_state_dir: &Path, ) -> LifecycleResult { let path = bind_state_path(sandbox_state_dir)?; - let data = std::fs::read_to_string(&path) - .map_err(|err| LifecycleError::new(format!("read bluefield bind state {}: {err}", path.display())))?; - let state: BluefieldPciBindState = serde_json::from_str(&data) - .map_err(|err| LifecycleError::new(format!("parse bluefield bind state {}: {err}", path.display())))?; + let data = std::fs::read_to_string(&path).map_err(|err| { + LifecycleError::new(format!( + "read bluefield bind state {}: {err}", + path.display() + )) + })?; + let state: BluefieldPciBindState = serde_json::from_str(&data).map_err(|err| { + 
LifecycleError::new(format!( + "parse bluefield bind state {}: {err}", + path.display() + )) + })?; if state.sandbox_id != sandbox_id { return Err(LifecycleError::new(format!( "bluefield bind state sandbox mismatch: expected {sandbox_id}, got {}", diff --git a/crates/openshell-driver-bluefield/bf-vm/src/vf.rs b/crates/openshell-driver-bluefield/bf-vm/src/vf.rs index d2d200dff..0f8862821 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/vf.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/vf.rs @@ -12,7 +12,7 @@ use openshell_vfio::{ validate_pci_for_passthrough, }; -use bf_inventory::VfSlot; +use bf_inventory::FunctionSlot; /// Host capability probe for VF passthrough. Injectable so tests (and hosts /// without the device) don't need real hardware. Implementations check that @@ -65,9 +65,9 @@ impl VfBinding for RealVfBinding { } pub(crate) trait VfBinder: std::fmt::Debug + Send + Sync { - fn bind_slot(&self, slot: &VfSlot) -> Result, String>; - fn adopt_slot(&self, slot: &VfSlot) -> Result, String>; - fn release_slot(&self, slot: &VfSlot) -> Result<(), String>; + fn bind_slot(&self, slot: &FunctionSlot) -> Result, String>; + fn adopt_slot(&self, slot: &FunctionSlot) -> Result, String>; + fn release_slot(&self, slot: &FunctionSlot) -> Result<(), String>; } #[derive(Debug, Clone)] @@ -88,7 +88,7 @@ impl Default for SysfsVfBinder { } impl VfBinder for SysfsVfBinder { - fn bind_slot(&self, slot: &VfSlot) -> Result, String> { + fn bind_slot(&self, slot: &FunctionSlot) -> Result, String> { bind_slot(&self.sysfs, slot) .map(|guard| { let binding: Box = Box::new(RealVfBinding(guard)); @@ -97,7 +97,7 @@ impl VfBinder for SysfsVfBinder { .map_err(|err| err.to_string()) } - fn adopt_slot(&self, slot: &VfSlot) -> Result, String> { + fn adopt_slot(&self, slot: &FunctionSlot) -> Result, String> { adopt_slot(&self.sysfs, slot) .map(|guard| { let binding: Box = Box::new(RealVfBinding(guard)); @@ -106,7 +106,7 @@ impl VfBinder for SysfsVfBinder { .map_err(|err| 
err.to_string()) } - fn release_slot(&self, slot: &VfSlot) -> Result<(), String> { + fn release_slot(&self, slot: &FunctionSlot) -> Result<(), String> { release_slot(&self.sysfs, slot).map_err(|err| err.to_string()) } } @@ -117,7 +117,7 @@ impl VfBinder for SysfsVfBinder { /// and to persist the binding for restart reconciliation. pub fn bind_slot( sysfs: &SysfsRoot, - slot: &VfSlot, + slot: &FunctionSlot, ) -> Result { prepare_pci_for_passthrough(sysfs, &slot.host_bdf) } @@ -126,12 +126,15 @@ pub fn bind_slot( /// restart, without rebinding or mutating sysfs. pub fn adopt_slot( sysfs: &SysfsRoot, - slot: &VfSlot, + slot: &FunctionSlot, ) -> Result { PciBindGuard::adopt(sysfs, &slot.host_bdf) } /// Restore a VF slot's device to its host driver at teardown time. -pub fn release_slot(sysfs: &SysfsRoot, slot: &VfSlot) -> Result<(), openshell_vfio::VfioError> { +pub fn release_slot( + sysfs: &SysfsRoot, + slot: &FunctionSlot, +) -> Result<(), openshell_vfio::VfioError> { release_pci_from_passthrough(sysfs, &slot.host_bdf) } diff --git a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh index 2166e0617..86044e804 100644 --- a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh +++ b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh @@ -603,9 +603,16 @@ run_openshell_init_dropins() { # rather than run a half-configured sandbox. Aborting exits this init # non-zero; the VM helper then exits and the driver runs # lifecycle-extension cleanup, so a failed init does not leak resources. - # The loop runs in the main shell (process substitution, not a pipe), so - # `exit` here terminates init as intended. - local name dropin rc + # The loop reads a regular file instead of process substitution because + # this init runs before /dev/fd is guaranteed to exist. It still runs in + # the main shell, so `exit` here terminates init as intended. 
+ local name dropin rc sorted_manifest + sorted_manifest="$(root_path "/tmp/openshell-init-dropins.$$.manifest")" + if ! LC_ALL=C sort -u "$manifest" > "$sorted_manifest"; then + ts "FATAL: could not sort OpenShell VM init drop-in manifest" + exit 1 + fi + while IFS= read -r name; do [ -n "$name" ] || continue # Manifest entries are bare file names that the driver already @@ -641,7 +648,8 @@ run_openshell_init_dropins() { ts "FATAL: OpenShell VM init drop-in ${name} failed with exit code ${rc}" exit 1 fi - done < <(LC_ALL=C sort -u "$manifest") + done < "$sorted_manifest" + rm -f "$sorted_manifest" } run_post_overlay_setup() { diff --git a/crates/openshell-driver-vm/src/rootfs.rs b/crates/openshell-driver-vm/src/rootfs.rs index d59e7b4b9..1965fdcb2 100644 --- a/crates/openshell-driver-vm/src/rootfs.rs +++ b/crates/openshell-driver-vm/src/rootfs.rs @@ -970,6 +970,16 @@ mod tests { let _ = fs::remove_dir_all(&dir); } + #[test] + fn guest_init_dropin_manifest_loop_does_not_require_dev_fd() { + let script = include_str!("../scripts/openshell-vm-sandbox-init.sh"); + + assert!( + !script.contains("< <("), + "guest init runs before /dev/fd is guaranteed to exist" + ); + } + #[test] fn prepare_sandbox_rootfs_preserves_image_workdir_contents_in_rootfs() { let dir = unique_temp_dir(); From 0bb74a64d6eebc6b07ba2fbc296dd19e0337675b Mon Sep 17 00:00:00 2001 From: Patrick Riel Date: Sat, 13 Jun 2026 21:39:38 +0000 Subject: [PATCH 14/14] refactor(bluefield): centralize env var names in bf_core::env Move the OPENSHELL_BLUEFIELD_* and OPENSHELL_VM_DATA_* environment variable names into a new bf_core::env module and reference those constants from the bf-vm clap attributes, the guest-egress drop-in, and the configuration error messages instead of repeating string literals. Also add SPDX license headers to the bf-core sources and to the bf-core, bf-driver, bf-inventory, and bf-vm manifests, and replace host-specific names in the bf-vm README with generic ones. No functional change intended. Signed-off-by: Patrick Riel --- .../bf-core/Cargo.toml | 3 + .../bf-core/src/assignment.rs | 3 + .../bf-core/src/claim.rs | 3 + .../bf-core/src/env.rs | 85 +++++++++++++++++++ .../bf-core/src/error.rs | 3 + .../bf-core/src/handles.rs | 3 + .../bf-core/src/lib.rs | 4 + .../bf-core/src/lifecycle.rs | 3 + .../bf-core/src/role.rs | 3 + .../bf-core/src/runtime.rs | 3 + .../bf-core/src/state.rs | 3 + .../bf-driver/Cargo.toml | 3 + .../bf-inventory/Cargo.toml | 3 + .../bf-vm/Cargo.toml | 3 + .../bf-vm/README.md | 22 ++--
.../bf-vm/src/cli.rs | 40 ++++----- .../bf-vm/src/config.rs | 10 ++- .../bf-vm/src/guest_egress.rs | 11 +-- .../bf-vm/src/host_pf.rs | 13 +-- .../bf-vm/src/qemu_kernel_resolver.rs | 5 +- 20 files changed, 178 insertions(+), 48 deletions(-) create mode 100644 crates/openshell-driver-bluefield/bf-core/src/env.rs diff --git a/crates/openshell-driver-bluefield/bf-core/Cargo.toml b/crates/openshell-driver-bluefield/bf-core/Cargo.toml index d901359f6..747885e10 100644 --- a/crates/openshell-driver-bluefield/bf-core/Cargo.toml +++ b/crates/openshell-driver-bluefield/bf-core/Cargo.toml @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + [package] name = "bf-core" description = "Shared contracts for the OpenShell BlueField compute driver" diff --git a/crates/openshell-driver-bluefield/bf-core/src/assignment.rs b/crates/openshell-driver-bluefield/bf-core/src/assignment.rs index 7f1ce9ffb..914c3a4a6 100644 --- a/crates/openshell-driver-bluefield/bf-core/src/assignment.rs +++ b/crates/openshell-driver-bluefield/bf-core/src/assignment.rs @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + //! The network-function assignment the control-plane leader hands to a compute //! node. //! diff --git a/crates/openshell-driver-bluefield/bf-core/src/claim.rs b/crates/openshell-driver-bluefield/bf-core/src/claim.rs index fca06a0eb..1cca69d2d 100644 --- a/crates/openshell-driver-bluefield/bf-core/src/claim.rs +++ b/crates/openshell-driver-bluefield/bf-core/src/claim.rs @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + //! Runtime-neutral BlueField resource claims. 
use serde::{Deserialize, Serialize}; diff --git a/crates/openshell-driver-bluefield/bf-core/src/env.rs b/crates/openshell-driver-bluefield/bf-core/src/env.rs new file mode 100644 index 000000000..ecfa289d9 --- /dev/null +++ b/crates/openshell-driver-bluefield/bf-core/src/env.rs @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Environment variable names that make up the BlueField driver's external +//! configuration contract. +//! +//! Centralizing the names here gives the host-side driver, the guest-init +//! path, and any future runtime adapters (containers, Kubernetes) a single +//! source of truth. The values are `&'static str` so they can be referenced +//! both from `clap` `env = ...` attributes and from plain `std::env` lookups. + +/// Master switch that enables the BlueField driver. +pub const BLUEFIELD: &str = "OPENSHELL_BLUEFIELD"; + +/// Deployment role: `all-in-one`, `control-plane`, or `compute-node`. +pub const BLUEFIELD_ROLE: &str = "OPENSHELL_BLUEFIELD_ROLE"; + +/// gRPC endpoint of the control-plane controller (compute-node role). +pub const BLUEFIELD_CONTROLLER_ENDPOINT: &str = "OPENSHELL_BLUEFIELD_CONTROLLER_ENDPOINT"; + +/// Directory holding the mutual-TLS material for the controller channel. +pub const BLUEFIELD_TLS_DIR: &str = "OPENSHELL_BLUEFIELD_TLS_DIR"; + +/// Expected TLS server name for the controller certificate. +pub const BLUEFIELD_TLS_DOMAIN: &str = "OPENSHELL_BLUEFIELD_TLS_DOMAIN"; + +/// Host physical function (netdev name or PCI BDF) backing the VFs. +pub const BLUEFIELD_HOST_PF: &str = "OPENSHELL_BLUEFIELD_HOST_PF"; + +/// Comma-separated VF indexes reserved from the assignable pool. +pub const BLUEFIELD_RESERVED_VF_INDEXES: &str = "OPENSHELL_BLUEFIELD_RESERVED_VF_INDEXES"; + +/// Identifier of the PF used when computing per-function keys. 
+pub const BLUEFIELD_PF_KEY: &str = "OPENSHELL_BLUEFIELD_PF_KEY"; + +/// Source NAT IP applied on the DPU for sandbox egress. +pub const BLUEFIELD_SNAT_IP: &str = "OPENSHELL_BLUEFIELD_SNAT_IP"; + +/// Uplink port on the DPU that carries sandbox egress. +pub const BLUEFIELD_UPLINK_PORT: &str = "OPENSHELL_BLUEFIELD_UPLINK_PORT"; + +/// Path to the BlueField guest kernel image. +pub const BLUEFIELD_KERNEL_IMAGE: &str = "OPENSHELL_BLUEFIELD_KERNEL_IMAGE"; + +/// Expected version string for the guest kernel image. +pub const BLUEFIELD_KERNEL_VERSION: &str = "OPENSHELL_BLUEFIELD_KERNEL_VERSION"; + +/// Expected SHA-256 of the guest kernel image. +pub const BLUEFIELD_KERNEL_SHA256: &str = "OPENSHELL_BLUEFIELD_KERNEL_SHA256"; + +/// Comma-separated guest kernel modules to load. +pub const BLUEFIELD_KERNEL_MODULES: &str = "OPENSHELL_BLUEFIELD_KERNEL_MODULES"; + +/// Egress CIDR assigned to a single sandbox function. +pub const BLUEFIELD_EGRESS_CIDR: &str = "OPENSHELL_BLUEFIELD_EGRESS_CIDR"; + +/// Comma-separated pool of egress CIDRs handed out per function. +pub const BLUEFIELD_EGRESS_CIDR_POOL: &str = "OPENSHELL_BLUEFIELD_EGRESS_CIDR_POOL"; + +/// Default gateway for sandbox egress traffic. +pub const BLUEFIELD_EGRESS_GATEWAY: &str = "OPENSHELL_BLUEFIELD_EGRESS_GATEWAY"; + +/// Comma-separated DNS resolvers advertised to the sandbox. +pub const BLUEFIELD_EGRESS_DNS: &str = "OPENSHELL_BLUEFIELD_EGRESS_DNS"; + +/// Proxy placement: `none` or `dpu`. +pub const BLUEFIELD_PROXY_PLACEMENT: &str = "OPENSHELL_BLUEFIELD_PROXY_PLACEMENT"; + +/// Explicit proxy URL injected into the sandbox when proxying is enabled. +pub const BLUEFIELD_EXPLICIT_PROXY_URL: &str = "OPENSHELL_BLUEFIELD_EXPLICIT_PROXY_URL"; + +/// Guest data-path egress mode (e.g. `external-vf`). +pub const VM_DATA_EGRESS: &str = "OPENSHELL_VM_DATA_EGRESS"; + +/// Guest data-path IP assignment mode (e.g. `static`). 
+pub const VM_DATA_IP_MODE: &str = "OPENSHELL_VM_DATA_IP_MODE"; + +/// Guest data-path interface address in CIDR notation. +pub const VM_DATA_IP: &str = "OPENSHELL_VM_DATA_IP"; + +/// Guest data-path default gateway. +pub const VM_DATA_GW: &str = "OPENSHELL_VM_DATA_GW"; + +/// Guest data-path interface MAC address. +pub const VM_DATA_MAC: &str = "OPENSHELL_VM_DATA_MAC"; diff --git a/crates/openshell-driver-bluefield/bf-core/src/error.rs b/crates/openshell-driver-bluefield/bf-core/src/error.rs index 09a71c2f8..88bca56bd 100644 --- a/crates/openshell-driver-bluefield/bf-core/src/error.rs +++ b/crates/openshell-driver-bluefield/bf-core/src/error.rs @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + //! Error surface shared by BlueField driver crates. pub type Result = std::result::Result; diff --git a/crates/openshell-driver-bluefield/bf-core/src/handles.rs b/crates/openshell-driver-bluefield/bf-core/src/handles.rs index 2306f8dcf..4ef5774a7 100644 --- a/crates/openshell-driver-bluefield/bf-core/src/handles.rs +++ b/crates/openshell-driver-bluefield/bf-core/src/handles.rs @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + //! Shared BlueField handles that cross the driver, host, and DPU seam. use serde::{Deserialize, Serialize}; diff --git a/crates/openshell-driver-bluefield/bf-core/src/lib.rs b/crates/openshell-driver-bluefield/bf-core/src/lib.rs index a29a1ae3b..80b094a11 100644 --- a/crates/openshell-driver-bluefield/bf-core/src/lib.rs +++ b/crates/openshell-driver-bluefield/bf-core/src/lib.rs @@ -1,7 +1,11 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + //! Shared contracts for the BlueField compute driver. 
pub mod assignment; pub mod claim; +pub mod env; pub mod error; pub mod handles; pub mod lifecycle; diff --git a/crates/openshell-driver-bluefield/bf-core/src/lifecycle.rs b/crates/openshell-driver-bluefield/bf-core/src/lifecycle.rs index 8a46dbf1d..e0715f0bb 100644 --- a/crates/openshell-driver-bluefield/bf-core/src/lifecycle.rs +++ b/crates/openshell-driver-bluefield/bf-core/src/lifecycle.rs @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + //! BlueField driver lifecycle extension framework. //! //! This mirrors the in-tree VM lifecycle extension hook chain, but the hooks diff --git a/crates/openshell-driver-bluefield/bf-core/src/role.rs b/crates/openshell-driver-bluefield/bf-core/src/role.rs index 6a730a3f1..fc2a4bc38 100644 --- a/crates/openshell-driver-bluefield/bf-core/src/role.rs +++ b/crates/openshell-driver-bluefield/bf-core/src/role.rs @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + //! Deployment role for the BlueField compute driver. //! //! A single driver binary runs in one of three roles, selected at startup. diff --git a/crates/openshell-driver-bluefield/bf-core/src/runtime.rs b/crates/openshell-driver-bluefield/bf-core/src/runtime.rs index f9d0fdd5f..c9e2d8fdb 100644 --- a/crates/openshell-driver-bluefield/bf-core/src/runtime.rs +++ b/crates/openshell-driver-bluefield/bf-core/src/runtime.rs @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + //! Runtime adapter contract. 
use async_trait::async_trait; diff --git a/crates/openshell-driver-bluefield/bf-core/src/state.rs b/crates/openshell-driver-bluefield/bf-core/src/state.rs index 548813c6e..3cee7d530 100644 --- a/crates/openshell-driver-bluefield/bf-core/src/state.rs +++ b/crates/openshell-driver-bluefield/bf-core/src/state.rs @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + //! Persisted BlueField driver state. use serde::{Deserialize, Serialize}; diff --git a/crates/openshell-driver-bluefield/bf-driver/Cargo.toml b/crates/openshell-driver-bluefield/bf-driver/Cargo.toml index 6eb801772..00f8ef3dd 100644 --- a/crates/openshell-driver-bluefield/bf-driver/Cargo.toml +++ b/crates/openshell-driver-bluefield/bf-driver/Cargo.toml @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + [package] name = "bf-driver" description = "External BlueField compute driver for OpenShell" diff --git a/crates/openshell-driver-bluefield/bf-inventory/Cargo.toml b/crates/openshell-driver-bluefield/bf-inventory/Cargo.toml index cce233534..eadecc0eb 100644 --- a/crates/openshell-driver-bluefield/bf-inventory/Cargo.toml +++ b/crates/openshell-driver-bluefield/bf-inventory/Cargo.toml @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + [package] name = "bf-inventory" description = "BlueField function inventory, discovery, and allocation" diff --git a/crates/openshell-driver-bluefield/bf-vm/Cargo.toml b/crates/openshell-driver-bluefield/bf-vm/Cargo.toml index 82671a3fd..11e8b9625 100644 --- a/crates/openshell-driver-bluefield/bf-vm/Cargo.toml +++ b/crates/openshell-driver-bluefield/bf-vm/Cargo.toml @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + [package] name = "bf-vm" description = "Bare-metal VM runtime adapter for the OpenShell BlueField driver" diff --git a/crates/openshell-driver-bluefield/bf-vm/README.md b/crates/openshell-driver-bluefield/bf-vm/README.md index a4554e926..f65b72aa0 100644 --- a/crates/openshell-driver-bluefield/bf-vm/README.md +++ b/crates/openshell-driver-bluefield/bf-vm/README.md @@ -216,15 +216,15 @@ Register the gateway: /opt/openshell/bin/openshell gateway add \ http://10.0.110.4:18083 \ --local \ - --name worker3-bf + --name bf-gateway -/opt/openshell/bin/openshell --gateway worker3-bf status +/opt/openshell/bin/openshell --gateway bf-gateway status ``` Create a sandbox: ```shell -/opt/openshell/bin/openshell --gateway worker3-bf sandbox create \ +/opt/openshell/bin/openshell --gateway bf-gateway sandbox create \ --name bf-vf-egress \ --from ghcr.io/nvidia/openshell-community/sandboxes/base:latest \ -- sleep infinity @@ -233,14 +233,14 @@ Create a sandbox: Inspect or connect: ```shell -/opt/openshell/bin/openshell --gateway worker3-bf sandbox list -/opt/openshell/bin/openshell --gateway worker3-bf sandbox connect bf-vf-egress +/opt/openshell/bin/openshell --gateway bf-gateway sandbox list +/opt/openshell/bin/openshell --gateway bf-gateway sandbox connect bf-vf-egress ``` Delete the sandbox when finished: ```shell -/opt/openshell/bin/openshell --gateway worker3-bf sandbox delete bf-vf-egress 
+/opt/openshell/bin/openshell --gateway bf-gateway sandbox delete bf-vf-egress ``` ## Network Verification @@ -249,27 +249,27 @@ The sandbox process should see the normal OpenShell sandbox network path, not the BlueField VF: ```shell -/opt/openshell/bin/openshell --gateway worker3-bf sandbox exec \ +/opt/openshell/bin/openshell --gateway bf-gateway sandbox exec \ --name bf-vf-egress -- ip link ``` Verify internet egress through the policy path: ```shell -/opt/openshell/bin/openshell --gateway worker3-bf sandbox exec \ +/opt/openshell/bin/openshell --gateway bf-gateway sandbox exec \ --name bf-vf-egress -- curl -I https://example.com ``` On the host, the selected VF should be bound to `vfio-pci` while the sandbox is running and restored when the sandbox is deleted. -## Worker3 Validation Values +## Validation Values -The worker3 validation used: +A representative validation run used: ```text gateway: 10.0.110.4:18083 -host: worker3 / 10.0.110.23 +host: compute-host / 10.0.110.23 PF: enp177s0f0np0 / 0000:b1:00.0 VF: enp177s0f0v29 / 0000:b1:04.1 DPU representor: pf0vf29 diff --git a/crates/openshell-driver-bluefield/bf-vm/src/cli.rs b/crates/openshell-driver-bluefield/bf-vm/src/cli.rs index 945bc9253..748e5f035 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/cli.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/cli.rs @@ -17,7 +17,7 @@ use super::{BluefieldDriverConfig, ProxyPlacement}; pub struct BluefieldDriverArgs { #[arg( long = "bluefield", - env = "OPENSHELL_BLUEFIELD", + env = bf_core::env::BLUEFIELD, default_value_t = false )] pub enabled: bool, @@ -27,110 +27,110 @@ pub struct BluefieldDriverArgs { /// topology. 
#[arg( long = "bluefield-role", - env = "OPENSHELL_BLUEFIELD_ROLE", + env = bf_core::env::BLUEFIELD_ROLE, default_value = "all-in-one" )] pub role: String, #[arg( long = "bluefield-controller-endpoint", - env = "OPENSHELL_BLUEFIELD_CONTROLLER_ENDPOINT" + env = bf_core::env::BLUEFIELD_CONTROLLER_ENDPOINT )] pub controller_endpoint: Option, - #[arg(long = "bluefield-tls-dir", env = "OPENSHELL_BLUEFIELD_TLS_DIR")] + #[arg(long = "bluefield-tls-dir", env = bf_core::env::BLUEFIELD_TLS_DIR)] pub tls_dir: Option, #[arg( long = "bluefield-tls-domain", - env = "OPENSHELL_BLUEFIELD_TLS_DOMAIN", + env = bf_core::env::BLUEFIELD_TLS_DOMAIN, default_value = "bluefield-controller" )] pub tls_domain: String, - #[arg(long = "bluefield-host-pf", env = "OPENSHELL_BLUEFIELD_HOST_PF")] + #[arg(long = "bluefield-host-pf", env = bf_core::env::BLUEFIELD_HOST_PF)] pub host_pf: Option, #[arg( long = "bluefield-reserved-vf-index", - env = "OPENSHELL_BLUEFIELD_RESERVED_VF_INDEXES", + env = bf_core::env::BLUEFIELD_RESERVED_VF_INDEXES, value_delimiter = ',' )] pub reserved_vf_indexes: Vec, - #[arg(long = "bluefield-pf-key", env = "OPENSHELL_BLUEFIELD_PF_KEY")] + #[arg(long = "bluefield-pf-key", env = bf_core::env::BLUEFIELD_PF_KEY)] pub pf_key: Option, - #[arg(long = "bluefield-snat-ip", env = "OPENSHELL_BLUEFIELD_SNAT_IP")] + #[arg(long = "bluefield-snat-ip", env = bf_core::env::BLUEFIELD_SNAT_IP)] pub snat_ip: Option, #[arg( long = "bluefield-uplink-port", - env = "OPENSHELL_BLUEFIELD_UPLINK_PORT" + env = bf_core::env::BLUEFIELD_UPLINK_PORT )] pub uplink_port: Option, #[arg( long = "bluefield-kernel-image", - env = "OPENSHELL_BLUEFIELD_KERNEL_IMAGE" + env = bf_core::env::BLUEFIELD_KERNEL_IMAGE )] pub kernel_image: Option, #[arg( long = "bluefield-kernel-version", - env = "OPENSHELL_BLUEFIELD_KERNEL_VERSION" + env = bf_core::env::BLUEFIELD_KERNEL_VERSION )] pub kernel_version: Option, #[arg( long = "bluefield-kernel-sha256", - env = "OPENSHELL_BLUEFIELD_KERNEL_SHA256" + env = 
bf_core::env::BLUEFIELD_KERNEL_SHA256 )] pub kernel_sha256: Option, #[arg( long = "bluefield-kernel-modules", - env = "OPENSHELL_BLUEFIELD_KERNEL_MODULES", + env = bf_core::env::BLUEFIELD_KERNEL_MODULES, value_delimiter = ',' )] pub kernel_modules: Vec, #[arg( long = "bluefield-egress-cidr", - env = "OPENSHELL_BLUEFIELD_EGRESS_CIDR" + env = bf_core::env::BLUEFIELD_EGRESS_CIDR )] pub egress_cidr: Option, #[arg( long = "bluefield-egress-cidr-pool", - env = "OPENSHELL_BLUEFIELD_EGRESS_CIDR_POOL", + env = bf_core::env::BLUEFIELD_EGRESS_CIDR_POOL, value_delimiter = ',' )] pub egress_cidr_pool: Vec, #[arg( long = "bluefield-egress-gateway", - env = "OPENSHELL_BLUEFIELD_EGRESS_GATEWAY" + env = bf_core::env::BLUEFIELD_EGRESS_GATEWAY )] pub egress_gateway: Option, #[arg( long = "bluefield-egress-dns", - env = "OPENSHELL_BLUEFIELD_EGRESS_DNS", + env = bf_core::env::BLUEFIELD_EGRESS_DNS, value_delimiter = ',' )] pub egress_dns: Vec, #[arg( long = "bluefield-proxy-placement", - env = "OPENSHELL_BLUEFIELD_PROXY_PLACEMENT", + env = bf_core::env::BLUEFIELD_PROXY_PLACEMENT, default_value = "none" )] pub proxy_placement: String, #[arg( long = "bluefield-explicit-proxy-url", - env = "OPENSHELL_BLUEFIELD_EXPLICIT_PROXY_URL" + env = bf_core::env::BLUEFIELD_EXPLICIT_PROXY_URL )] pub explicit_proxy_url: Option, } diff --git a/crates/openshell-driver-bluefield/bf-vm/src/config.rs b/crates/openshell-driver-bluefield/bf-vm/src/config.rs index e80de0dce..771ae297e 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/config.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/config.rs @@ -127,10 +127,12 @@ pub(crate) fn guest_egress_from_config( gateway: gateway.clone(), })), (None, None) if !config.enabled => Ok(None), - _ => Err( - "BlueField guest egress requires OPENSHELL_BLUEFIELD_EGRESS_GATEWAY with OPENSHELL_BLUEFIELD_EGRESS_CIDR or OPENSHELL_BLUEFIELD_EGRESS_CIDR_POOL" - .to_string(), - ), + _ => Err(format!( + "BlueField guest egress requires {} with {} or {}", + 
bf_core::env::BLUEFIELD_EGRESS_GATEWAY, + bf_core::env::BLUEFIELD_EGRESS_CIDR, + bf_core::env::BLUEFIELD_EGRESS_CIDR_POOL + )), } } diff --git a/crates/openshell-driver-bluefield/bf-vm/src/guest_egress.rs b/crates/openshell-driver-bluefield/bf-vm/src/guest_egress.rs index e008e27ef..c5c924f62 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/guest_egress.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/guest_egress.rs @@ -6,13 +6,14 @@ use crate::lifecycle::GuestInitDropin; +use bf_core::env; use bf_inventory::FunctionSlot; -const ENV_EGRESS: &str = "OPENSHELL_VM_DATA_EGRESS"; -const ENV_IP_MODE: &str = "OPENSHELL_VM_DATA_IP_MODE"; -const ENV_IP: &str = "OPENSHELL_VM_DATA_IP"; -const ENV_GATEWAY: &str = "OPENSHELL_VM_DATA_GW"; -const ENV_MAC: &str = "OPENSHELL_VM_DATA_MAC"; +const ENV_EGRESS: &str = env::VM_DATA_EGRESS; +const ENV_IP_MODE: &str = env::VM_DATA_IP_MODE; +const ENV_IP: &str = env::VM_DATA_IP; +const ENV_GATEWAY: &str = env::VM_DATA_GW; +const ENV_MAC: &str = env::VM_DATA_MAC; const EGRESS_EXTERNAL_VF: &str = "external-vf"; const IP_MODE_STATIC: &str = "static"; const DROPIN_SCRIPT: &[u8] = include_bytes!("../scripts/guest-egress-dropin.sh"); diff --git a/crates/openshell-driver-bluefield/bf-vm/src/host_pf.rs b/crates/openshell-driver-bluefield/bf-vm/src/host_pf.rs index 72ae94048..a82c26610 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/host_pf.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/host_pf.rs @@ -62,17 +62,18 @@ fn auto_discover_host_pf(sysfs_root: &Path) -> Result { let mut candidates = discover_bluefield_pf_candidates(sysfs_root)?; candidates.sort(); match candidates.len() { - 0 => Err( - "no BlueField-capable PF with configured SR-IOV VFs found; set OPENSHELL_BLUEFIELD_HOST_PF to the PF netdev or PCI BDF" - .to_string(), - ), + 0 => Err(format!( + "no BlueField-capable PF with configured SR-IOV VFs found; set {} to the PF netdev or PCI BDF", + bf_core::env::BLUEFIELD_HOST_PF + )), 1 => Ok(ResolvedHostPf { bdf: 
candidates.remove(0), source: HostPfSource::AutoDiscovered, }), _ => Err(format!( - "multiple BlueField-capable PFs found: {}; set OPENSHELL_BLUEFIELD_HOST_PF to one PF netdev or PCI BDF", - candidates.join(", ") + "multiple BlueField-capable PFs found: {}; set {} to one PF netdev or PCI BDF", + candidates.join(", "), + bf_core::env::BLUEFIELD_HOST_PF )), } } diff --git a/crates/openshell-driver-bluefield/bf-vm/src/qemu_kernel_resolver.rs b/crates/openshell-driver-bluefield/bf-vm/src/qemu_kernel_resolver.rs index b6a44e534..ad413c966 100644 --- a/crates/openshell-driver-bluefield/bf-vm/src/qemu_kernel_resolver.rs +++ b/crates/openshell-driver-bluefield/bf-vm/src/qemu_kernel_resolver.rs @@ -27,12 +27,13 @@ pub(crate) fn resolve_qemu_kernel_image( } Err(format!( - "BlueField QEMU kernel image not found; searched: {}. Set OPENSHELL_BLUEFIELD_KERNEL_IMAGE or place vmlinux in the OpenShell vm-runtime directory. Docker and Kubernetes BlueField runtimes do not use this QEMU kernel path.", + "BlueField QEMU kernel image not found; searched: {}. Set {} or place vmlinux in the OpenShell vm-runtime directory. Docker and Kubernetes BlueField runtimes do not use this QEMU kernel path.", runtime_roots .iter() .map(|path| path.display().to_string()) .collect::>() - .join(", ") + .join(", "), + bf_core::env::BLUEFIELD_KERNEL_IMAGE )) }