diff --git a/docs/cookbook/intro.md b/docs/cookbook/intro.md index a0030d35..04500feb 100644 --- a/docs/cookbook/intro.md +++ b/docs/cookbook/intro.md @@ -13,19 +13,16 @@ of the aforementioned systems. The absolute best way to understand `Hyper` and how it works is to play around with it. -## Getting Started +#### Auto-redistributed -The absolute best way to get started with `Hyper` is to play with it. +`umoci` and the guest `vmlinux` kernels are downloaded, checksum-verified, and +managed by Hyper itself; you do not install them. -### Requirements - -Hyper requires the following software be installed on each node running it: - - - [`skopeo`](https://github.com/containers/skopeo) - - [`e2fsprogs`](https://github.com/tytso/e2fsprogs) - -Hyper has more runtime dependencies, but they are automatically redistributed -by Hyper. +`firecracker` and `jailer` are not auto-downloaded. Install them with +`mix firecracker.install [--prefix ]` (default prefix `/opt/firecracker`), +which downloads the pinned v1.16.0 release, places the binaries at +`/firecracker` and `/jailer`, and prints the `/etc/hyper/config.toml` +snippet to paste in. ### Installation @@ -33,31 +30,24 @@ by Hyper. ### Configuration -Running `Hyper` is involved and requires a large number of pre-requisites. The -configuration of `:hyper` can be done by creating a `config :hyper` entry in -your `config.exs`. Refer to the given snippet for details on each -configuration. +Almost all host configuration — `work_dir`, the `[tools]` binary paths +(`firecracker`, `jailer`, `dmsetup`, ...), and the `[jails]` table (`cgroup`, +`uid_gid_range`) — lives in `/etc/hyper/config.toml` (the single source of +truth shared with the setuid helper, shown above), and every node-local path +(`jails`, `socks`, `scratch`, `layers`) is derived from `work_dir`. None of it is +repeated in `config :hyper`. + +The node's own tool paths (`skopeo`, `mke2fs`, `umoci`, `suidhelper`) now live in +the `[tools]` table of `/etc/hyper/config.toml` alongside the privileged binaries, +so `config :hyper` holds only the per-architecture guest kernels (each with a +default, so the block may be omitted): ```elixir config :hyper, - # TODO(markovejnovic): Remove this after it gets auto-downloaded. - jailer_bin: "/opt/firecracker/jailer-v1.16.0-x86_64", - # TODO(markovejnovic): Remove this after it gets auto-downloaded. - firecracker_bin: "/opt/firecracker/firecracker-v1.16.0-x86_64", - # You must create a parent cgroup on your system. Continue reading for - # further details. - cgroup_parent: "hyper", - # TODO(markovejnovic): Merge these directories into one. - jailer_chroot_base: "/srv/hyper/jails", - socket_dir: "/srv/hyper/socks", - scratch_dir: "/srv/hyper/scratch", - # Hyper requires that each VM you pass - uid_gid_range: {900_000, 999_999}, - layer_dir: "/srv/hyper/layers" + # Per-architecture guest kernel images placed on the host. + vmlinux: %{x86_64: "/srv/hyper/redist/vmlinux/vmlinux-x86_64"} ``` - - ### Usage diff --git a/lib/hyper.ex b/lib/hyper.ex index fb0a626e..9b70f131 100644 --- a/lib/hyper.ex +++ b/lib/hyper.ex @@ -18,7 +18,7 @@ defmodule Hyper do @spec create_vm(Hyper.Vm.Spec.t()) :: {:ok, Hyper.Vm.t()} | {:error, term()} def create_vm(%Hyper.Vm.Spec{} = spec) do with {:ok, arch} <- resolve_arch(spec.arch) do - vm_id = gen_vm_id() + vm_id = Hyper.Vm.Id.generate() spec = %{spec | arch: arch} instance_spec = Hyper.Vm.Instance.spec(spec.type) @@ -34,17 +34,13 @@ defmodule Hyper do end end - @doc "Generate a fresh VM id (url-safe base64, dm-name compatible)." - @spec gen_vm_id() :: Hyper.Vm.id() - def gen_vm_id, do: Base.url_encode64(:crypto.strong_rand_bytes(9), padding: false) - @spec resolve_arch(Hyper.Vm.Instance.arch() | nil) :: {:ok, Hyper.Vm.Instance.arch()} | {:error, term()} defp resolve_arch(nil), do: Sys.Arch.current() defp resolve_arch(arch), do: {:ok, arch} @doc "Cluster-wide: which node currently runs `vm_id`? `nil` if unknown." - @spec whereis(Hyper.Vm.id()) :: node() | nil + @spec whereis(Hyper.Vm.Id.t()) :: node() | nil def whereis(vm_id), do: Hyper.Cluster.Routing.whereis(vm_id) @doc """ @@ -57,7 +53,7 @@ defmodule Hyper do died with its host, so "unknown" is the truthful answer. Only `:erpc`'s own transport failures are swallowed; a genuine fault in the lookup still raises. """ - @spec id(Hyper.Vm.t()) :: Hyper.Vm.id() | nil + @spec id(Hyper.Vm.t()) :: Hyper.Vm.Id.t() | nil def id(pid) when is_pid(pid) do :erpc.call(node(pid), Hyper.Cluster.Routing, :id_for, [pid]) catch diff --git a/lib/hyper/cluster/routing.ex b/lib/hyper/cluster/routing.ex index c7802595..9491a132 100644 --- a/lib/hyper/cluster/routing.ex +++ b/lib/hyper/cluster/routing.ex @@ -28,8 +28,28 @@ defmodule Hyper.Cluster.Routing do @spec via(term()) :: {:via, module(), {atom(), term()}} def via(key), do: {:via, Horde.Registry, {@name, key}} + @doc """ + Register the calling process under `key` from inside its own `init`. + + Prefer this over starting a process with a `{:via, Horde.Registry, _}` name. + OTP's post-start name check (`gen:get_proc_name`) calls `whereis_name` + immediately after the synchronous `register`, but Horde materialises the name + into its local ETS only asynchronously, via the DeltaCRDT diff loop. Under + registry churn that read loses the race and OTP aborts startup with + `{:process_not_registered_via, Horde.Registry}`. Registering from within + `init` carries no such self-check, while leaving the name cluster-resolvable + through `via/1` once the diff propagates (callers already tolerate that lag). + """ + @spec register_self(term()) :: :ok | {:error, {:already_registered, pid()}} + def register_self(key) do + case Horde.Registry.register(@name, key, nil) do + {:ok, _pid} -> :ok + {:error, {:already_registered, _pid}} = err -> err + end + end + @doc "Which node currently runs `vm_id`? `nil` if unknown." - @spec whereis(Hyper.Vm.id()) :: node() | nil + @spec whereis(Hyper.Vm.Id.t()) :: node() | nil @decorate with_span("Hyper.Cluster.Routing.whereis", include: [:vm_id]) def whereis(vm_id) do case Horde.Registry.lookup(@name, {vm_id, :supervisor}) do @@ -43,7 +63,7 @@ defmodule Hyper.Cluster.Routing do replica via a registry match spec; intended to be called on the node that owns `pid` (see `Hyper.id/1`). """ - @spec id_for(pid()) :: Hyper.Vm.id() | nil + @spec id_for(pid()) :: Hyper.Vm.Id.t() | nil @decorate with_span("Hyper.Cluster.Routing.id_for") def id_for(pid) when is_pid(pid) do case Horde.Registry.select(@name, [ @@ -55,7 +75,7 @@ defmodule Hyper.Cluster.Routing do end @doc "Every VM the cluster currently knows about, paired with the node it runs on." - @spec all() :: [{Hyper.Vm.id(), node()}] + @spec all() :: [{Hyper.Vm.Id.t(), node()}] @decorate with_span("Hyper.Cluster.Routing.all") def all do @name diff --git a/lib/hyper/cluster/scheduler.ex b/lib/hyper/cluster/scheduler.ex index 1412a0b2..4877658c 100644 --- a/lib/hyper/cluster/scheduler.ex +++ b/lib/hyper/cluster/scheduler.ex @@ -16,6 +16,8 @@ defmodule Hyper.Cluster.Scheduler do alias Hyper.Vm.Instance.Spec alias Unit.Information + require Logger + use OpenTelemetryDecorator @type layer_sizes :: [{Hyper.Layer.id(), Unit.Information.t()}] @@ -45,8 +47,15 @@ defmodule Hyper.Cluster.Scheduler do |> candidates(layers) |> Enum.reduce_while({:error, :no_capacity}, fn node, acc -> case attempt.(node) do - {:ok, result} -> {:halt, {:ok, {node, result}}} - {:error, _reason} -> {:cont, acc} + {:ok, result} -> + {:halt, {:ok, {node, result}}} + + {:error, reason} -> + # The candidate fit the snapshot but refused at confirmation time. + # Log the real reason: otherwise an actual boot failure on the only + # candidate is indistinguishable from genuine `:no_capacity`. + Logger.warning("scheduler: #{inspect(node)} refused placement: #{inspect(reason)}") + {:cont, acc} end end) end diff --git a/lib/hyper/grpc/codec.ex b/lib/hyper/grpc/codec.ex index 650040a1..9da622a0 100644 --- a/lib/hyper/grpc/codec.ex +++ b/lib/hyper/grpc/codec.ex @@ -85,15 +85,15 @@ defmodule Hyper.Grpc.Codec do end @doc "Convert a domain result to an outbound response message, or an error to `GRPC.RPCError`." - @spec to_grpc({:created, Hyper.Vm.id(), node()}) :: CreateVmResponse.t() + @spec to_grpc({:created, Hyper.Vm.Id.t(), node()}) :: CreateVmResponse.t() def to_grpc({:created, vm_id, node}) when is_binary(vm_id), do: %CreateVmResponse{vm_id: vm_id, node: to_string(node)} - @spec to_grpc({:located, Hyper.Vm.id(), node()}) :: GetVmResponse.t() + @spec to_grpc({:located, Hyper.Vm.Id.t(), node()}) :: GetVmResponse.t() def to_grpc({:located, vm_id, node}), do: %GetVmResponse{vm_id: vm_id, node: to_string(node)} - @spec to_grpc({:vms, [{Hyper.Vm.id(), node()}]}) :: ListVmsResponse.t() + @spec to_grpc({:vms, [{Hyper.Vm.Id.t(), node()}]}) :: ListVmsResponse.t() def to_grpc({:vms, vms}), do: %ListVmsResponse{vms: Enum.map(vms, &vm/1)} @@ -117,7 +117,7 @@ defmodule Hyper.Grpc.Codec do def to_grpc({:exit, {:nodedown, _}}), do: rpc_error(:machine_unreachable) def to_grpc({:exit, reason}), do: rpc_error({:stop_failed, reason}) - @spec vm({Hyper.Vm.id(), node()}) :: Vm.t() + @spec vm({Hyper.Vm.Id.t(), node()}) :: Vm.t() defp vm({vm_id, node}), do: %Vm{vm_id: vm_id, node: to_string(node)} @spec instance_type(instance_enum()) :: {:ok, Hyper.Vm.Instance.t()} diff --git a/lib/hyper/img/db/lease.ex b/lib/hyper/img/db/lease.ex index 7c737406..979b5f84 100644 --- a/lib/hyper/img/db/lease.ex +++ b/lib/hyper/img/db/lease.ex @@ -49,7 +49,7 @@ defmodule Hyper.Img.Db.Lease do Upserts on `(node_id, vm_id)` - the same call both takes a fresh lease and heartbeats a live one. """ - @spec bump(Hyper.Img.id(), Hyper.Vm.id(), Unit.Time.t()) :: + @spec bump(Hyper.Img.id(), Hyper.Vm.Id.t(), Unit.Time.t()) :: {:ok, %__MODULE__{}} | {:error, Ecto.Changeset.t()} @decorate with_span("Hyper.Img.Db.Lease.bump", include: [:image_id, :vm_id]) def bump(image_id, vm_id, ttl) do @@ -72,7 +72,7 @@ defmodule Hyper.Img.Db.Lease do Release the lease issued to the given node_id and the given vm_id. Since each VM only ever uses one image, it is not necessary to specify the image id. """ - @spec release(Hyper.Vm.id()) :: :ok + @spec release(Hyper.Vm.Id.t()) :: :ok @decorate with_span("Hyper.Img.Db.Lease.release", include: [:vm_id]) def release(vm_id) do query = from(l in __MODULE__, where: l.node_id == ^to_string(node()) and l.vm_id == ^vm_id) diff --git a/lib/hyper/node.ex b/lib/hyper/node.ex index a5e0f63b..4f172e88 100644 --- a/lib/hyper/node.ex +++ b/lib/hyper/node.ex @@ -26,6 +26,11 @@ defmodule Hyper.Node do real-time monitors backing the soft budget (`Hyper.Node.Budget.Soft`). Lives here, not at the application root, because both are per-node and only meaningful while this node hosts VMs. + + * `Hyper.Node.Reaper` - a periodic, liveness-aware GC for per-VM host + resources (orphaned firecracker cgroups and `hyper-rw-*` dm volumes) stranded + by an unclean death whose vm_id never reboots. Started last so the VM + supervisor it consults for liveness is already up. """ use Supervisor @@ -40,8 +45,14 @@ defmodule Hyper.Node do def start_link(opts \\ []) do case test_system() do - :ok -> Supervisor.start_link(__MODULE__, opts, name: __MODULE__) - {:error, reason} -> {:error, reason} + :ok -> + # Clear any dm/loop devices a previous unclean shutdown left behind, + # before the device-owning children start and collide with them. + :ok = Hyper.Node.Reclaim.run() + Supervisor.start_link(__MODULE__, opts, name: __MODULE__) + + {:error, reason} -> + {:error, reason} end end @@ -49,10 +60,16 @@ defmodule Hyper.Node do def init(_opts) do children = [ Hyper.Node.Users, + # Layer owns Hyper.Node.Layer.Registry, which Budget.Advertiser queries + # (via Hyper.Node.Layer.active/0) as it advertises on init - so Layer must + # be up first. + Hyper.Node.Layer, Hyper.Node.Budget.Supervisor, {DynamicSupervisor, name: @vm_sup, strategy: :one_for_one}, - Hyper.Node.Layer, - Hyper.Node.Img + Hyper.Node.Img, + # Last child: :one_for_one starts in order, so the VM supervisor and Img are + # up before the reaper's first tick can read their liveness. + Hyper.Node.Reaper ] Supervisor.init(children, strategy: :one_for_one) @@ -63,7 +80,7 @@ defmodule Hyper.Node do layer, resolve the kernel, and start the VM supervisor. The uid is freed and the mutable layer torn down automatically when the VM supervisor dies. """ - @spec start_image_vm(Hyper.Vm.id(), Hyper.Vm.Spec.t()) :: {:ok, pid()} | {:error, term()} + @spec start_image_vm(Hyper.Vm.Id.t(), Hyper.Vm.Spec.t()) :: {:ok, pid()} | {:error, term()} @decorate with_span("Hyper.Node.start_image_vm", include: [:vm_id, :spec]) def start_image_vm(vm_id, %Hyper.Vm.Spec{} = spec) do with {:ok, uid} <- Users.claim(), @@ -89,7 +106,7 @@ defmodule Hyper.Node do end @doc false - @spec build_opts(Hyper.Vm.id(), Hyper.Vm.Spec.t(), Users.id(), pid(), Path.t()) :: + @spec build_opts(Hyper.Vm.Id.t(), Hyper.Vm.Spec.t(), Users.id(), pid(), Path.t()) :: FireVMM.Opts.t() def build_opts(vm_id, %Hyper.Vm.Spec{} = spec, uid, mutable, kernel) do %FireVMM.Opts{ @@ -144,10 +161,11 @@ defmodule Hyper.Node do @spec test_system :: :ok | {:error, term()} def test_system do with {:ok, _} <- Hyper.Cfg.Budget.load(), - :ok <- Hyper.Node.FireVMM.Provider.ensure_installed(), + :ok <- check_firecracker_bins(), :ok <- Hyper.Node.FireVMM.VmLinux.Provider.ensure_installed(), :ok <- Hyper.Node.Vmlinux.test_system(), :ok <- Hyper.Img.OciLoader.Umoci.ensure_installed(), + :ok <- Hyper.Img.OciLoader.test_system(), :ok <- Hyper.Node.Users.test_system(), :ok <- Hyper.Node.Layer.Repo.test_system(), :ok <- Hyper.SuidHelper.test_system(), @@ -157,6 +175,24 @@ defmodule Hyper.Node do end end + @spec check_firecracker_bins :: + :ok + | {:error, {:firecracker_bin_missing | :jailer_bin_missing, Path.t()}} + | {:error, :firecracker_not_configured | :jailer_not_configured} + defp check_firecracker_bins do + with {:fc, {:ok, fc}} <- {:fc, Hyper.Cfg.Tools.firecracker_configured()}, + {:jail, {:ok, jail}} <- {:jail, Hyper.Cfg.Tools.jailer_configured()} do + cond do + not Sys.Posix.executable?(fc) -> {:error, {:firecracker_bin_missing, fc}} + not Sys.Posix.executable?(jail) -> {:error, {:jailer_bin_missing, jail}} + true -> :ok + end + else + {:fc, :error} -> {:error, :firecracker_not_configured} + {:jail, :error} -> {:error, :jailer_not_configured} + end + end + @spec check_helper_base(Path.t()) :: :ok | {:error, {:suid_helper_base_mismatch, Path.t(), Path.t()}} defp check_helper_base(base) do diff --git a/lib/hyper/node/fire_vmm.ex b/lib/hyper/node/fire_vmm.ex index 7e5603b2..22b069ac 100644 --- a/lib/hyper/node/fire_vmm.ex +++ b/lib/hyper/node/fire_vmm.ex @@ -34,7 +34,7 @@ defmodule Hyper.Node.FireVMM do defstruct [:vm_id, :uid, :gid, :type, :arch, :mutable, :kernel, :boot_args] @type t :: %__MODULE__{ - vm_id: Hyper.Vm.id(), + vm_id: Hyper.Vm.Id.t(), uid: Hyper.Node.Users.id(), gid: Hyper.Node.Users.id(), type: Hyper.Vm.Instance.t(), @@ -47,14 +47,15 @@ defmodule Hyper.Node.FireVMM do @spec start_link(Opts.t()) :: Supervisor.on_start() def start_link(opts) do - Supervisor.start_link(__MODULE__, opts, name: via(opts.vm_id)) + Supervisor.start_link(__MODULE__, opts) end + @spec child_spec(Opts.t()) :: Supervisor.child_spec() def child_spec(opts) do # Keyed by VM id and :transient so a cleanly-stopped VM is not rebooted by # the node-level DynamicSupervisor. %{ - vm_id: {__MODULE__, opts.vm_id}, + id: {__MODULE__, opts.vm_id}, start: {__MODULE__, :start_link, [opts]}, type: :supervisor, restart: :transient @@ -63,18 +64,26 @@ defmodule Hyper.Node.FireVMM do @impl true def init(opts) do - children = [ - # Client must be registered before Core: Core starts the State machine, - # which calls Client.run while waiting for the daemon's API. Client depends - # only on vm_id (an independent peer), so it has no reverse dependency. - {Client, %Client.Opts{vm_id: opts.vm_id}}, - {Core, opts} - ] + # Self-register the cluster routing entry here rather than via a start name; + # see `Hyper.Cluster.Routing.register_self/1`. A fresh random vm_id never + # collides, so `:already_registered` only happens against a stale dead + # incarnation - decline the start and let the supervisor retry clean. + case Hyper.Cluster.Routing.register_self({opts.vm_id, :supervisor}) do + :ok -> + children = [ + # Client must be registered before Core: Core starts the State machine, + # which calls Client.run while waiting for the daemon's API. Client + # depends only on vm_id (an independent peer), so no reverse dependency. + {Client, %Client.Opts{vm_id: opts.vm_id}}, + {Core, opts} + ] - Supervisor.init(children, strategy: :one_for_one) - end + Supervisor.init(children, strategy: :one_for_one) - defp via(vm_id), do: Hyper.Cluster.Routing.via({vm_id, :supervisor}) + {:error, _} -> + :ignore + end + end @doc "Test whether the system can run firecracker VMMs." @spec test_system() :: :ok | {:error, term()} diff --git a/lib/hyper/node/fire_vmm/chroot_jail.ex b/lib/hyper/node/fire_vmm/chroot_jail.ex index edaa4f1d..cab7364c 100644 --- a/lib/hyper/node/fire_vmm/chroot_jail.ex +++ b/lib/hyper/node/fire_vmm/chroot_jail.ex @@ -27,7 +27,7 @@ defmodule Hyper.Node.FireVMM.ChrootJail do `uid:gid`), and return `cold` with its kernel + rootfs paths rewritten to their in-jail equivalents. Fails the boot if either artifact cannot be staged. """ - @spec stage(Hyper.Vm.id(), non_neg_integer(), non_neg_integer(), Cold.t()) :: + @spec stage(Hyper.Vm.Id.t(), non_neg_integer(), non_neg_integer(), Cold.t()) :: {:ok, Cold.t()} | {:error, term()} @decorate with_span("Hyper.Node.FireVMM.ChrootJail.stage", include: [:vm_id]) def stage(vm_id, uid, gid, %Cold{} = cold) do diff --git a/lib/hyper/node/fire_vmm/client.ex b/lib/hyper/node/fire_vmm/client.ex index c6901ae2..c41b5f1d 100644 --- a/lib/hyper/node/fire_vmm/client.ex +++ b/lib/hyper/node/fire_vmm/client.ex @@ -55,18 +55,15 @@ defmodule Hyper.Node.FireVMM.Client do @type t :: %__MODULE__{socket_path: Path.t()} end + # Prod path (vm_id, no explicit name) starts unnamed and self-registers in + # `init` - see `Hyper.Cluster.Routing.register_self/1`. A `:name` override + # (test stand-ins) is honoured as a plain local name and skips registration. @spec start_link(Opts.t()) :: GenServer.on_start() def start_link(%Opts{} = opts) do - name = - case opts.name do - nil when not is_nil(opts.vm_id) -> via(opts.vm_id) - other -> other - end - - GenServer.start_link(__MODULE__, opts, gen_opts(name)) + GenServer.start_link(__MODULE__, opts, gen_opts(opts.name)) end - @spec via(Hyper.Vm.id()) :: GenServer.name() + @spec via(Hyper.Vm.Id.t()) :: GenServer.name() def via(vm_id), do: Hyper.Cluster.Routing.via({vm_id, :client}) # Cap on a single Firecracker API call. @@ -79,12 +76,26 @@ defmodule Hyper.Node.FireVMM.Client do end @impl true - @spec init(Opts.t()) :: {:ok, State.t()} + @spec init(Opts.t()) :: {:ok, State.t()} | {:stop, {:already_registered, pid()}} def init(%Opts{} = opts) do - socket_path = opts.socket_path || Jailer.host_socket(opts.vm_id) - {:ok, %State{socket_path: socket_path}} + with :ok <- register(opts) do + socket_path = opts.socket_path || Jailer.host_socket(opts.vm_id) + {:ok, %State{socket_path: socket_path}} + end + end + + # Register cluster-wide under {vm_id, :client} on the prod path. With an + # explicit name (test stand-in), the name is the local registration, so skip. + @spec register(Opts.t()) :: :ok | {:stop, {:already_registered, pid()}} + defp register(%Opts{name: nil, vm_id: vm_id}) when not is_nil(vm_id) do + case Hyper.Cluster.Routing.register_self({vm_id, :client}) do + :ok -> :ok + {:error, reason} -> {:stop, reason} + end end + defp register(%Opts{}), do: :ok + @impl true def handle_call({:run, op_fun}, _from, %State{socket_path: socket_path} = state) do {:reply, op_fun.(socket_path: socket_path), state} diff --git a/lib/hyper/node/fire_vmm/core.ex b/lib/hyper/node/fire_vmm/core.ex index 8ccf6180..34830b54 100644 --- a/lib/hyper/node/fire_vmm/core.ex +++ b/lib/hyper/node/fire_vmm/core.ex @@ -18,8 +18,9 @@ defmodule Hyper.Node.FireVMM.Core do * firecracker crash -> the `Daemon` child exits; both restart; `Daemon` resets the stale jail and relaunches, and the fresh controller cold-boots. - `MuonTrap` kills the OS process when its port closes (teardown or BEAM death), - so no firecracker process outlives the supervisor. + `Daemon` guarantees firecracker is dead on teardown via the helper's + `cgroup.kill` (MuonTrap's port-close kill misses the setsid'd firecracker), so + no firecracker process outlives a graceful supervisor shutdown. """ use Supervisor @@ -28,9 +29,12 @@ defmodule Hyper.Node.FireVMM.Core do alias Hyper.Node.FireVMM.Daemon alias Hyper.Node.FireVMM.State + # Started unnamed: nothing resolves the core by name (it is addressed as a + # child of `Hyper.Node.FireVMM`), so it needs no registry entry - and avoids a + # needless racy Horde registration at startup. @spec start_link(FireVMM.Opts.t()) :: Supervisor.on_start() def start_link(opts) do - Supervisor.start_link(__MODULE__, opts, name: Hyper.Cluster.Routing.via({opts.vm_id, :core})) + Supervisor.start_link(__MODULE__, opts) end @impl true diff --git a/lib/hyper/node/fire_vmm/daemon.ex b/lib/hyper/node/fire_vmm/daemon.ex index 7cbc159c..882a6cca 100644 --- a/lib/hyper/node/fire_vmm/daemon.ex +++ b/lib/hyper/node/fire_vmm/daemon.ex @@ -3,26 +3,37 @@ defmodule Hyper.Node.FireVMM.Daemon do The jailed firecracker OS process for one microVM, as a static child of `Hyper.Node.FireVMM.Core`. - Lifecycle is supervisor-owned. On every (re)start it first resets any stale - jail left by a prior incarnation - the firecracker jailer refuses to reuse an - existing chroot - then builds the jailer command and runs it under - `MuonTrap.Daemon`, which kills the OS process when its port closes (controller - crash, container teardown, or BEAM death). So no firecracker process outlives - the supervisor, and `Core`'s `:one_for_all` restarting this child (e.g. after a - firecracker crash) cleanly cold-boots against a fresh jail. - - The supervised process *is* the `MuonTrap.Daemon` - `start_link/1` does the - reset, then delegates and returns that pid. + A `trap_exit` GenServer that owns firecracker's lifetime end to end: + + * on every (re)start it resets any stale jail left by a prior incarnation — + the firecracker jailer refuses to reuse an existing chroot — then launches + the jailer under a linked `MuonTrap.Daemon`. The supervised process is + `hyper-suidhelper jailer ...`, which `execve`s into the jailer (same pid). + * if firecracker exits, the linked `MuonTrap.Daemon` exits and this server + stops with that reason, so `Core`'s `:one_for_all` cold-boots the pair. + * on teardown it **guarantees firecracker is dead**: `MuonTrap`'s port-close + kills by process group, but the jailer `setsid`s firecracker into its own + session, so it escapes that kill and would leak (holding the cgroup, the + rootfs dm device, loop devices). `terminate/2` therefore runs the helper's + `cgroup.kill` teardown (`ChrootJail.remove`), which SIGKILLs the whole leaf + cgroup regardless of session. The same call on (re)start cleans up after a + prior incarnation the BEAM could not (a SIGKILL'd node leaves no + `terminate/2`); the periodic `Hyper.Node.Reaper` is the final backstop. """ + use GenServer + use OpenTelemetryDecorator + alias Hyper.Node.FireVMM.{Jailer, Opts} alias Hyper.SuidHelper alias Unit.Time - use OpenTelemetryDecorator + require Logger @shutdown_timeout Time.s(5) + defstruct [:opts, :muontrap] + @spec child_spec(Opts.t()) :: Supervisor.child_spec() def child_spec(%Opts{} = opts) do %{ @@ -34,20 +45,82 @@ defmodule Hyper.Node.FireVMM.Daemon do } end - @doc """ - Reset the VM's stale jail, then launch the jailer under `MuonTrap.Daemon` and - return its pid. Fails (so the supervisor retries) if the reset cannot run. - """ - @spec start_link(Opts.t()) :: {:ok, pid()} | {:error, term()} - @decorate with_span("Hyper.Node.FireVMM.Daemon.start_link", include: [:id]) - def start_link(%Opts{vm_id: id} = opts) do - with :ok <- SuidHelper.ChrootJail.remove(Jailer.chroot_dir(id), Jailer.cgroup_dir(id)) do - cmd = Jailer.command(opts) - - case MuonTrap.Daemon.start_link(cmd.binary, cmd.args, []) do - {:ok, pid} -> {:ok, pid} - {:error, _} = err -> err - end + @spec start_link(Opts.t()) :: GenServer.on_start() + def start_link(%Opts{} = opts) do + GenServer.start_link(__MODULE__, opts) + end + + @impl true + @decorate with_span("Hyper.Node.FireVMM.Daemon.init", include: [:id]) + def init(%Opts{vm_id: id} = opts) do + # Trap exits so the linked MuonTrap's exit reaches `handle_info` (not a silent + # link kill) and so `terminate/2` runs on supervisor shutdown. + Process.flag(:trap_exit, true) + + with :ok <- reset_stale_jail(id), + {:ok, muontrap} <- launch(opts) do + {:ok, %__MODULE__{opts: opts, muontrap: muontrap}} + else + {:error, reason} -> {:stop, reason} + end + end + + # firecracker (the linked MuonTrap.Daemon) exited: stop with its reason so + # `Core`'s `:one_for_all` discards the controller too and cold-boots the pair. + @impl true + def handle_info({:EXIT, muontrap, reason}, %__MODULE__{muontrap: muontrap} = state) do + {:stop, reason, state} + end + + def handle_info(_msg, state), do: {:noreply, state} + + # Guarantee firecracker is dead and its jail cleared. MuonTrap cannot kill the + # setsid'd firecracker; the helper's `cgroup.kill` can. Best-effort: a failure + # here is logged, and the `Reaper` will retry, but it must not crash teardown. + @impl true + @decorate with_span("Hyper.Node.FireVMM.Daemon.terminate", include: [:id]) + def terminate(_reason, %__MODULE__{opts: %Opts{vm_id: id}}) do + case clear_jail(id) do + :ok -> + :ok + + {:error, reason} -> + Logger.error("vm #{id}: teardown failed to clear jail: #{inspect(reason)}") + end + end + + @spec reset_stale_jail(Hyper.Vm.Id.t()) :: :ok | {:error, term()} + defp reset_stale_jail(id), do: clear_jail(id) + + @spec clear_jail(Hyper.Vm.Id.t()) :: :ok | {:error, term()} + defp clear_jail(id) do + SuidHelper.ChrootJail.remove(Jailer.chroot_dir(id), Jailer.cgroup_dir(id)) + end + + @spec launch(Opts.t()) :: {:ok, pid()} | {:error, term()} + defp launch(%Opts{vm_id: id} = opts) do + cmd = Jailer.command(opts) + + # Surface what the jailed process actually does: `log_output` routes the + # helper/jailer/firecracker stdout+stderr (guest serial console included) + # to the Logger, and `exit_status_to_reason` turns MuonTrap's opaque + # `:error_exit_status` into `{:firecracker_exited, status}` so a crash + # report names the real exit code instead of hiding it. + daemon_opts = [ + log_output: :info, + log_prefix: "vm #{id} firecracker: ", + stderr_to_stdout: true, + exit_status_to_reason: &{:firecracker_exited, &1} + ] + + case MuonTrap.Daemon.start_link(cmd.binary, cmd.args, daemon_opts) do + {:ok, pid} -> + Logger.info("vm #{id}: jailer launched under MuonTrap (#{inspect(pid)})") + {:ok, pid} + + {:error, reason} = err -> + Logger.error("vm #{id}: jailer failed to launch: #{inspect(reason)}") + err end end end diff --git a/lib/hyper/node/fire_vmm/jailer.ex b/lib/hyper/node/fire_vmm/jailer.ex index 381f9d23..3b6deb23 100644 --- a/lib/hyper/node/fire_vmm/jailer.ex +++ b/lib/hyper/node/fire_vmm/jailer.ex @@ -1,27 +1,26 @@ defmodule Hyper.Node.FireVMM.Jailer do @moduledoc """ - Builds the firecracker - [jailer](https://github.com/firecracker-microvm/firecracker/blob/main/docs/jailer.md) - command for one VM. + Builds the `hyper-suidhelper jailer` command for one VM. - The jailer sets up the chroot, namespaces, cgroup (via `Hyper.Vm.Instance` - flags) and drops privileges, then exec's firecracker. We run the jailer (not - firecracker directly) under `MuonTrap.Daemon`; MuonTrap only supervises the OS - process, the jailer owns isolation. + The BEAM does not invoke the jailer directly. Instead it calls the setuid helper + with the `jailer` subcommand; the helper reads the firecracker binary path, chroot + base, parent cgroup, and cgroup version from its trusted `/etc/hyper/config.toml`, + re-acquires root, and `execve`s the jailer (same pid, so `MuonTrap.Daemon` keeps + supervising it). + + This means the BEAM passes only untrusted-origin values: `--id`, `--uid`, `--gid`, + repeated `--cgroup KEY=VALUE`, and `--api-sock`. The helper derives and validates + everything else; it also inserts the `--` separator between its own flags and + firecracker's flags. Because firecracker is chrooted to `///root`, the API - socket it opens at `/api.socket` lives at `host_socket` on the host - that's the + socket it opens at `/api.socket` lives at `host_socket` on the host — that's the path the controller connects to. - - Host config: paths are derived from `config :hyper, work_dir: ...`. The - firecracker + jailer binaries are installed under `/redist/firecracker` - by `Hyper.Node.FireVMM.Provider`; the chroot base is `/jails`. """ use OpenTelemetryDecorator alias Hyper.Node.FireVMM - alias Hyper.Node.FireVMM.Provider alias Hyper.Vm.Instance # firecracker's API socket path *inside* the chroot. @@ -36,6 +35,8 @@ defmodule Hyper.Node.FireVMM.Jailer do first failure. """ + alias Hyper.Cfg.{Dirs, Jails} + @doc "Run every pre-requisite check, halting at the first failure." @spec run() :: :ok | {:error, term()} def run do @@ -61,7 +62,7 @@ defmodule Hyper.Node.FireVMM.Jailer do end defp parent_cgroup_present do - if Sys.Linux.Cgroup.V2.named_exists?(Hyper.Cfg.Jails.cgroup()), + if Sys.Linux.Cgroup.V2.named_exists?(Jails.cgroup()), do: :ok, else: {:error, :missing_parent_cgroup} end @@ -77,7 +78,7 @@ defmodule Hyper.Node.FireVMM.Jailer do end defp chroot_writable do - case Sys.Posix.ensure_writable_dir(Hyper.Cfg.Dirs.chroot_base()) do + case Sys.Posix.ensure_writable_dir(Dirs.chroot_base()) do {:ok} -> :ok {:error, reason} -> {:error, {:chroot_base_unavailable, reason}} end @@ -92,26 +93,11 @@ defmodule Hyper.Node.FireVMM.Jailer do @spec command(FireVMM.Opts.t()) :: t() def command(opts) do args = - [ - "--id", - opts.vm_id, - "--exec-file", - Provider.firecracker_bin(), - "--uid", - to_string(opts.uid), - "--gid", - to_string(opts.gid), - "--chroot-base-dir", - Hyper.Cfg.Dirs.chroot_base(), - "--cgroup-version", - "2", - "--parent-cgroup", - Hyper.Cfg.Jails.cgroup() - ] ++ + ["jailer", "--id", opts.vm_id, "--uid", to_string(opts.uid), "--gid", to_string(opts.gid)] ++ cgroup_flags(opts.type) ++ - ["--", "--api-sock", "/" <> @jail_socket] + ["--api-sock", "/" <> @jail_socket] - %{binary: Provider.jailer_bin(), args: args, host_socket: host_socket(opts.vm_id)} + %{binary: Hyper.Cfg.Tools.suidhelper(), args: args, host_socket: host_socket(opts.vm_id)} end # Find the appropriate jailer cgroup flags for the given instance type. @@ -125,25 +111,41 @@ defmodule Hyper.Node.FireVMM.Jailer do end @doc "Host path of the VM's per-VM jail dir (`//`)." - @spec chroot_dir(Hyper.Vm.id()) :: Path.t() + @spec chroot_dir(Hyper.Vm.Id.t()) :: Path.t() def chroot_dir(id) do Path.join([Hyper.Cfg.Dirs.chroot_base(), exec_name(), id]) end @doc "Host path of the VM's chroot root (`///root`)." - @spec chroot_root(Hyper.Vm.id()) :: Path.t() + @spec chroot_root(Hyper.Vm.Id.t()) :: Path.t() def chroot_root(id) do Path.join(chroot_dir(id), "root") end @doc """ - Host path of the VM's cgroup leaf (`/sys/fs/cgroup///`), the - cgroup the jailer creates for firecracker. Reconstructed (the jailer owns its + Host path of the cgroup dir holding every VM's leaf (`/sys/fs/cgroup/`). + Its immediate subdir names are vm_ids - the leaves the jailer creates for + firecracker - so listing it (directories only; the dir also holds cgroup control + files) enumerates the cgroup leaves on this host. + + Note the cgroup hierarchy has NO `` level: the jailer (cgroup v2, + `--parent-cgroup `) places firecracker directly at `/`, + unlike the chroot (`//`). Confirmed via + `/proc//cgroup` = `0:://`. + """ + @spec cgroup_parent_dir() :: Path.t() + def cgroup_parent_dir do + Path.join("/sys/fs/cgroup", Hyper.Cfg.Jails.cgroup()) + end + + @doc """ + Host path of the VM's cgroup leaf (`/sys/fs/cgroup//`), the cgroup + the jailer creates for firecracker. Reconstructed (the jailer owns its placement) so a relaunch can clear the stale leaf left by a prior incarnation. """ - @spec cgroup_dir(Hyper.Vm.id()) :: Path.t() + @spec cgroup_dir(Hyper.Vm.Id.t()) :: Path.t() def cgroup_dir(id) do - Path.join(["/sys/fs/cgroup", Hyper.Cfg.Jails.cgroup(), exec_name(), id]) + Path.join(cgroup_parent_dir(), id) end @doc """ @@ -153,7 +155,7 @@ defmodule Hyper.Node.FireVMM.Jailer do derive it independently and are guaranteed to agree. We do not control where the jailer places the socket, so the path is reconstructed here. """ - @spec host_socket(Hyper.Vm.id()) :: Path.t() + @spec host_socket(Hyper.Vm.Id.t()) :: Path.t() def host_socket(id) do Path.join([ Hyper.Cfg.Dirs.chroot_base(), @@ -164,5 +166,5 @@ defmodule Hyper.Node.FireVMM.Jailer do ]) end - defp exec_name, do: Path.basename(Provider.firecracker_bin()) + defp exec_name, do: Path.basename(Hyper.Cfg.Tools.firecracker()) end diff --git a/lib/hyper/node/fire_vmm/provider.ex b/lib/hyper/node/fire_vmm/provider.ex deleted file mode 100644 index 0730c588..00000000 --- a/lib/hyper/node/fire_vmm/provider.ex +++ /dev/null @@ -1,92 +0,0 @@ -defmodule Hyper.Node.FireVMM.Provider do - @moduledoc """ - Installs the firecracker release for the current architecture into - `Hyper.Cfg.Dirs.firecracker_install_dir/0` (`/redist/firecracker`). - - `ensure_installed/0` is idempotent: if the binaries are already present and - executable it returns `:ok` without touching the network. Otherwise it fetches - the official firecracker release tarball for the detected architecture via - `Redist.Targz` (download, SHA-256 verify, extract). The archive is - extracted as-is - the binaries live under `release-v-/` exactly as - firecracker ships them, and `firecracker_bin/0` / `jailer_bin/0` resolve those - paths. - """ - - alias Redist.Targz - - @downloads %{ - x86_64: %{ - url: - "https://github.com/firecracker-microvm/firecracker/releases/download/v1.16.0/firecracker-v1.16.0-x86_64.tgz", - sha256: "bd04e26952d4e158085778c6230a0b383d2619c319182e27eaa9d61a212e92d6", - firecracker: "release-v1.16.0-x86_64/firecracker-v1.16.0-x86_64", - jailer: "release-v1.16.0-x86_64/jailer-v1.16.0-x86_64" - }, - aarch64: %{ - url: - "https://github.com/firecracker-microvm/firecracker/releases/download/v1.16.0/firecracker-v1.16.0-aarch64.tgz", - sha256: "531c713cdbc37d4b8bc2533d851aabc0267096afa1768086a37672abb668efd7", - firecracker: "release-v1.16.0-aarch64/firecracker-v1.16.0-aarch64", - jailer: "release-v1.16.0-aarch64/jailer-v1.16.0-aarch64" - } - } - - @doc "Absolute path to the installed firecracker binary." - @spec firecracker_bin() :: Path.t() - def firecracker_bin, do: bin_path(:firecracker) - - @doc "Absolute path to the installed jailer binary." - @spec jailer_bin() :: Path.t() - def jailer_bin, do: bin_path(:jailer) - - @doc "Ensure the firecracker release is installed for this node." - @spec ensure_installed() :: :ok | {:error, term()} - def ensure_installed do - with {:ok, arch} <- Sys.Arch.current() do - dl = Map.fetch!(@downloads, arch) - - case check_install(dl) do - :ok -> :ok - {:error, :not_installed} -> install(dl) - {:error, :bad_install} -> reinstall(dl) - end - end - end - - # `:ok` if `dl`'s version-specific binaries are present and executable; - # `{:error, :not_installed}` if the install dir is empty/absent; otherwise - # `{:error, :bad_install}` - something is there but it's the wrong version, - # partial, or corrupt, which we cannot fix in place because `Targz` keeps - # existing files. The remedy is to wipe and reinstall. - @spec check_install(map()) :: :ok | {:error, :not_installed | :bad_install} - defp check_install(dl) do - fc = Path.join(install_dir(), dl.firecracker) - jail = Path.join(install_dir(), dl.jailer) - - cond do - Sys.Posix.executable?(fc) and Sys.Posix.executable?(jail) -> - :ok - - File.dir?(install_dir()) and File.ls!(install_dir()) != [] -> - {:error, :bad_install} - - true -> - {:error, :not_installed} - end - end - - defp install(dl), do: Targz.install(dl.url, dl.sha256, install_dir()) - - defp reinstall(dl) do - _ = File.rm_rf!(install_dir()) - install(dl) - end - - defp bin_path(key) do - {:ok, arch} = Sys.Arch.current() - dl = Map.fetch!(@downloads, arch) - Path.join(install_dir(), Map.fetch!(dl, key)) - end - - defp install_dir, do: Hyper.Cfg.Dirs.firecracker_install_dir() -end diff --git a/lib/hyper/node/fire_vmm/state.ex b/lib/hyper/node/fire_vmm/state.ex index b390965c..eab53db5 100644 --- a/lib/hyper/node/fire_vmm/state.ex +++ b/lib/hyper/node/fire_vmm/state.ex @@ -39,12 +39,13 @@ defmodule Hyper.Node.FireVMM.State do } @enforce_keys [:opts] - defstruct [:opts, :spec, :boot_deadline] + defstruct [:opts, :spec, :boot_deadline, api_granted: false] @type t :: %State{ opts: Opts.t(), spec: BootSpec.Cold.t() | nil, - boot_deadline: integer() | nil + boot_deadline: integer() | nil, + api_granted: boolean() } # How long to wait for the daemon's API to come up before failing the boot. @@ -54,11 +55,14 @@ defmodule Hyper.Node.FireVMM.State do %{id: __MODULE__, start: {__MODULE__, :start_link, [opts]}} end - def start_link(%Opts{vm_id: id} = opts) do - :gen_statem.start_link(via(id), __MODULE__, opts, []) + # Started unnamed; the controller self-registers under `{id, :state}` from + # `init` (see `Hyper.Cluster.Routing.register_self/1`). `stop/1` still resolves + # it cluster-wide through `via/1`. + def start_link(%Opts{} = opts) do + :gen_statem.start_link(__MODULE__, opts, []) end - @spec stop(Hyper.Vm.id()) :: :ok + @spec stop(Hyper.Vm.Id.t()) :: :ok def stop(id) do :gen_statem.call(via(id), :stop) end @@ -72,12 +76,21 @@ defmodule Hyper.Node.FireVMM.State do # The daemon is already (being) started by `Core` as our sibling. Read the root # device off the per-VM mutable layer, resolve the boot spec, set the readiness # deadline, and start probing the API. - def init(%Opts{mutable: mutable, kernel: kernel, boot_args: boot_args, type: type} = opts) do - spec = BootSpec.resolve(boot_source(kernel, Mutable.blk_path(mutable), boot_args), type) - deadline = System.monotonic_time(:millisecond) + Time.as_ms(@ready_timeout) - data = %State{opts: opts, spec: spec, boot_deadline: deadline} - - {:ok, :awaiting_api, data, [{:state_timeout, 0, :probe}]} + def init( + %Opts{vm_id: id, mutable: mutable, kernel: kernel, boot_args: boot_args, type: type} = + opts + ) do + case Hyper.Cluster.Routing.register_self({id, :state}) do + :ok -> + spec = BootSpec.resolve(boot_source(kernel, Mutable.blk_path(mutable), boot_args), type) + deadline = System.monotonic_time(:millisecond) + Time.as_ms(@ready_timeout) + data = %State{opts: opts, spec: spec, boot_deadline: deadline} + + {:ok, :awaiting_api, data, [{:state_timeout, 0, :probe}]} + + {:error, reason} -> + {:stop, reason} + end end # Assemble the `Hyper.Vm.source()` BootSpec expects from the resolved kernel + @@ -110,31 +123,81 @@ defmodule Hyper.Node.FireVMM.State do @moduledoc "Poll the (already-launched) daemon's API socket, then advance to `:configuring`." alias Hyper.Firecracker.Api.{InstanceInfo, Operations} - alias Hyper.Node.FireVMM.{Client, Opts} + alias Hyper.Node.FireVMM.{Client, Jailer, Opts} + alias Hyper.SuidHelper.ChrootJail alias Unit.Time + require Logger + # How often to probe the daemon's API while waiting for it. @probe_interval Time.ms(50) - # Poll the daemon's API until it answers, then configure. Give up if the - # readiness deadline passes first. + # Hand the jailed API socket to the node user, then poll the daemon's API + # until it answers and advance to `:configuring`. Give up if the readiness + # deadline passes first. The grant must happen before the probe: firecracker + # creates the socket owned by the per-VM uid, so the unprivileged controller + # gets EACCES on connect until the helper chowns it to us. def handle(:state_timeout, :probe, %{opts: %Opts{vm_id: id}} = data) do - case Client.run(Client.via(id), &Operations.describe_instance/1) do - {:ok, %InstanceInfo{}} -> - {:next_state, :configuring, data, [{:state_timeout, 0, :configure}]} - - {:error, _reason} -> - if System.monotonic_time(:millisecond) >= data.boot_deadline do - {:stop, {:shutdown, {:boot_failed, :daemon_unready}}, data} - else - {:keep_state_and_data, [{:state_timeout, Time.as_ms(@probe_interval), :probe}]} + case ensure_api_granted(id, data) do + {:cont, data} -> + case Client.run(Client.via(id), &Operations.describe_instance/1) do + {:ok, %InstanceInfo{}} -> + {:next_state, :configuring, data, [{:state_timeout, 0, :configure}]} + + {:error, reason} -> + keep_probing(id, data, reason) end + + {:wait, data, reason} -> + keep_probing(id, data, reason) end end def handle({:call, from}, :stop, data) do {:next_state, :stopping, data, [{:reply, from, :ok}]} end + + # Ensure the jailed API socket has been handed to the node user. Idempotent + # once granted (we record it in `data` so we ask the helper only once). + # `:socket_pending` means firecracker has not created the socket yet, so we + # keep waiting; a hard error is logged but also tolerated until the deadline + # (the probe that follows would fail with EACCES anyway and drive the stop). + @spec ensure_api_granted(Hyper.Vm.Id.t(), State.t()) :: + {:cont, State.t()} | {:wait, State.t(), term()} + defp ensure_api_granted(_id, %{api_granted: true} = data), do: {:cont, data} + + defp ensure_api_granted(id, data) do + case ChrootJail.grant_api(Jailer.host_socket(id)) do + :ok -> + {:cont, %{data | api_granted: true}} + + {:error, :socket_pending} -> + {:wait, data, :socket_pending} + + {:error, reason} -> + Logger.warning("vm #{id}: grant-api failed: #{inspect(reason)}") + {:wait, data, {:grant_api, reason}} + end + end + + # Keep waiting for readiness, re-arming the probe timer, unless the deadline + # has lapsed - then fail the boot, surfacing `reason` rather than swallowing + # it into a bare `:daemon_unready`. A persistent failure here points at the + # host->jail socket (path or, more often, the grant/permission step above). + @spec keep_probing(Hyper.Vm.Id.t(), State.t(), term()) :: + {:keep_state, State.t(), list()} | {:stop, term(), State.t()} + defp keep_probing(id, data, reason) do + if System.monotonic_time(:millisecond) >= data.boot_deadline do + Logger.warning( + "vm #{id}: firecracker API not reachable before deadline; " <> + "last probe error: #{inspect(reason)}" + ) + + {:stop, {:shutdown, {:boot_failed, {:daemon_unready, reason}}}, data} + else + {:keep_state, data, [{:state_timeout, Time.as_ms(@probe_interval), :probe}]} + end + end end defmodule Configuring do @@ -145,8 +208,12 @@ defmodule Hyper.Node.FireVMM.State do alias Hyper.Firecracker.Api.{InstanceActionInfo, Operations} alias Hyper.Node.FireVMM.{BootSpec, ChrootJail, Client, Opts} + require Logger + # Stage boot artifacts into the chroot, then issue the pre-boot config and - # start the guest. + # start the guest. Both failure paths end the boot via a supervisor restart, + # so log the reason here - otherwise it vanishes into the `:one_for_all` + # cycle and the VM just appears to relaunch for no visible cause. def handle( :state_timeout, :configure, @@ -155,11 +222,17 @@ defmodule Hyper.Node.FireVMM.State do case ChrootJail.stage(id, uid, gid, spec) do {:ok, jailed_spec} -> case apply_spec(id, jailed_spec) do - :ok -> {:next_state, :running, data} - {:error, reason} -> {:stop, {:shutdown, {:boot_failed, reason}}, data} + :ok -> + Logger.info("vm #{id}: configured, guest starting") + {:next_state, :running, data} + + {:error, reason} -> + Logger.error("vm #{id}: boot failed applying config: #{inspect(reason)}") + {:stop, {:shutdown, {:boot_failed, reason}}, data} end {:error, reason} -> + Logger.error("vm #{id}: boot failed staging jail: #{inspect(reason)}") {:stop, {:shutdown, {:boot_failed, {:staging, reason}}}, data} end end @@ -174,7 +247,7 @@ defmodule Hyper.Node.FireVMM.State do # Cold boot, issued through the Client and aborting at the first error: # machine-config -> boot-source -> each drive -> each NIC -> InstanceStart. - @spec apply_spec(Hyper.Vm.id(), BootSpec.Cold.t()) :: :ok | {:error, term()} + @spec apply_spec(Hyper.Vm.Id.t(), BootSpec.Cold.t()) :: :ok | {:error, term()} @decorate with_span("Hyper.Node.FireVMM.State.Configuring.apply_spec", include: [:id]) defp apply_spec(id, %BootSpec.Cold{} = cold) do via = Client.via(id) diff --git a/lib/hyper/node/img.ex b/lib/hyper/node/img.ex index 320d4a7f..53bf2ae6 100644 --- a/lib/hyper/node/img.ex +++ b/lib/hyper/node/img.ex @@ -53,7 +53,7 @@ defmodule Hyper.Node.Img do end @doc "Create a per-VM mutable layer for `vm_id` over `img_id`." - @spec create_mutable(Hyper.Img.id(), Hyper.Vm.id()) :: {:ok, pid()} | {:error, term()} + @spec create_mutable(Hyper.Img.id(), Hyper.Vm.Id.t()) :: {:ok, pid()} | {:error, term()} def create_mutable(img_id, vm_id) do case DynamicSupervisor.start_child( @mutable_sup, @@ -74,7 +74,7 @@ defmodule Hyper.Node.Img do Serve `img` to `vm_id` for the duration of `callable`, holding a DB lease on the image (and transitively its whole blob chain) the whole time. """ - @spec with_image(Hyper.Img.id(), Hyper.Vm.id(), (-> result)) :: result | {:error, term()} + @spec with_image(Hyper.Img.id(), Hyper.Vm.Id.t(), (-> result)) :: result | {:error, term()} when result: var def with_image(img, vm_id, callable) do with_image_lease(img, vm_id, callable) @@ -84,7 +84,8 @@ defmodule Hyper.Node.Img do # even if `callable` raises. A background task re-bumps the lease for the whole # run, so a long-lived VM never lets its claim lapse. If the lease cannot be # taken, returns the error and never runs `callable`. - @spec with_image_lease(Hyper.Img.id(), Hyper.Vm.id(), (-> result)) :: result | {:error, term()} + @spec with_image_lease(Hyper.Img.id(), Hyper.Vm.Id.t(), (-> result)) :: + result | {:error, term()} when result: var defp with_image_lease(img, vm_id, callable) do ttl = Db.Lease.default_ttl() @@ -104,7 +105,7 @@ defmodule Hyper.Node.Img do # Re-bump the lease forever at 1/3 of the TTL, until killed. Runs in a task for the # lifetime of `callable`; transient bump failures are swallowed so a DB hiccup # can't tear down the VM - the next tick retries. - @spec heartbeat(Hyper.Img.id(), Hyper.Vm.id(), Unit.Time.t()) :: no_return() + @spec heartbeat(Hyper.Img.id(), Hyper.Vm.Id.t(), Unit.Time.t()) :: no_return() defp heartbeat(img, vm_id, ttl) do Process.sleep(div(Unit.Time.as_ms(ttl), 3)) diff --git a/lib/hyper/node/img/mutable.ex b/lib/hyper/node/img/mutable.ex index 57de460b..07f06269 100644 --- a/lib/hyper/node/img/mutable.ex +++ b/lib/hyper/node/img/mutable.ex @@ -29,7 +29,7 @@ defmodule Hyper.Node.Img.Mutable do @enforce_keys [:img_id, :vm_id] defstruct [:img_id, :vm_id] - @type t :: %__MODULE__{img_id: Hyper.Img.id(), vm_id: Hyper.Vm.id()} + @type t :: %__MODULE__{img_id: Hyper.Img.id(), vm_id: Hyper.Vm.Id.t()} end defmodule State do @@ -119,6 +119,12 @@ defmodule Hyper.Node.Img.Mutable do @impl true def handle_info(:idle_timeout, state), do: {:noreply, state} + @impl true + # Each privileged command runs through `System.cmd`, which links a transient + # port to this process; because we trap exits (for `terminate/2` teardown), + # the port's normal close is delivered here. Ignore it. + def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state} + @impl true def terminate(_reason, state) do # Destroy the thin volume, then release the image (its monitor on us also @@ -129,7 +135,7 @@ defmodule Hyper.Node.Img.Mutable do end @doc false - @spec dm_name(Hyper.Vm.id()) :: String.t() + @spec dm_name(Hyper.Vm.Id.t()) :: String.t() def dm_name(vm_id), do: "hyper-rw-#{sanitize(vm_id)}" defp sanitize(id), do: String.replace(id, ~r/[^A-Za-z0-9._-]/, "_") diff --git a/lib/hyper/node/img/server.ex b/lib/hyper/node/img/server.ex index 9587049c..24760465 100644 --- a/lib/hyper/node/img/server.ex +++ b/lib/hyper/node/img/server.ex @@ -128,6 +128,12 @@ defmodule Hyper.Node.Img.Server do {:noreply, state} end + @impl true + # Each privileged command runs through `System.cmd`, which links a transient + # port to this process; because we trap exits (for `terminate/2` teardown), + # the port's normal close is delivered here. Ignore it. + def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state} + @impl true def terminate(_reason, %State{dm_names: dm_names}) do # Remove top-down (a snapshot's origin is the device below it). Layers are diff --git a/lib/hyper/node/img/thin_pool.ex b/lib/hyper/node/img/thin_pool.ex index 7a50f7ec..dbab44c7 100644 --- a/lib/hyper/node/img/thin_pool.ex +++ b/lib/hyper/node/img/thin_pool.ex @@ -94,6 +94,13 @@ defmodule Hyper.Node.Img.ThinPool do {:reply, :ok, id_free(state, id)} end + @impl true + # Each privileged command runs through `System.cmd`, which links a transient + # port to this process; because we trap exits (for `terminate/2` teardown), + # the port's normal close is delivered here. Ignore it. An abnormal exit + # carries a non-`:normal` reason and falls through to the default handler. + def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state} + @impl true def terminate(_reason, state) do _ = SuidHelper.Dmsetup.remove(@pool_name) diff --git a/lib/hyper/node/layer/server.ex b/lib/hyper/node/layer/server.ex index 79e70491..d7249a74 100644 --- a/lib/hyper/node/layer/server.ex +++ b/lib/hyper/node/layer/server.ex @@ -119,6 +119,12 @@ defmodule Hyper.Node.Layer.Server do {:noreply, state} end + @impl true + # Each privileged command runs through `System.cmd`, which links a transient + # port to this process; because we trap exits (for `terminate/2` teardown), + # the port's normal close is delivered here. Ignore it. + def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state} + @impl true def terminate(_reason, %State{blk_path: blk_path}) do case SuidHelper.Losetup.detach(blk_path) do diff --git a/lib/hyper/node/reaper.ex b/lib/hyper/node/reaper.ex new file mode 100644 index 00000000..479a1726 --- /dev/null +++ b/lib/hyper/node/reaper.ex @@ -0,0 +1,166 @@ +defmodule Hyper.Node.Reaper do + @moduledoc """ + Per-node periodic, liveness-aware garbage collector for per-VM host resources + that an unclean BEAM death can strand: a firecracker cgroup leaf and a + `hyper-rw-` dm volume whose owning processes' `terminate/2` never ran and + whose vm_id never reboots (so `Hyper.Node.Reclaim`, which runs once at boot, and + the relaunch-time cleanup in the FireVMM path, never get a chance to clear it). + + Liveness is the whole point. The reaper consults two independent sources of + truth for "this vm is alive" (`Plan.orphans/3` removes their union from the + candidate set) and only ever touches `hyper-rw-*` dm names and per-VM cgroup + leaves - never `hyper-thinpool`, `hyper-img-*`, or a live VM's resources. A + candidate must also survive two consecutive ticks (`Plan.confirm/2`) before it + is reaped, so a VM caught mid-boot (resources present, not yet registered) is + given a grace tick rather than destroyed. + + The decision logic lives in the pure `Hyper.Node.Reaper.Plan`; this module is a + thin I/O adapter that gathers the inputs, calls the plan, and executes the + best-effort, idempotent removals. + """ + + use GenServer + use OpenTelemetryDecorator + require Logger + + alias Hyper.Cluster.Routing + alias Hyper.Node.FireVMM.Jailer + alias Hyper.Node.Img + alias Hyper.Node.Reaper.{Config, Plan} + alias Hyper.SuidHelper.{ChrootJail, Dmsetup} + + @vm_sup Hyper.Node.VMSupervisor + + defstruct [:config, last_orphans: MapSet.new()] + + @type t :: %__MODULE__{config: Config.t(), last_orphans: MapSet.t(String.t())} + + @spec start_link(keyword()) :: GenServer.on_start() + def start_link(opts \\ []) do + GenServer.start_link(__MODULE__, opts, name: __MODULE__) + end + + @impl true + def init(opts) do + config = Keyword.get(opts, :config) || Config.load() + + if config.enabled do + schedule(config) + {:ok, %__MODULE__{config: config}} + else + Logger.info("reaper: disabled by config; not starting") + :ignore + end + end + + @impl true + def handle_info(:tick, state) do + state = sweep(state) + schedule(state.config) + {:noreply, state} + end + + # Ignore any unexpected message (port noise, stale timers) rather than crashing. + def handle_info(_msg, state), do: {:noreply, state} + + @spec schedule(Config.t()) :: reference() + defp schedule(config) do + Process.send_after(self(), :tick, Unit.Time.as_ms(config.interval)) + end + + @spec sweep(t()) :: t() + @decorate with_span("Hyper.Node.Reaper.sweep") + defp sweep(%__MODULE__{} = state) do + live = gather_live() + leaves = list_cgroup_leaves() + rw = Plan.rw_ids(list_rw_dm()) + + current = Plan.orphans(live, leaves, rw) + {to_reap, next} = Plan.confirm(current, state.last_orphans) + + Enum.each(to_reap, &reap_one/1) + %{state | last_orphans: next} + end + + # Over-counting "live" only defers a reap (safe); under-counting destroys a live + # VM (catastrophic). So union two independent liveness sources: the local VM + # supervisor's children and the cluster routing table's view of this node. + @spec gather_live() :: MapSet.t(String.t()) + defp gather_live, do: MapSet.union(local_live(), routed_live()) + + @spec local_live() :: MapSet.t(String.t()) + defp local_live do + case Process.whereis(@vm_sup) do + nil -> + MapSet.new() + + _pid -> + @vm_sup + |> DynamicSupervisor.which_children() + |> Enum.map(fn {_, child, _, _} -> child end) + |> Enum.filter(&is_pid/1) + |> Enum.map(&Routing.id_for/1) + |> Enum.reject(&is_nil/1) + |> MapSet.new() + end + end + + @spec routed_live() :: MapSet.t(String.t()) + defp routed_live do + for {id, node} <- Routing.all(), node == node(), into: MapSet.new(), do: id + end + + @spec list_cgroup_leaves() :: [String.t()] + defp list_cgroup_leaves do + parent = Jailer.cgroup_parent_dir() + + case File.ls(parent) do + {:ok, names} -> + # The parent holds the per-VM leaf directories alongside cgroup control + # files (`cgroup.procs`, `cgroup.controllers`, ...); only the directories + # are vm_id leaves. + Enum.filter(names, &File.dir?(Path.join(parent, &1))) + + {:error, :enoent} -> + [] + + {:error, reason} -> + Logger.warning("reaper: could not list cgroup leaves: #{inspect(reason)}") + [] + end + end + + @spec list_rw_dm() :: [String.t()] + defp list_rw_dm do + case Dmsetup.list() do + {:ok, names} -> + names + + {:error, reason} -> + Logger.warning("reaper: could not list dm devices: #{inspect(reason)}") + [] + end + end + + @spec reap_one(String.t()) :: :ok + @decorate with_span("Hyper.Node.Reaper.reap_one", include: [:id]) + defp reap_one(id) do + Logger.warning("reaper: reaping orphan vm #{id}") + + log_result( + "chroot/cgroup", + id, + ChrootJail.remove(Jailer.chroot_dir(id), Jailer.cgroup_dir(id)) + ) + + log_result("dm volume", id, Dmsetup.remove(Img.Mutable.dm_name(id))) + :ok + end + + @spec log_result(String.t(), String.t(), :ok | {:error, term()}) :: :ok + defp log_result(_what, _id, :ok), do: :ok + + defp log_result(what, id, {:error, reason}) do + Logger.warning("reaper: removing #{what} for #{id} failed: #{inspect(reason)}") + end +end diff --git a/lib/hyper/node/reaper/config.ex b/lib/hyper/node/reaper/config.ex new file mode 100644 index 00000000..5c0da184 --- /dev/null +++ b/lib/hyper/node/reaper/config.ex @@ -0,0 +1,35 @@ +defmodule Hyper.Node.Reaper.Config do + @moduledoc """ + Configuration for the per-node resource reaper (`Hyper.Node.Reaper`). + + Every field has a default, so configuration is optional - set only what you want + to change. Durations are `Unit.Time` values, so (like `Hyper.Img.Db.Gc.Config`) + overrides belong in `config/runtime.exs`: + + config :hyper, Hyper.Node.Reaper.Config, + enabled: true, + interval: Unit.Time.s(30) + + Set `enabled: false` to turn the reaper off entirely - it then never starts. + + ## Fields + + * `enabled` - run the reaper at all (default `true`). + * `interval` - rest between reap ticks (default `60s`). The two-strike grace + means an orphan is reaped at most one interval after it is first seen. + """ + + @type t :: %__MODULE__{ + enabled: boolean(), + interval: Unit.Time.t() + } + + defstruct enabled: true, + interval: Unit.Time.s(60) + + @doc "Build the config from app env, filling any unset field with its default." + @spec load() :: t() + def load do + struct!(__MODULE__, Application.get_env(:hyper, __MODULE__, [])) + end +end diff --git a/lib/hyper/node/reaper/plan.ex b/lib/hyper/node/reaper/plan.ex new file mode 100644 index 00000000..17b6dde9 --- /dev/null +++ b/lib/hyper/node/reaper/plan.ex @@ -0,0 +1,32 @@ +defmodule Hyper.Node.Reaper.Plan do + @moduledoc """ + Pure reap-decision core for `Hyper.Node.Reaper`. No I/O. Every safety invariant + is a property of these functions: a live vm_id is never a candidate, only an + orphan seen on two consecutive ticks is reaped, and only `hyper-rw-*` dm names + yield candidates (so `hyper-thinpool` / `hyper-img-*` can never be reaped). + """ + + @rw_prefix "hyper-rw-" + + @doc "vm_ids of orphan rw-volumes from raw `dmsetup ls` names (only `hyper-rw-*`)." + @spec rw_ids([String.t()]) :: [String.t()] + def rw_ids(dm_names) do + for name <- dm_names, + String.starts_with?(name, @rw_prefix), + do: String.replace_prefix(name, @rw_prefix, "") + end + + @doc "Candidate orphans this tick: (cgroup leaves ∪ rw vm_ids) minus the live set." + @spec orphans(MapSet.t(String.t()), [String.t()], [String.t()]) :: MapSet.t(String.t()) + def orphans(live, cgroup_leaves, rw_ids) do + cgroup_leaves + |> MapSet.new() + |> MapSet.union(MapSet.new(rw_ids)) + |> MapSet.difference(live) + end + + @doc "Two-strike grace: reap only ids that were also orphans last tick. Returns {to_reap, next_last}." + @spec confirm(MapSet.t(String.t()), MapSet.t(String.t())) :: + {MapSet.t(String.t()), MapSet.t(String.t())} + def confirm(current, last), do: {MapSet.intersection(current, last), current} +end diff --git a/lib/hyper/node/reclaim.ex b/lib/hyper/node/reclaim.ex new file mode 100644 index 00000000..a358e0c0 --- /dev/null +++ b/lib/hyper/node/reclaim.ex @@ -0,0 +1,99 @@ +defmodule Hyper.Node.Reclaim do + @moduledoc """ + Boot-time reclamation of device-mapper and loop devices orphaned by an unclean + shutdown (SIGKILL or `:erlang.halt`, where the owning GenServers' `terminate/2` + never ran to tear them down). + + Hyper names every dm device it creates with a `hyper-` prefix (`hyper-thinpool`, + `hyper-rw-`, `hyper-img--`), so this removes exactly those - never an + operator's unrelated dm devices. Removal is leaf-first: a device still open by + another (the pool under a thin volume, a snapshot under the next in its chain) + refuses until its dependents are gone, so leftovers are retried until a pass + removes nothing new. Loop devices backing files under Hyper's data dirs are then + detached (the dm devices that held them are gone by that point). + + Entirely best-effort: every failure is logged and boot continues. It runs once, + before any device-owning GenServer starts, so the freshly-booting node never + collides with its own previous instance's leftovers. + """ + + alias Hyper.SuidHelper.{Dmsetup, Losetup} + + require Logger + + @dm_prefix "hyper-" + + @spec run() :: :ok + def run do + reclaim_dm() + reclaim_loops() + :ok + end + + defp reclaim_dm do + case Dmsetup.list() do + {:ok, names} -> + case Enum.filter(names, &String.starts_with?(&1, @dm_prefix)) do + [] -> + :ok + + stale -> + Logger.warning( + "reclaim: removing #{length(stale)} stale dm device(s): #{inspect(stale)}" + ) + + remove_dm(stale) + end + + {:error, reason} -> + Logger.warning("reclaim: could not list dm devices: #{inspect(reason)}") + end + end + + @spec remove_dm([String.t()]) :: :ok + defp remove_dm([]), do: :ok + + defp remove_dm(names) do + {failed, removed_any?} = + Enum.reduce(names, {[], false}, fn name, {failed, any?} -> + case Dmsetup.remove(name) do + :ok -> {failed, true} + {:error, _} -> {[name | failed], any?} + end + end) + + cond do + failed == [] -> :ok + # A pass made progress: a retry may now clear the devices that were still + # held by the ones just removed. + removed_any? -> remove_dm(failed) + true -> Logger.error("reclaim: could not remove dm devices: #{inspect(failed)}") + end + end + + defp reclaim_loops do + case Losetup.list() do + {:ok, pairs} -> + for {dev, backing} <- pairs, under_data_dirs?(backing) do + case Losetup.detach(dev) do + :ok -> + :ok + + {:error, reason} -> + Logger.warning("reclaim: could not detach #{dev} (#{backing}): #{inspect(reason)}") + end + end + + :ok + + {:error, reason} -> + Logger.warning("reclaim: could not list loop devices: #{inspect(reason)}") + end + end + + @spec under_data_dirs?(Path.t()) :: boolean() + defp under_data_dirs?(path) do + String.starts_with?(path, Hyper.Cfg.Dirs.scratch_dir() <> "/") or + String.starts_with?(path, Hyper.Cfg.Dirs.layer_dir() <> "/") + end +end diff --git a/lib/hyper/suid_helper.ex b/lib/hyper/suid_helper.ex index 1a11992a..74dd73bf 100644 --- a/lib/hyper/suid_helper.ex +++ b/lib/hyper/suid_helper.ex @@ -16,7 +16,7 @@ defmodule Hyper.SuidHelper do self-test and reports the base path it was compiled against. """ - alias Hyper.SuidHelper.{Blockdev, Dmsetup, Expected, Losetup} + alias Hyper.SuidHelper.{Dmsetup, Expected} use OpenTelemetryDecorator @@ -51,18 +51,20 @@ defmodule Hyper.SuidHelper do end @doc """ - Check that the setuid helper and every tool it execs are usable on this - machine: the helper binary is present, is the build this release expects - (`verify_version/0`), then each tool submodule's own check. + Check that the setuid helper is usable on this machine: the helper binary is + present, is the build this release expects (`verify_version/0`), and the kernel + exposes the device-mapper targets we need (`Dmsetup.test_system/0`, which also + exercises the helper's configured `dmsetup` binary). + + The `losetup`/`blockdev` binaries are validated by the helper the first time + each is used; their paths live in the helper's own config, not here. """ @spec test_system() :: :ok | {:error, term()} @decorate with_span("Hyper.SuidHelper.test_system") def test_system do with :ok <- helper_present(), - :ok <- verify_version(), - :ok <- Losetup.test_system(), - :ok <- Dmsetup.test_system() do - Blockdev.test_system() + :ok <- verify_version() do + Dmsetup.test_system() end end diff --git a/lib/hyper/suid_helper/blockdev.ex b/lib/hyper/suid_helper/blockdev.ex index be9bc39e..760fae84 100644 --- a/lib/hyper/suid_helper/blockdev.ex +++ b/lib/hyper/suid_helper/blockdev.ex @@ -11,18 +11,9 @@ defmodule Hyper.SuidHelper.Blockdev do @spec device_sectors(Path.t()) :: {:ok, pos_integer()} | {:error, err()} @decorate with_span("Hyper.SuidHelper.Blockdev.device_sectors", include: [:path]) def device_sectors(path) do - case SuidHelper.exec(["blockdev", "--bin", Hyper.Cfg.Tools.blockdev(), "--getsz", path]) do + case SuidHelper.exec(["blockdev", "--getsz", path]) do {:ok, %{"sectors" => n}} -> {:ok, n} {:error, _} = err -> err end end - - @doc "Check the blockdev binary is present." - @spec test_system() :: :ok | {:error, :blockdev_not_found} - @decorate with_span("Hyper.SuidHelper.Blockdev.test_system") - def test_system do - if System.find_executable(Hyper.Cfg.Tools.blockdev()), - do: :ok, - else: {:error, :blockdev_not_found} - end end diff --git a/lib/hyper/suid_helper/chroot_jail.ex b/lib/hyper/suid_helper/chroot_jail.ex index fdc0f1ce..cf8370c0 100644 --- a/lib/hyper/suid_helper/chroot_jail.ex +++ b/lib/hyper/suid_helper/chroot_jail.ex @@ -57,4 +57,25 @@ defmodule Hyper.SuidHelper.ChrootJail do {:error, _} = err -> err end end + + @doc """ + Hand the firecracker API `socket` to the node user so the unprivileged + controller can `connect()` to it. The jailer drops firecracker to a per-VM + uid/gid and chroots it, so the socket it creates is owned by that per-VM id and + the node (a different uid) gets `EACCES` on connect. The helper chowns just + that one socket to its caller (the node user) and chmods it `0660`, leaving the + rest of the per-VM isolation intact. + + Returns `{:error, :socket_pending}` while firecracker has not yet created the + socket, so the caller can keep waiting. + """ + @spec grant_api(Path.t()) :: :ok | {:error, :socket_pending} | {:error, err()} + @decorate with_span("Hyper.SuidHelper.ChrootJail.grant_api", include: [:socket]) + def grant_api(socket) do + case SuidHelper.exec(["chroot-jail", "grant-api", "--socket", socket]) do + {:ok, %{"result" => "granted"}} -> :ok + {:ok, %{"result" => "pending"}} -> {:error, :socket_pending} + {:error, _} = err -> err + end + end end diff --git a/lib/hyper/suid_helper/dmsetup.ex b/lib/hyper/suid_helper/dmsetup.ex index c8dabc88..1be0bbfe 100644 --- a/lib/hyper/suid_helper/dmsetup.ex +++ b/lib/hyper/suid_helper/dmsetup.ex @@ -59,14 +59,7 @@ defmodule Hyper.SuidHelper.Dmsetup do @spec remove(String.t()) :: :ok | {:error, err()} @decorate with_span("Hyper.SuidHelper.Dmsetup.remove", include: [:name]) def remove(name) do - case SuidHelper.exec([ - "dmsetup", - "--bin", - Hyper.Cfg.Tools.dmsetup(), - "remove", - "--retry", - name - ]) do + case SuidHelper.exec(["dmsetup", "remove", "--retry", name]) do {:ok, _} -> :ok {:error, _} = err -> err end @@ -76,39 +69,31 @@ defmodule Hyper.SuidHelper.Dmsetup do @spec message(String.t(), String.t()) :: :ok | {:error, err()} @decorate with_span("Hyper.SuidHelper.Dmsetup.message", include: [:name, :message]) def message(name, message) do - argv = - ["dmsetup", "--bin", Hyper.Cfg.Tools.dmsetup(), "message", name, "--message", message] - - case SuidHelper.exec(argv) do + case SuidHelper.exec(["dmsetup", "message", name, "--message", message]) do {:ok, _} -> :ok {:error, _} = err -> err end end @doc """ - Check the dmsetup binary is present and the kernel exposes the dm targets we - use (snapshot, thin, thin-pool). + Verify the kernel exposes the dm targets we use (snapshot, thin, thin-pool). + + Routes through the setuid helper: `dmsetup targets` opens `/dev/mapper/control`, + which needs root, and the BEAM runs unprivileged. The helper validates its + configured `dmsetup` binary before running it, so a missing or unsafe binary + surfaces here too. """ @spec test_system() :: :ok | {:error, term()} @decorate with_span("Hyper.SuidHelper.Dmsetup.test_system") def test_system do - if System.find_executable(Hyper.Cfg.Tools.dmsetup()), - do: test_targets(), - else: {:error, :dmsetup_not_found} - end - - @doc "Verify the kernel exposes the dm targets we use (snapshot, thin, thin-pool)." - @spec test_targets() :: :ok | {:error, term()} - @decorate with_span("Hyper.SuidHelper.Dmsetup.test_targets") - def test_targets do - case System.cmd(Hyper.Cfg.Tools.dmsetup(), ["targets"], stderr_to_stdout: true) do - {out, 0} -> + case SuidHelper.exec(["dmsetup", "targets"]) do + {:ok, %{"output" => out}} -> have = parse_targets(out) missing = Enum.reject(@required_targets, &MapSet.member?(have, &1)) if missing == [], do: :ok, else: {:error, {:missing_dm_targets, missing}} - {out, code} -> - {:error, {:dmsetup_targets_failed, code, String.trim(out)}} + {:error, {code, msg}} -> + {:error, {:dmsetup_targets_failed, code, msg}} end end @@ -122,6 +107,32 @@ defmodule Hyper.SuidHelper.Dmsetup do |> MapSet.new() end + @doc "Names of every device-mapper device currently present on this host." + @spec list() :: {:ok, [String.t()]} | {:error, err()} + @decorate with_span("Hyper.SuidHelper.Dmsetup.list") + def list do + case SuidHelper.exec(["dmsetup", "ls"]) do + {:ok, %{"output" => out}} -> {:ok, parse_names(out)} + {:error, _} = err -> err + end + end + + @doc false + @spec parse_names(String.t()) :: [String.t()] + def parse_names(out) do + case String.trim(out) do + # `dmsetup ls` prints this sentinel (not a device row) when there are none. + "No devices found" -> + [] + + _ -> + out + |> String.split("\n", trim: true) + |> Enum.map(&(&1 |> String.split() |> List.first())) + |> Enum.reject(&is_nil/1) + end + end + @doc false @spec snapshot_table(Path.t(), Path.t(), pos_integer(), pos_integer()) :: String.t() def snapshot_table(origin_dev, cow_dev, sectors, chunk_sectors) do @@ -145,9 +156,7 @@ defmodule Hyper.SuidHelper.Dmsetup do # create flags (e.g. `--readonly`). Returns the `/dev/mapper/` path. @spec create(String.t(), String.t(), [String.t()]) :: {:ok, Path.t()} | {:error, err()} defp create(name, table, flags) do - argv = - ["dmsetup", "--bin", Hyper.Cfg.Tools.dmsetup(), "create", name] ++ - flags ++ ["--table", table] + argv = ["dmsetup", "create", name] ++ flags ++ ["--table", table] case SuidHelper.exec(argv) do {:ok, %{"device" => dev}} -> {:ok, dev} diff --git a/lib/hyper/suid_helper/losetup.ex b/lib/hyper/suid_helper/losetup.ex index 9ee9191a..3abf4fba 100644 --- a/lib/hyper/suid_helper/losetup.ex +++ b/lib/hyper/suid_helper/losetup.ex @@ -11,7 +11,7 @@ defmodule Hyper.SuidHelper.Losetup do @spec attach_ro(Path.t()) :: {:ok, Path.t()} | {:error, err()} @decorate with_span("Hyper.SuidHelper.Losetup.attach_ro", include: [:path]) def attach_ro(path) do - case SuidHelper.exec(["losetup", "--bin", Hyper.Cfg.Tools.losetup(), "attach", path]) do + case SuidHelper.exec(["losetup", "attach", path]) do {:ok, %{"device" => dev}} -> {:ok, dev} {:error, _} = err -> err end @@ -21,14 +21,7 @@ defmodule Hyper.SuidHelper.Losetup do @spec attach_rw(Path.t()) :: {:ok, Path.t()} | {:error, err()} @decorate with_span("Hyper.SuidHelper.Losetup.attach_rw", include: [:path]) def attach_rw(path) do - case SuidHelper.exec([ - "losetup", - "--bin", - Hyper.Cfg.Tools.losetup(), - "attach", - "--rw", - path - ]) do + case SuidHelper.exec(["losetup", "attach", "--rw", path]) do {:ok, %{"device" => dev}} -> {:ok, dev} {:error, _} = err -> err end @@ -38,18 +31,34 @@ defmodule Hyper.SuidHelper.Losetup do @spec detach(Path.t()) :: :ok | {:error, err()} @decorate with_span("Hyper.SuidHelper.Losetup.detach", include: [:dev]) def detach(dev) do - case SuidHelper.exec(["losetup", "--bin", Hyper.Cfg.Tools.losetup(), "detach", dev]) do + case SuidHelper.exec(["losetup", "detach", dev]) do {:ok, _} -> :ok {:error, _} = err -> err end end - @doc "Check the losetup binary is present." - @spec test_system() :: :ok | {:error, :losetup_not_found} - @decorate with_span("Hyper.SuidHelper.Losetup.test_system") - def test_system do - if System.find_executable(Hyper.Cfg.Tools.losetup()), - do: :ok, - else: {:error, :losetup_not_found} + @doc "Currently-attached loop devices as `{device, backing_file}` pairs." + @spec list() :: {:ok, [{Path.t(), Path.t()}]} | {:error, err()} + @decorate with_span("Hyper.SuidHelper.Losetup.list") + def list do + case SuidHelper.exec(["losetup", "list"]) do + {:ok, %{"output" => out}} -> {:ok, parse_list(out)} + {:error, _} = err -> err + end + end + + @doc false + @spec parse_list(String.t()) :: [{Path.t(), Path.t()}] + def parse_list(out) do + out + |> String.split("\n", trim: true) + |> Enum.flat_map(fn line -> + # `NAME BACK-FILE` rows; a loop with no backing file has only one column + # (nothing for us to reclaim by file), so skip it. + case String.split(line, " ", parts: 2, trim: true) do + [dev, backing] -> [{dev, String.trim(backing)}] + _ -> [] + end + end) end end diff --git a/lib/hyper/vm.ex b/lib/hyper/vm.ex index 0b99da8d..3001e41c 100644 --- a/lib/hyper/vm.ex +++ b/lib/hyper/vm.ex @@ -10,7 +10,6 @@ defmodule Hyper.Vm do @dialyzer {:nowarn_function, [fast_fork: 1, fork: 1]} @type t :: pid() - @type id :: String.t() @typedoc """ What a VM boots from: explicit, already-jail-visible artifact paths for a cold diff --git a/lib/hyper/vm/id.ex b/lib/hyper/vm/id.ex new file mode 100644 index 00000000..7c43d29e --- /dev/null +++ b/lib/hyper/vm/id.ex @@ -0,0 +1,27 @@ +defmodule Hyper.Vm.Id do + @moduledoc """ + A microVM id and its generator. + + An id is a `v` prefix followed by lowercase base32 of 10 random bytes, charset + `[a-z2-7]` - alphanumeric only, no `-`, `_`, or other punctuation. That charset + is the intersection of three independent constraints the id must satisfy at + once: + + * firecracker rejects `_` in an instance id (`InvalidInstanceId`); + * dm/jailer names must not start with `-`; + * registry keys and chroot path components stay trivially safe. + """ + + @type t :: String.t() + + @doc """ + Generate a fresh, random VM id (see the module doc for the charset contract). + + The previous base64url encoding emitted `-` and `_`, so it could produce ids + firecracker refused at boot (`Invalid char (_)`). + """ + @spec generate() :: t() + def generate do + "v" <> Base.encode32(:crypto.strong_rand_bytes(10), padding: false, case: :lower) + end +end diff --git a/lib/mix/tasks/firecracker.install.ex b/lib/mix/tasks/firecracker.install.ex new file mode 100644 index 00000000..38bb7ad1 --- /dev/null +++ b/lib/mix/tasks/firecracker.install.ex @@ -0,0 +1,148 @@ +defmodule Mix.Tasks.Firecracker.Install do + @shortdoc "Download, verify, and install the pinned Firecracker release" + @moduledoc """ + Downloads, verifies, and installs the pinned Firecracker release (v1.16.0) + for the current CPU architecture. + + mix firecracker.install [--prefix DIR] + + Steps performed: + + 1. Detects the CPU architecture (`x86_64` or `aarch64`). + 2. Downloads the release tarball and verifies its SHA-256 checksum. + 3. Extracts the tarball, then copies the binaries to `/firecracker` + and `/jailer` using the **bare basenames** `firecracker` and + `jailer`. The setuid helper validates binaries via `SafeBin<"firecracker">` + and `SafeBin<"jailer">`, which match on basename only — version-stamped + names such as `firecracker-v1.16.0-x86_64` are rejected unconditionally. + 4. Marks both binaries executable (`0o755`). + 5. Prints the `/etc/hyper/config.toml` snippet the operator needs to paste. + + This task installs **unprivileged** binaries and prints configuration. + Privilege at runtime is handled by `hyper-suidhelper` (the setuid helper). + This task does **not** setuid `firecracker` or `jailer`. Install and setuid + the helper separately with `mix suidhelper.install`. + + ## Options + + * `--prefix DIR` — installation directory (default: `/opt/firecracker`). + + ## Security requirements + + After installing, ensure: + + * The binaries are root-owned and **not** group- or world-writable. + The suidhelper refuses binaries with loose permissions. + * `/etc/hyper/config.toml` is root-owned with mode `0644`. + """ + + use Mix.Task + + @version "1.16.0" + @default_prefix "/opt/firecracker" + + @impl Mix.Task + @spec run([String.t()]) :: :ok + def run(argv) do + {opts, _rest, _invalid} = OptionParser.parse(argv, strict: [prefix: :string]) + prefix = Keyword.get(opts, :prefix, @default_prefix) + + arch = detect_arch!() + + case Application.ensure_all_started(:req) do + {:ok, _} -> :ok + {:error, {reason, app}} -> Mix.raise("Cannot start HTTP client #{app}: #{inspect(reason)}") + end + + install!(release_for(arch), prefix) + print_config(prefix) + end + + defp detect_arch! do + case Sys.Arch.current() do + {:ok, arch} -> + arch + + {:error, {:unsupported_arch, raw}} -> + Mix.raise( + "Unsupported CPU architecture #{inspect(raw)}; " <> + "Firecracker supports x86_64 and aarch64." + ) + end + end + + defp release_for(:x86_64) do + %{ + url: + "https://github.com/firecracker-microvm/firecracker/releases/download/" <> + "v#{@version}/firecracker-v#{@version}-x86_64.tgz", + sha256: "bd04e26952d4e158085778c6230a0b383d2619c319182e27eaa9d61a212e92d6", + firecracker_path: "release-v#{@version}-x86_64/firecracker-v#{@version}-x86_64", + jailer_path: "release-v#{@version}-x86_64/jailer-v#{@version}-x86_64" + } + end + + defp release_for(:aarch64) do + %{ + url: + "https://github.com/firecracker-microvm/firecracker/releases/download/" <> + "v#{@version}/firecracker-v#{@version}-aarch64.tgz", + sha256: "531c713cdbc37d4b8bc2533d851aabc0267096afa1768086a37672abb668efd7", + firecracker_path: "release-v#{@version}-aarch64/firecracker-v#{@version}-aarch64", + jailer_path: "release-v#{@version}-aarch64/jailer-v#{@version}-aarch64" + } + end + + defp install!( + %{url: url, sha256: sha256, firecracker_path: fc_rel, jailer_path: jailer_rel}, + prefix + ) do + extract_dir = Path.join(prefix, ".firecracker-extract") + + Mix.shell().info("Downloading Firecracker v#{@version} from #{url} ...") + + case Redist.Targz.install(url, sha256, extract_dir) do + :ok -> :ok + {:error, reason} -> Mix.raise("Download from #{url} failed: #{inspect(reason)}") + end + + dst_fc = Path.join(prefix, "firecracker") + dst_jailer = Path.join(prefix, "jailer") + + # The release ships version-stamped names; copy to bare basenames so SafeBin + # validation passes. The helper matches on basename, not full path. + File.cp!(Path.join(extract_dir, fc_rel), dst_fc) + File.cp!(Path.join(extract_dir, jailer_rel), dst_jailer) + File.chmod!(dst_fc, 0o755) + File.chmod!(dst_jailer, 0o755) + _ = File.rm_rf!(extract_dir) + + Mix.shell().info("Installed #{dst_fc}") + Mix.shell().info("Installed #{dst_jailer}") + end + + defp print_config(prefix) do + fc = Path.join(prefix, "firecracker") + jailer = Path.join(prefix, "jailer") + + # This task runs unprivileged, so the binaries land owned by the invoking + # user. The suidhelper's SafeBin refuses any binary not owned by root and not + # free of group/other write bits, so the operator MUST chown/chmod them or + # every jailer launch fails closed. Print the exact commands rather than a + # vague "ensure root-owned". + Mix.shell().info(""" + + Almost done. Run these as root so the setuid helper will accept the binaries + (it refuses any jailer/firecracker not owned by root): + + sudo chown root:root #{fc} #{jailer} + sudo chmod 0755 #{fc} #{jailer} + + Then add to /etc/hyper/config.toml (file: root-owned, mode 0644): + + [tools] + firecracker = "#{fc}" + jailer = "#{jailer}" + """) + end +end diff --git a/lib/mix/tasks/suidhelper.install.ex b/lib/mix/tasks/suidhelper.install.ex new file mode 100644 index 00000000..2085de3c --- /dev/null +++ b/lib/mix/tasks/suidhelper.install.ex @@ -0,0 +1,93 @@ +defmodule Mix.Tasks.Suidhelper.Install do + @shortdoc "Build, stamp, and install the setuid helper" + @moduledoc """ + Builds, stamps, and installs the Rust setuid helper. + + mix suidhelper.install + + Two steps: + + 1. `cargo xtask stamp` in `native/suidhelper` builds the release binary and + writes its BLAKE3 self-checksum into `.note.sum` (the same step the + `:suidhelper_stamp` compiler runs). + 2. The stamped binary is copied setuid-root (mode `4755`) to + `/usr/local/bin/hyper-suidhelper`. + + The copy needs root, but Mix runs every subprocess in its own session with no + controlling terminal (`erl_child_setup` calls `setsid`), so a nested `sudo` + cannot open `/dev/tty` to prompt for a password. This task therefore only runs + `sudo` itself when it is already non-interactive (`sudo -n` succeeds, e.g. + `NOPASSWD` or a usable cached credential). Otherwise it prints the exact + privileged command for you to run in your own terminal. + + This is the privileged counterpart to `mix suidhelper.stamp`, which stamps + only. `cargo` and the helper's toolchain (see + `native/suidhelper/rust-toolchain.toml`) must be installed. + """ + + use Mix.Task + + @helper_dir "native/suidhelper" + @source Path.join(@helper_dir, "target/release/hyper-suidhelper") + # Must match `Hyper.Config`'s default `suid_helper` path and the xtask's + # `INSTALL_PATH`: a `PATH` location the unprivileged node can exec. + @install_path "/usr/local/bin/hyper-suidhelper" + + @impl Mix.Task + def run(argv) do + stamp!(argv) + install_privileged() + end + + defp stamp!(argv) do + case System.cmd("cargo", ["xtask", "stamp" | argv], + cd: @helper_dir, + into: IO.stream(:stdio, :line) + ) do + {_, 0} -> + :ok + + {_, _} -> + Mix.raise(""" + `cargo xtask stamp` failed building the suidhelper. + + Ensure `cargo` and the helper's toolchain (see #{@helper_dir}/rust-toolchain.toml) + are installed. + """) + end + end + + defp install_privileged do + if passwordless_sudo?() do + Mix.shell().info("Installing #{@source} -> #{@install_path} (setuid root)") + + case System.cmd("sudo", install_argv(), into: IO.stream(:stdio, :line)) do + {_, 0} -> Mix.shell().info("installed #{@install_path} (setuid root)") + {_, _} -> Mix.raise(manual_instructions()) + end + else + Mix.shell().info(manual_instructions()) + end + end + + # `sudo -n true` exits 0 only when sudo can run without prompting. With no + # controlling terminal a cached `tty_tickets` credential is invisible, so this + # is true essentially only under `NOPASSWD` -- exactly the case where the + # nested `sudo install` below can succeed. + defp passwordless_sudo? do + match?({_, 0}, System.cmd("sudo", ["-n", "true"], stderr_to_stdout: true)) + end + + defp install_argv, + do: ["install", "-o", "root", "-g", "root", "-m", "4755", @source, @install_path] + + defp manual_instructions do + """ + + The binary is built and stamped, but installing it setuid-root needs a + password and `sudo` has no terminal to prompt on here. Run the copy yourself: + + sudo #{Enum.join(install_argv(), " ")} + """ + end +end diff --git a/mix.exs b/mix.exs index 7ccae729..fe3f48f4 100644 --- a/mix.exs +++ b/mix.exs @@ -24,6 +24,10 @@ defmodule Hyper.MixProject do # Cache the PLTs in a stable, gitignored dir so CI can cache them. plt_local_path: "priv/plts", plt_core_path: "priv/plts", + # `:mix` is needed so the Mix tasks under `lib/mix/tasks` (which call + # `Mix.raise/1`, `Mix.shell/0`, and implement the `Mix.Task` behaviour) + # resolve instead of tripping `unknown_function`. + plt_add_apps: [:mix], # Verify @specs against actual returns, and flag ignored return values. flags: [:unmatched_returns, :extra_return, :missing_return] ] diff --git a/native/suidhelper/Cargo.toml b/native/suidhelper/Cargo.toml index f5b4b1b7..be6a449c 100644 --- a/native/suidhelper/Cargo.toml +++ b/native/suidhelper/Cargo.toml @@ -48,6 +48,10 @@ path = "tests/util/safe_bin.rs" name = "tools_dmsetup_parsers" path = "tests/tools/dmsetup_parsers.rs" +[[test]] +name = "tools_jailer" +path = "tests/tools/jailer.rs" + [[test]] name = "util_confinement" path = "tests/util/confinement.rs" @@ -56,6 +60,10 @@ path = "tests/util/confinement.rs" name = "tools_chroot_jail_remove" path = "tests/tools/chroot_jail_remove.rs" +[[test]] +name = "tools_chroot_jail_grant_api" +path = "tests/tools/chroot_jail_grant_api.rs" + [[test]] name = "util_chroot_jail" path = "tests/util/chroot_jail.rs" @@ -68,14 +76,22 @@ path = "tests/e2e/config.rs" name = "e2e_argv" path = "tests/e2e/argv.rs" +[[test]] +name = "e2e_jailer" +path = "tests/e2e/jailer.rs" + [[test]] name = "e2e_chroot_jail" path = "tests/e2e/chroot_jail.rs" +[[test]] +name = "config_uid_gid_range" +path = "tests/config_uid_gid_range.rs" + [dependencies] clap = { version = "4", features = ["derive"] } hyper-suidhelper-meta = { path = "meta" } -nix = { version = "0.29", features = ["user", "fs", "dir"] } +nix = { version = "0.29", features = ["user", "fs", "dir", "process"] } serde = { version = "1", features = ["derive"] } serde_json = "1" thiserror = "1" diff --git a/native/suidhelper/src/config.rs b/native/suidhelper/src/config.rs index 425ba386..41b00a75 100644 --- a/native/suidhelper/src/config.rs +++ b/native/suidhelper/src/config.rs @@ -1,6 +1,14 @@ // SPDX-License-Identifier: AGPL-3.0-only //! Runtime host configuration, read from a single root-owned TOML file. +//! +//! ## UID/GID range divergence +//! +//! Elixir keeps a default `{900_000, 999_999}` that governs which UIDs the node +//! hands *out*; this helper reads `[jails] uid_gid_range` from config.toml to +//! decide which UIDs it *accepts* (default `{900_000, 999_999}` when the key is +//! absent). Operators narrowing the range must set **both**. +use crate::util::safe_bin::{self, SafeBin}; use crate::util::safe_file::{self, IsRegularFile, OnlyRootWritable, RootOwner, SafeFile}; use crate::util::safe_path::{self, IsAbsolute, SafePath, StrictComponents}; use nix::fcntl::OFlag; @@ -21,11 +29,56 @@ pub enum LoadingError { Malformed(PathBuf), #[error("work_dir in {0:?} must be an absolute path")] Relative(PathBuf), + #[error("uid_gid_range.min must be >= 1 and <= max (got min={min}, max={max})")] + BadUidGidRange { min: u32, max: u32 }, +} + +/// Error returned by config accessors for tool binaries derived from config. +#[derive(Debug, Error)] +pub enum BinError { + #[error("required binary `{0}` is not configured in /etc/hyper/config.toml")] + Unconfigured(&'static str), + #[error(transparent)] + Bin(#[from] safe_bin::Error), } const CONFIG_PATHSTR: &str = "/etc/hyper/config.toml"; const INSECURE_CONFIG_PATH_ENV: &str = "HYPER_SETUIDHELPER_CONFIG_PATH"; +/// UID/GID allocation band, read from `[jails] uid_gid_range` in config.toml as +/// a two-element `[min, max]` array. Controls which UIDs the helper *accepts* +/// from the BEAM — see module docs. +#[derive(Debug, Clone, Copy, Deserialize)] +#[serde(from = "[u32; 2]")] +pub struct UidGidRange { + pub min: u32, + pub max: u32, +} + +impl From<[u32; 2]> for UidGidRange { + fn from([min, max]: [u32; 2]) -> Self { + Self { min, max } + } +} + +// Band defaults match Elixir's `compile_env` allocation defaults so that an +// unconfigured helper and an unconfigured node agree out of the box. +const DEFAULT_UID_GID: (u32, u32) = (900_000, 999_999); + +/// Validate a uid_gid_range value. A present range where min==0 or min>max is +/// treated as a config trust violation — fatal at load, consistent with the +/// "present but untrusted" model. Exposed so tests can verify the contract +/// without touching the file system. +pub fn validate_uid_gid_range(r: &UidGidRange) -> Result<(), LoadingError> { + if r.min == 0 || r.min > r.max { + return Err(LoadingError::BadUidGidRange { + min: r.min, + max: r.max, + }); + } + Ok(()) +} + /// The config file path. In production this is the fixed `/etc/hyper/config.toml`. /// Only in INSECURE TEST MODE (both gates open) may an env var redirect it — the /// secure arm is always the hardcoded path, so a release build cannot be steered. @@ -44,13 +97,103 @@ fn config_path() -> PathBuf { #[derive(Debug, Clone, Deserialize)] pub struct Config { work_dir: PathBuf, + #[serde(default)] + tools: Tools, + #[serde(default)] + jails: Jails, +} + +/// The `[jails]` table: how the helper places and confines each VM jail. +/// +/// `cgroup` is the parent cgroup the jailer nests every VM beneath (default +/// `"hyper"`). `uid_gid_range` is the `[min, max]` band of UIDs/GIDs the helper +/// accepts from the BEAM; absent means the built-in default. A missing `[jails]` +/// table, or any missing key within it, falls back to these defaults. +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct Jails { + cgroup: String, + uid_gid_range: Option, +} + +impl Default for Jails { + fn default() -> Self { + Self { + cgroup: default_parent_cgroup(), + uid_gid_range: None, + } + } +} + +/// Paths to the external binaries the helper runs, the `[tools]` table. +/// +/// The device tools (`dmsetup`, `losetup`, `blockdev`) carry built-in defaults; +/// `firecracker` and `jailer` have none and must be configured before any VM can +/// launch — their absence surfaces as [`BinError::Unconfigured`] at use time, not +/// at load. Every path is validated as a root-owned, correctly-named [`SafeBin`] +/// when accessed, never at parse time (the file is read unprivileged). A missing +/// `[tools]` table, or any missing key within it, falls back to these defaults. +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct Tools { + dmsetup: PathBuf, + losetup: PathBuf, + blockdev: PathBuf, + firecracker: Option, + jailer: Option, +} + +impl Default for Tools { + fn default() -> Self { + Self { + dmsetup: default_dmsetup(), + losetup: default_losetup(), + blockdev: default_blockdev(), + firecracker: None, + jailer: None, + } + } +} + +// The default data root. Must match the Elixir node's `@dev_work_dir`, which it +// uses when the same config file is absent, so both sides agree (see +// `Hyper.Node.check_helper_base`). +fn default_work_dir() -> PathBuf { + PathBuf::from("/srv/hyper") +} + +fn default_dmsetup() -> PathBuf { + PathBuf::from("/usr/sbin/dmsetup") +} + +fn default_losetup() -> PathBuf { + PathBuf::from("/usr/sbin/losetup") +} + +fn default_blockdev() -> PathBuf { + PathBuf::from("/usr/sbin/blockdev") +} + +fn default_parent_cgroup() -> String { + // Must match Elixir node's `@parent_cgroup`; operators need to keep them in sync. + "hyper".into() +} + +impl Default for Config { + fn default() -> Self { + Self { + work_dir: default_work_dir(), + tools: Tools::default(), + jails: Jails::default(), + } + } } impl Config { /// The process-wide config, loaded once (and forced unprivileged by - /// [`Config::init`]). A load failure is fatal: the helper cannot safely - /// operate without a trusted data root, so it prints the error and exits - /// rather than guessing a default. + /// [`Config::init`]). An absent file yields the built-in defaults; a + /// *present but untrusted* file (wrong owner/mode, malformed) is fatal — + /// the helper prints the error and exits rather than trusting it. pub fn get() -> &'static Config { LazyLock::force(&CONFIG) } @@ -58,7 +201,7 @@ impl Config { /// Force the config to load now. Call this once at the very start of `main`, /// after privileges have already been dropped (the `.preinit_array` entry in /// `setuid_privileged` runs before `main`), so the file is never first read - /// lazily from inside a `Privileged` scope - i.e. it is guaranteed to be read + /// lazily from inside a `Privileged` scope — i.e. it is guaranteed to be read /// as the real uid, not as root. pub fn init() { let _ = Self::get(); @@ -74,6 +217,60 @@ impl Config { self.work_dir.join("jails") } + /// The validated `dmsetup` binary the helper will run. + pub fn dmsetup(&self) -> Result, safe_bin::Error> { + SafeBin::from_path(&self.tools.dmsetup) + } + + /// The validated `losetup` binary the helper will run. + pub fn losetup(&self) -> Result, safe_bin::Error> { + SafeBin::from_path(&self.tools.losetup) + } + + /// The validated `blockdev` binary the helper will run. + pub fn blockdev(&self) -> Result, safe_bin::Error> { + SafeBin::from_path(&self.tools.blockdev) + } + + /// The Firecracker VMM binary, validated as root-owned and correctly named. + /// Errors [`BinError::Unconfigured`] when absent from config — an operator + /// must set `[tools] firecracker` before any VM can be launched. + pub fn firecracker(&self) -> Result, BinError> { + self.tools + .firecracker + .as_deref() + .ok_or(BinError::Unconfigured("firecracker")) + .and_then(|p| SafeBin::from_path(p).map_err(BinError::Bin)) + } + + /// The Firecracker jailer binary, validated as root-owned and correctly named. + /// Errors [`BinError::Unconfigured`] when absent from config — an operator + /// must set `[tools] jailer` before any VM can be launched. + pub fn jailer(&self) -> Result, BinError> { + self.tools + .jailer + .as_deref() + .ok_or(BinError::Unconfigured("jailer")) + .and_then(|p| SafeBin::from_path(p).map_err(BinError::Bin)) + } + + /// The jailer `--parent-cgroup` value, from `[jails] cgroup`. Defaults to + /// `"hyper"`, matching the Elixir node's default. + pub fn parent_cgroup(&self) -> &str { + &self.jails.cgroup + } + + /// The UID/GID band the helper accepts from the BEAM. Defaults to + /// `(900_000, 999_999)` when the key is absent (matching Elixir's defaults). + /// A present range with min==0 or min>max is rejected at load time by + /// [`Config::safe_load`], so this accessor is always total. + pub fn uid_gid_range(&self) -> (u32, u32) { + self.jails + .uid_gid_range + .map(|r| (r.min, r.max)) + .unwrap_or(DEFAULT_UID_GID) + } + /// Read, ownership-check, parse, and validate the config file. See the module /// docs for the trust model. pub fn safe_load() -> Result { @@ -82,7 +279,18 @@ impl Config { let safe_path: SafePath = path.clone().try_into()?; let file: SafeFile = - SafeFile::open(&safe_path, OFlag::O_RDONLY)?; + match SafeFile::open(&safe_path, OFlag::O_RDONLY) { + Ok(file) => file, + // A genuinely-absent file means "use the built-in defaults": those + // are compiled into this root-owned binary, so they are trusted. Any + // OTHER failure — a present but wrong-owner/mode file, an I/O error — + // stays fatal, because it is a signal (someone put an untrusted file + // there), not an absence. + Err(safe_file::ValidationError::Open(nix::errno::Errno::ENOENT)) => { + return Ok(Self::default()) + } + Err(e) => return Err(e.into()), + }; let body = std::io::read_to_string(std::fs::File::from(file.into_owned_fd())) .map_err(|_| LoadingError::Unreadable(path.clone()))?; @@ -92,6 +300,11 @@ impl Config { if !config.work_dir.is_absolute() { return Err(LoadingError::Relative(path)); } + + if let Some(r) = &config.jails.uid_gid_range { + validate_uid_gid_range(r)?; + } + Ok(config) } } diff --git a/native/suidhelper/src/main.rs b/native/suidhelper/src/main.rs index 617fd6d5..11b08319 100644 --- a/native/suidhelper/src/main.rs +++ b/native/suidhelper/src/main.rs @@ -15,6 +15,7 @@ use clap::{Parser, Subcommand}; use hyper_suidhelper::config; +use hyper_suidhelper::tools::jailer::{self, JailerArgs}; use hyper_suidhelper::tools::Tool; use hyper_suidhelper::util::setuid_privileged::{self, Privileged}; use serde::Serialize; @@ -37,6 +38,9 @@ enum Command { Tool(Tool), /// Check the helper is correctly installed (can promote to root). SysTest, + /// Validate the caller's args, become root, and `execve` the firecracker + /// jailer in our place. Prints nothing on success (the image is replaced). + Jailer(JailerArgs), /// Print the build version and BLAKE3 checksum of this binary. Version, } @@ -101,12 +105,21 @@ fn main() { config::Config::init(); // Each command yields a serializable value (errors stringified to unify); we - // render the final JSON line here. + // render the final JSON line here. The jailer is the exception: it `execve`s + // in place, so on success it never returns and never emits JSON, and on + // failure it reports to stderr and exits before reaching the JSON pipeline. let output = match command { Command::Tool(tool) => tool.run().map(Output::Tool).map_err(|e| e.to_string()), Command::SysTest => SysTest::perform() .map(Output::SysTest) .map_err(|e| e.to_string()), + Command::Jailer(args) => { + let e = jailer::run(args).expect_err("jailer::run only returns on error"); + eprintln!("{e}"); + // _exit bypasses atexit handlers; safe because we are permanently root + // at this point and must not run any cleanup registered before escalation. + unsafe { nix::libc::_exit(2) } + } Command::Version => unreachable!("handled above"), }; diff --git a/native/suidhelper/src/tools/chroot_jail/grant_api.rs b/native/suidhelper/src/tools/chroot_jail/grant_api.rs new file mode 100644 index 00000000..4ec94a21 --- /dev/null +++ b/native/suidhelper/src/tools/chroot_jail/grant_api.rs @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: AGPL-3.0-only +//! `chroot-jail grant-api`: hand the firecracker API socket to the node user so +//! the unprivileged controller can `connect()` to it. +//! +//! The jailer drops firecracker to a per-VM uid/gid and chroots it; firecracker +//! then creates its API socket at `/root/api.socket` owned by that per-VM +//! id. Connecting a unix socket needs *write* permission on the node, so the +//! node user (a different uid) gets `EACCES`. This op chowns just that one +//! socket to the helper's CALLER — `getuid()`/`getgid()`, which inside the +//! privileged scope are the real (caller) ids while euid is 0 — and chmods it +//! `0660`. The node thus connects as owner, and humans added to the node's +//! group connect via the group bit. +//! +//! That alone is not enough: the jailer leaves `/root` as `0700` owned by +//! the per-VM uid, and connecting needs *search* (`+x`) on every ancestor, so +//! the node cannot even traverse into `root` to reach the (now its own) socket. +//! So this op also opens just that one directory to the caller's group: it keeps +//! the per-VM uid as owner (firecracker still needs it), chgrps `root` to the +//! caller's gid, and chmods it `0710` — owner `rwx`, group `--x` (traverse, not +//! list), other none. Per-VM isolation is otherwise untouched: only this socket +//! and its immediate parent's group/mode move, nothing else in the jail, and +//! unrelated users stay locked out. +//! +//! Security: the socket path is validated as a `SafePath` and reached by an +//! `O_NOFOLLOW` walk from `JAIL_BASE`, so a symlinked component cannot redirect +//! the chown outside the jail, and every op is fd-relative on the pinned `root` +//! dir fd, never by re-resolved name. The leaf must be exactly `api.socket` +//! `//root` below the base, and `fstatat(AT_SYMLINK_NOFOLLOW)` must +//! report a *socket* — a regular file or symlink planted at that name is an +//! attack and is refused, never chmod'd. A missing socket (`ENOENT`, anywhere on +//! the path) is `Pending`, not an error: firecracker has not created it yet, so +//! the controller keeps probing. + +use crate::config::Config; +use crate::tools::IsTool; +use crate::util::safe_dir::{self, SafeDir}; +use crate::util::safe_path::{self, IsAbsolute, SafePath, StrictComponents}; +use clap::Args; +use nix::errno::Errno; +use nix::sys::stat::SFlag; +use nix::unistd::{getgid, getuid}; +use serde::Serialize; +use std::path::{Path, PathBuf}; +use thiserror::Error as ThisError; + +/// The fixed in-jail socket name firecracker opens (mirrors the Elixir +/// `Hyper.Node.FireVMM.Jailer` `@jail_socket`). +const SOCKET_NAME: &str = "api.socket"; + +/// The socket sits at `///root/api.socket`: three parent +/// components (``, ``, `root`) before the leaf. +const SOCKET_PARENT_DEPTH: usize = 3; + +/// Mode handed to the node: owner+group read/write, no world access. +const SOCKET_MODE: u32 = 0o660; + +/// Mode set on the jail `root` dir so the node's group can *traverse* it to +/// reach the socket: owner `rwx` (the per-VM uid, unchanged), group `--x` +/// (traverse, not list), other none. +const JAIL_ROOT_MODE: u32 = 0o710; + +type LexicalPath = SafePath; + +#[derive(Debug, ThisError)] +pub enum Error { + #[error("--socket path: {0}")] + SocketPath(#[source] safe_path::ValidationError), + #[error("--socket must be exactly //root/api.socket below JAIL_BASE: {0:?}")] + SocketShape(PathBuf), + #[error("--socket leaf must be {SOCKET_NAME:?}: {0:?}")] + SocketName(PathBuf), + #[error("walking to the jail root: {0}")] + Walk(#[source] safe_dir::Error), + #[error("api.socket is not a socket (or is a symlink); refusing to touch it")] + NotASocket, + #[error("statting the socket: {0}")] + Stat(#[source] safe_dir::Error), + #[error("chowning the socket to the caller: {0}")] + Chown(#[source] safe_dir::Error), + #[error("chmoding the socket: {0}")] + Chmod(#[source] safe_dir::Error), + #[error("chgrp-ing the jail root dir to the caller: {0}")] + ChgrpRoot(#[source] safe_dir::Error), + #[error("chmoding the jail root dir for traversal: {0}")] + ChmodRoot(#[source] safe_dir::Error), +} + +#[derive(Args)] +pub struct GrantApiArgs { + /// Host path of the firecracker API socket, shape + /// ///root/api.socket. + #[arg(long)] + socket: PathBuf, +} + +#[derive(Debug, Serialize)] +#[serde(tag = "result", rename_all = "snake_case")] +pub enum GrantOut { + /// The socket was handed to the caller (chowned + chmoded). + Granted, + /// The socket does not exist yet; the caller should keep waiting. + Pending, +} + +/// Run the `grant-api` op in its own privileged scope (returns its serialized `Value`). +pub fn run(args: GrantApiArgs) -> Result { + GrantApi { args }.run() +} + +struct GrantApi { + args: GrantApiArgs, +} + +impl IsTool for GrantApi { + type Args = GrantApiArgs; + type Output = GrantOut; + type RunT = Result; + + fn run_privileged(&self) -> Self::RunT { + grant_api_under(&Config::get().jail_base(), &self.args.socket) + } + + fn parse(&self, res: Self::RunT) -> Result> { + Ok(res?) + } +} + +/// Hand `socket` (`///root/api.socket`) to the helper's +/// caller, fd-relative after an `O_NOFOLLOW` walk from `jail_base`. Returns +/// `Pending` if any path component or the socket itself is not yet present. +pub fn grant_api_under(jail_base: &Path, socket: &Path) -> Result { + let path: LexicalPath = socket.to_path_buf().try_into().map_err(Error::SocketPath)?; + let (parents, leaf) = path.relative_to(jail_base).map_err(Error::SocketPath)?; + if parents.len() != SOCKET_PARENT_DEPTH { + return Err(Error::SocketShape(socket.to_path_buf())); + } + if leaf != Path::new(SOCKET_NAME) { + return Err(Error::SocketName(socket.to_path_buf())); + } + + let Some(root) = walk(jail_base.to_path_buf(), &parents)? else { + return Ok(GrantOut::Pending); // jail not fully created yet + }; + + let leaf = Path::new(SOCKET_NAME); + let stat = match root.stat(leaf) { + Ok(stat) => stat, + Err(e) if e.errno() == Some(Errno::ENOENT) => return Ok(GrantOut::Pending), + Err(e) => return Err(Error::Stat(e)), + }; + // `stat` used AT_SYMLINK_NOFOLLOW, so a symlink reports as S_IFLNK and fails + // this check too: only a real socket is accepted, anything else is refused. + if stat.st_mode & SFlag::S_IFMT.bits() != SFlag::S_IFSOCK.bits() { + return Err(Error::NotASocket); + } + + root.chown(leaf, getuid().as_raw(), getgid().as_raw()) + .map_err(Error::Chown)?; + root.chmod(leaf, SOCKET_MODE).map_err(Error::Chmod)?; + + // Open `root` itself to the caller's group so the node can traverse into it + // to reach the socket (the jailer leaves it 0700 / per-VM uid). Owner stays + // the per-VM uid; only the group and mode move. Operate on the pinned `root` + // fd (already opened `O_NOFOLLOW`), never by name - TOCTOU-safe. + root.chgrp_self(getgid().as_raw()) + .map_err(Error::ChgrpRoot)?; + root.chmod_self(JAIL_ROOT_MODE).map_err(Error::ChmodRoot)?; + Ok(GrantOut::Granted) +} + +/// Open `base` and walk `parents` from it (`O_NOFOLLOW` each step). Returns +/// `Ok(None)` if `base` or any parent is not yet present (`ENOENT`), so the +/// caller can treat a half-built jail as `Pending` rather than an error. +fn walk(base: PathBuf, parents: &[PathBuf]) -> Result, Error> { + let base_path: LexicalPath = base.try_into().map_err(Error::SocketPath)?; + let anchor = match SafeDir::open(&base_path) { + Ok(dir) => dir, + Err(e) if e.errno() == Some(Errno::ENOENT) => return Ok(None), + Err(e) => return Err(Error::Walk(e)), + }; + match anchor.descend(parents) { + Ok(dir) => Ok(Some(dir)), + Err(e) if e.errno() == Some(Errno::ENOENT) => Ok(None), + Err(e) => Err(Error::Walk(e)), + } +} diff --git a/native/suidhelper/src/tools/chroot_jail/mod.rs b/native/suidhelper/src/tools/chroot_jail/mod.rs index d65bac7b..3a06c35a 100644 --- a/native/suidhelper/src/tools/chroot_jail/mod.rs +++ b/native/suidhelper/src/tools/chroot_jail/mod.rs @@ -1,9 +1,11 @@ // SPDX-License-Identifier: AGPL-3.0-only //! `chroot-jail`: per-VM chroot/jail lifecycle. +pub mod grant_api; mod prepare; pub mod remove; +pub use grant_api::GrantApiArgs; pub use prepare::PrepareArgs; pub use remove::RemoveArgs; @@ -15,6 +17,8 @@ pub enum ChrootJailOp { Prepare(PrepareArgs), /// Remove a VM's stale chroot and cgroup leaf before relaunching the jailer. Remove(RemoveArgs), + /// Hand the firecracker API socket to the node user (chown to caller, 0660). + GrantApi(GrantApiArgs), } impl ChrootJailOp { @@ -25,6 +29,7 @@ impl ChrootJailOp { match self { ChrootJailOp::Prepare(args) => prepare::run(args), ChrootJailOp::Remove(args) => remove::run(args), + ChrootJailOp::GrantApi(args) => grant_api::run(args), } } } diff --git a/native/suidhelper/src/tools/chroot_jail/remove.rs b/native/suidhelper/src/tools/chroot_jail/remove.rs index 57e19495..7574b364 100644 --- a/native/suidhelper/src/tools/chroot_jail/remove.rs +++ b/native/suidhelper/src/tools/chroot_jail/remove.rs @@ -7,8 +7,11 @@ //! symlinked component cannot redirect the deletion outside the tree, and removal //! is fd-relative (`unlinkat`), never by re-resolved name. `--chroot` must be //! exactly `/` below `JAIL_BASE`; `--cgroup` at least two components -//! below its base (a non-recursive `rmdir`). Both deletes are idempotent: a -//! missing target (`ENOENT`, and for the cgroup `ENOTEMPTY`) is success. +//! below its base. Before removing the cgroup leaf we write `cgroup.kill` to +//! SIGKILL any process still in the subtree (a still-live firecracker would +//! otherwise keep the leaf non-empty and leak its loop/dm devices), then +//! `rmdir` it (non-recursive). Both deletes are idempotent: a missing target +//! (`ENOENT`, and for the cgroup `ENOTEMPTY`) is success. use crate::config::Config; use crate::tools::IsTool; @@ -18,11 +21,18 @@ use clap::Args; use nix::errno::Errno; use serde::Serialize; use std::path::{Path, PathBuf}; +use std::time::Duration; use thiserror::Error as ThisError; /// The cgroup virtual filesystem root. const CGROUP_BASE: &str = "/sys/fs/cgroup"; +/// How many times to retry the leaf `rmdir` while a killed cgroup drains, and the +/// pause between tries: ~`ATTEMPTS * BACKOFF` total (a cgroup reaps in a few ms), +/// after which a still-busy leaf is left for a later sweep rather than failing. +const RMDIR_ATTEMPTS: u32 = 20; +const RMDIR_BACKOFF_MS: u64 = 5; + type LexicalPath = SafePath; #[derive(Debug, ThisError)] @@ -41,6 +51,8 @@ pub enum Error { RemoveChroot(#[source] safe_dir::Error), #[error("removing cgroup: {0}")] RemoveCgroup(#[source] safe_dir::Error), + #[error("killing cgroup procs: {0}")] + KillCgroup(#[source] safe_dir::Error), } #[derive(Args)] @@ -74,8 +86,10 @@ impl IsTool for Remove { type RunT = Result<(), Error>; fn run_privileged(&self) -> Self::RunT { - remove_chroot_under(&Config::get().jail_base(), &self.args.chroot)?; + // Cgroup first: it SIGKILLs any firecracker still alive in the leaf, so the + // chroot teardown below is not yanking files out from under a live process. remove_cgroup_under(Path::new(CGROUP_BASE), &self.args.cgroup)?; + remove_chroot_under(&Config::get().jail_base(), &self.args.chroot)?; Ok(()) } @@ -105,9 +119,11 @@ pub fn remove_chroot_under(jail_base: &Path, chroot: &Path) -> Result<(), Error> } } -/// Remove the (empty) per-VM cgroup leaf, fd-relative after an `O_NOFOLLOW` walk -/// from `base`. `--cgroup` must be at least two components below `base` (one or -/// more parents, plus the leaf). Idempotent on `ENOENT`/`ENOTEMPTY`. +/// SIGKILL any process still in the per-VM cgroup leaf (via `cgroup.kill`), then +/// `rmdir` it, fd-relative after an `O_NOFOLLOW` walk from `base`. `--cgroup` must +/// be at least two components below `base` (one or more parents, plus the leaf). +/// Idempotent: a missing leaf is success, and a leaf that has not finished +/// draining after the kill (`EBUSY`/`ENOTEMPTY`) is left for a later sweep. pub fn remove_cgroup_under(base: &Path, cgroup: &Path) -> Result<(), Error> { let path: LexicalPath = cgroup.to_path_buf().try_into().map_err(Error::CgroupPath)?; let (parents, leaf) = path.relative_to(base).map_err(Error::CgroupPath)?; @@ -118,10 +134,50 @@ pub fn remove_cgroup_under(base: &Path, cgroup: &Path) -> Result<(), Error> { let Some(parent) = walk(base.to_path_buf(), &parents)? else { return Ok(()); // an ancestor is already gone }; - match parent.rmdir(&leaf) { + + // Kill any process still in the leaf cgroup before removing it. Open the leaf + // O_NOFOLLOW (one step past the parent walk) and write "1" to cgroup.kill. + match parent.openat_dir(&leaf) { + Ok(leaf_dir) => kill_cgroup(&leaf_dir)?, + Err(e) if e.errno() == Some(Errno::ENOENT) => return Ok(()), // leaf already gone + Err(e) => return Err(Error::Walk(e)), + } + + rmdir_drained(&parent, &leaf) +} + +/// `rmdir` the killed cgroup leaf, retrying while it drains. `cgroup.kill` signals +/// SIGKILL synchronously but the kernel reaps the processes and offlines the +/// cgroup asynchronously, so the `rmdir` briefly returns `EBUSY`/`ENOTEMPTY`. +/// Retry a bounded number of times; if it still has not drained, the processes are +/// already dead (the kill is the load-bearing op) and a later sweep or the next +/// relaunch clears the empty leaf, so a persistent busy is success — never fail +/// the caller (a relaunch) over leftover-dir cleanup. `ENOENT` means already gone. +fn rmdir_drained(parent: &SafeDir, leaf: &Path) -> Result<(), Error> { + for attempt in 0..RMDIR_ATTEMPTS { + match parent.rmdir(leaf) { + Ok(()) => return Ok(()), + Err(e) if e.errno() == Some(Errno::ENOENT) => return Ok(()), + Err(e) if matches!(e.errno(), Some(Errno::EBUSY | Errno::ENOTEMPTY)) => { + if attempt + 1 < RMDIR_ATTEMPTS { + std::thread::sleep(Duration::from_millis(RMDIR_BACKOFF_MS)); + } + } + Err(e) => return Err(Error::RemoveCgroup(e)), + } + } + Ok(()) +} + +/// Best-effort SIGKILL of every process in the v2 cgroup `leaf_dir`: write "1" to +/// its `cgroup.kill` pseudo-file. A missing file (`ENOENT`: pre-5.14/non-v2 +/// kernel, or already-emptied leaf) or a cgroup torn down concurrently (`ENODEV`) +/// is tolerated — killing must not hard-fail the remove. +fn kill_cgroup(leaf_dir: &SafeDir) -> Result<(), Error> { + match leaf_dir.write_file(Path::new("cgroup.kill"), b"1") { Ok(()) => Ok(()), - Err(e) if matches!(e.errno(), Some(Errno::ENOENT | Errno::ENOTEMPTY)) => Ok(()), - Err(e) => Err(Error::RemoveCgroup(e)), + Err(e) if matches!(e.errno(), Some(Errno::ENOENT | Errno::ENODEV)) => Ok(()), + Err(e) => Err(Error::KillCgroup(e)), } } diff --git a/native/suidhelper/src/tools/dmsetup/mod.rs b/native/suidhelper/src/tools/dmsetup/mod.rs index eef016ad..d77674df 100644 --- a/native/suidhelper/src/tools/dmsetup/mod.rs +++ b/native/suidhelper/src/tools/dmsetup/mod.rs @@ -52,6 +52,11 @@ enum DmOp { #[arg(long)] message: ThinMessage, }, + /// List the target types the kernel device-mapper exposes. Read-only, but + /// still needs root: it opens `/dev/mapper/control`. + Targets, + /// List the names of existing dm devices (for stale-device reclaim). + Ls, } #[derive(Serialize)] @@ -60,6 +65,8 @@ pub enum DmsetupOut { Created { device: PathBuf }, Removed, Messaged, + Targets { output: String }, + Listed { output: String }, } pub struct Dmsetup { @@ -105,6 +112,12 @@ impl IsTool for Dmsetup { .arg("0") .arg(message.to_string()); } + DmOp::Targets => { + cmd.arg("targets"); + } + DmOp::Ls => { + cmd.arg("ls"); + } } cmd.env_clear().output() @@ -124,6 +137,12 @@ impl IsTool for Dmsetup { }, DmOp::Remove { .. } => DmsetupOut::Removed, DmOp::Message { .. } => DmsetupOut::Messaged, + DmOp::Targets => DmsetupOut::Targets { + output: String::from_utf8_lossy(&out.stdout).into_owned(), + }, + DmOp::Ls => DmsetupOut::Listed { + output: String::from_utf8_lossy(&out.stdout).into_owned(), + }, }) } } diff --git a/native/suidhelper/src/tools/jailer.rs b/native/suidhelper/src/tools/jailer.rs new file mode 100644 index 00000000..ec29ac8a --- /dev/null +++ b/native/suidhelper/src/tools/jailer.rs @@ -0,0 +1,358 @@ +// SPDX-License-Identifier: AGPL-3.0-only +//! The `jailer` subcommand: validate the BEAM's arguments, re-acquire root +//! permanently, and `execve` the firecracker jailer in our place. +//! +//! Unlike the device tools this is **not** an [`crate::tools::IsTool`]: it does +//! not run a child and parse JSON, it *becomes* the jailer via `execve`, so the +//! unprivileged BEAM's MuonTrap port keeps supervising the resulting process +//! across the image replacement. There is no output and no return on success. +//! +//! Threat model: the BEAM is untrusted. It supplies only `--id`, `--uid`, +//! `--gid`, repeated `--cgroup KEY=VALUE`, and `--api-sock`. Every privileged +//! path (the jailer binary, the firecracker `--exec-file`, the chroot base, the +//! parent cgroup) comes from the root-owned config, never the caller. The +//! refusal contracts on the newtypes below are the security core: a compromised +//! BEAM must not be able to name a privileged path, request uid 0, traverse out +//! of the chroot base, inject a flag, or smuggle an environment/fd into root. +//! +//! ## Validator laws (property-tested in `tests/tools/jailer.rs`) +//! - [`validate_id_number`] accepts iff `n != 0 && lo <= n <= hi`; 0 is rejected +//! for *every* range (uid 0 makes the jailer skip its privilege drop). +//! - [`VmId`] round-trips exactly the allowed charset/length and rejects any +//! separator, dot, NUL, whitespace, leading dash, empty, or over-long input. +//! - [`CgroupSetting`] re-renders a valid pair to its canonical `key=value` and +//! rejects unknown keys and values outside the per-key grammar. +//! - [`JailSock`] accepts exactly `/` + one filename and rejects multi-component, +//! relative, `..`, and NUL/whitespace inputs. + +use crate::config::{BinError, Config}; +use crate::util::setuid_privileged; +use clap::Args; +use nix::errno::Errno; +use std::ffi::CString; +use std::fmt; +use std::os::unix::ffi::OsStrExt; +use std::path::{Path, PathBuf}; +use std::str::FromStr; +use thiserror::Error as ThisError; + +/// The jailer protects at most a handful of controllers; an unbounded `--cgroup` +/// list is a caller trying something. Cap it well above any legitimate need. +const MAX_CGROUPS: usize = 16; + +/// `sun_path` in `sockaddr_un` is 108 bytes on Linux; a socket path longer than +/// that can never be bound, so reject it up front. +const MAX_SOCK_LEN: usize = 108; + +#[derive(Debug, ThisError)] +pub enum Error { + #[error("invalid --id {0:?}: must be 1..=64 chars of [A-Za-z0-9_-], not starting with '-'")] + VmId(String), + #[error("invalid --cgroup {0:?}: unknown key or value outside its grammar")] + Cgroup(String), + #[error("invalid --api-sock {0:?}: must be / with name in [A-Za-z0-9_.-]")] + Sock(String), + #[error("--uid/--gid {value} is zero or outside the configured range [{lo}, {hi}]")] + IdNumber { value: u32, lo: u32, hi: u32 }, + #[error("too many --cgroup settings: {0} (max {MAX_CGROUPS})")] + TooManyCgroups(usize), + #[error(transparent)] + Bin(#[from] BinError), + #[error(transparent)] + Privilege(#[from] setuid_privileged::Error), + #[error("argument contains an interior NUL byte")] + NulArgument, + #[error("execve {path:?} failed: {errno}")] + Exec { path: PathBuf, errno: Errno }, +} + +/// `n != 0 && lo <= n <= hi`. uid/gid 0 is rejected unconditionally: a jailer run +/// with uid 0 skips its privilege drop and leaves firecracker running as root. +pub fn validate_id_number(n: u32, range: (u32, u32)) -> Result { + let (lo, hi) = range; + if n != 0 && lo <= n && n <= hi { + Ok(n) + } else { + Err(Error::IdNumber { value: n, lo, hi }) + } +} + +/// A VM id used as a chroot subdirectory name: `[A-Za-z0-9_-]`, length `1..=64`, +/// first character not `-` (so it can never be read as a flag). No `/`, `.`, NUL, +/// or whitespace can appear, so it can never traverse out of the chroot base. +#[derive(Debug, Clone)] +pub struct VmId(String); + +impl FromStr for VmId { + type Err = Error; + + fn from_str(s: &str) -> Result { + let reject = || Error::VmId(s.to_string()); + let bytes = s.as_bytes(); + if !(1..=64).contains(&bytes.len()) { + return Err(reject()); + } + if bytes[0] == b'-' { + return Err(reject()); + } + if !bytes + .iter() + .all(|&b| b.is_ascii_alphanumeric() || b == b'_' || b == b'-') + { + return Err(reject()); + } + Ok(Self(s.to_string())) + } +} + +impl fmt::Display for VmId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +/// `1..=20` ASCII digits — bounds a numeric cgroup limit without pulling in a +/// regex engine. The upper bound comfortably exceeds `u64::MAX`'s 20 digits. +fn is_digits_1_20(s: &str) -> bool { + !s.is_empty() && s.len() <= 20 && s.bytes().all(|b| b.is_ascii_digit()) +} + +/// A single `KEY=VALUE` cgroup setting from an allowlist. The helper re-emits +/// `key=value` itself from the canonical key, so the jailer never sees the +/// caller's raw bytes. Per-key value grammar: +/// - `memory.max` : `[0-9]{1,20}` or the literal `max` +/// - `cpu.max` : `[0-9]{1,20} [0-9]{1,20}` or `max [0-9]{1,20}` +#[derive(Debug, Clone)] +pub struct CgroupSetting { + key: &'static str, + value: String, +} + +impl FromStr for CgroupSetting { + type Err = Error; + + fn from_str(s: &str) -> Result { + let reject = || Error::Cgroup(s.to_string()); + // Split on the FIRST `=`. None of the value grammars contains a `=`, so a + // second `=` lands in `value` and is rejected by the grammar check below. + let (raw_key, value) = s.split_once('=').ok_or_else(reject)?; + + let key: &'static str = match raw_key { + "memory.max" => "memory.max", + "cpu.max" => "cpu.max", + _ => return Err(reject()), + }; + + let valid = match key { + "memory.max" => value == "max" || is_digits_1_20(value), + "cpu.max" => match value.split_once(' ') { + Some((quota, period)) => { + (quota == "max" || is_digits_1_20(quota)) && is_digits_1_20(period) + } + None => false, + }, + _ => false, + }; + + if valid { + Ok(Self { + key, + value: value.to_string(), + }) + } else { + Err(reject()) + } + } +} + +impl fmt::Display for CgroupSetting { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}={}", self.key, self.value) + } +} + +/// The firecracker API socket path: an absolute path that is exactly `/` plus one +/// filename in `[A-Za-z0-9_.-]`. The charset excludes `/`, so the value is always +/// a direct child of `/` with no extra components and no traversal; `.`/`..` as +/// the whole filename are rejected explicitly. +#[derive(Debug, Clone)] +pub struct JailSock(String); + +impl FromStr for JailSock { + type Err = Error; + + fn from_str(s: &str) -> Result { + let reject = || Error::Sock(s.to_string()); + if s.len() > MAX_SOCK_LEN { + return Err(reject()); + } + let name = s.strip_prefix('/').ok_or_else(reject)?; + if name.is_empty() || name == "." || name == ".." { + return Err(reject()); + } + if !name + .bytes() + .all(|b| b.is_ascii_alphanumeric() || b == b'_' || b == b'.' || b == b'-') + { + return Err(reject()); + } + Ok(Self(s.to_string())) + } +} + +impl fmt::Display for JailSock { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +#[derive(Args)] +pub struct JailerArgs { + /// Microvm id; becomes the chroot subdirectory name. + #[arg(long)] + id: VmId, + /// Unprivileged uid the jailer drops to (rejected if 0 or out of range). + #[arg(long)] + uid: u32, + /// Unprivileged gid the jailer drops to (rejected if 0 or out of range). + #[arg(long)] + gid: u32, + /// Repeatable `KEY=VALUE` cgroup setting from the allowlist. + #[arg(long = "cgroup")] + cgroup: Vec, + /// Absolute firecracker API socket path (single filename under `/`). + #[arg(long = "api-sock")] + api_sock: JailSock, +} + +fn cstr_bytes(bytes: &[u8]) -> Result { + CString::new(bytes).map_err(|_| Error::NulArgument) +} + +fn cstr_str(s: &str) -> Result { + cstr_bytes(s.as_bytes()) +} + +fn cstr_path(p: &Path) -> Result { + cstr_bytes(p.as_os_str().as_bytes()) +} + +/// Build the exact argv handed to the jailer. argv[0] is the jailer path. The +/// caller never names the jailer, the `--exec-file`, the chroot base, the cgroup +/// version, or the parent cgroup — those are derived from trusted config here. +#[allow(clippy::too_many_arguments)] +fn build_argv( + jailer: &Path, + id: &VmId, + firecracker: &Path, + uid: u32, + gid: u32, + jail_base: &Path, + parent_cgroup: &str, + cgroups: &[CgroupSetting], + api_sock: &JailSock, +) -> Result, Error> { + let mut argv = vec![ + cstr_path(jailer)?, + cstr_str("--id")?, + cstr_str(&id.to_string())?, + cstr_str("--exec-file")?, + cstr_path(firecracker)?, + cstr_str("--uid")?, + cstr_str(&uid.to_string())?, + cstr_str("--gid")?, + cstr_str(&gid.to_string())?, + cstr_str("--chroot-base-dir")?, + cstr_path(jail_base)?, + cstr_str("--cgroup-version")?, + cstr_str("2")?, + cstr_str("--parent-cgroup")?, + cstr_str(parent_cgroup)?, + ]; + + for cg in cgroups { + argv.push(cstr_str("--cgroup")?); + argv.push(cstr_str(&cg.to_string())?); + } + + argv.push(cstr_str("--")?); + argv.push(cstr_str("--api-sock")?); + argv.push(cstr_str(&api_sock.to_string())?); + + Ok(argv) +} + +/// Close every inherited fd above stdio so a compromised BEAM cannot smuggle an +/// open fd into the root jailer. Keep 0/1/2: MuonTrap supervises the jailer +/// through stdio, and stderr carries our exec-failure message. `close_range(2)` +/// needs Linux 5.9+; on any failure (ENOSYS or otherwise) we fall back to +/// closing each fd individually — fail closed before handing root to the jailer. +fn close_inherited_fds() { + const FIRST: u32 = 3; + // SAFETY: raw syscall with no memory operands; closing fds has no UB. + let rc = unsafe { nix::libc::close_range(FIRST, u32::MAX, 0) }; + if rc == 0 { + return; + } + + // SAFETY: sysconf is a pure query of a system limit. + let max = unsafe { nix::libc::sysconf(nix::libc::_SC_OPEN_MAX) }; + let max = if max < 0 { 4096 } else { max as i32 }; + for fd in (FIRST as i32)..max { + // SAFETY: closing an arbitrary fd is safe; EBADF for unused fds is ignored. + unsafe { + nix::libc::close(fd); + } + } +} + +/// Validate the caller's args, then permanently become root and `execve` the +/// jailer. On success this never returns (the process image is replaced); the +/// `Ok` arm is therefore [`std::convert::Infallible`]. Every failure is returned +/// as [`Error`] for the caller to print and exit non-zero. +pub fn run(args: JailerArgs) -> Result { + let config = Config::get(); + + // Resolve everything that can fail as the REAL uid, before any privilege is + // raised: config accessors, binary validation, range, and arg validation. + let jailer: PathBuf = config.jailer()?.into(); + let firecracker: PathBuf = config.firecracker()?.into(); + let jail_base = config.jail_base(); + let parent_cgroup = config.parent_cgroup(); + let range = config.uid_gid_range(); + + let uid = validate_id_number(args.uid, range)?; + let gid = validate_id_number(args.gid, range)?; + + if args.cgroup.len() > MAX_CGROUPS { + return Err(Error::TooManyCgroups(args.cgroup.len())); + } + + let argv = build_argv( + &jailer, + &args.id, + &firecracker, + uid, + gid, + &jail_base, + parent_cgroup, + &args.cgroup, + &args.api_sock, + )?; + let jailer_cstr = cstr_path(&jailer)?; + + // Point of no return: from here we are permanently root and must execve. + setuid_privileged::become_root_permanently()?; + close_inherited_fds(); + + // Empty envp: once ruid==0 the kernel clears AT_SECURE, so a smuggled + // LD_PRELOAD/LD_LIBRARY_PATH would be honored by the dynamic loader and + // hijack the root jailer. We pass nothing and let the jailer build its own. + let empty_env: [CString; 0] = []; + let errno = nix::unistd::execve(&jailer_cstr, &argv, &empty_env) + .expect_err("execve only returns on failure"); + Err(Error::Exec { + path: jailer, + errno, + }) +} diff --git a/native/suidhelper/src/tools/losetup.rs b/native/suidhelper/src/tools/losetup.rs index fa8a49d9..aadb5b67 100644 --- a/native/suidhelper/src/tools/losetup.rs +++ b/native/suidhelper/src/tools/losetup.rs @@ -66,6 +66,8 @@ enum LosetupOp { Attach(AttachArgs), /// Detach a loop device. Detach { dev: LoopDev }, + /// List loop devices as `NAME BACK-FILE` rows (for stale-device reclaim). + List, } #[derive(Serialize)] @@ -73,6 +75,7 @@ enum LosetupOp { pub enum LosetupOut { Attached { device: PathBuf }, Detached, + Listed { output: String }, } pub struct Losetup { @@ -101,6 +104,15 @@ impl IsTool for Losetup { let dev: &Path = dev.as_ref(); cmd.arg("-d").arg(dev); } + LosetupOp::List => { + cmd.args([ + "--list", + "--noheadings", + "--raw", + "--output", + "NAME,BACK-FILE", + ]); + } } cmd.env_clear().output() @@ -119,6 +131,9 @@ impl IsTool for Losetup { device: PathBuf::from(String::from_utf8_lossy(&out.stdout).trim()), }, LosetupOp::Detach { .. } => LosetupOut::Detached, + LosetupOp::List => LosetupOut::Listed { + output: String::from_utf8_lossy(&out.stdout).into_owned(), + }, }) } } diff --git a/native/suidhelper/src/tools/mod.rs b/native/suidhelper/src/tools/mod.rs index 10c53c47..9f9fedae 100644 --- a/native/suidhelper/src/tools/mod.rs +++ b/native/suidhelper/src/tools/mod.rs @@ -1,11 +1,13 @@ //! Per-tool CLI fragments and their `IsTool` implementations. Each tool lives in -//! its own submodule and owns its own error type, operand validation, and `--bin` -//! parser; this module owns the shared trait, the `Tool` subcommand tree, and the -//! privilege boundary. +//! its own submodule and owns its own error type and operand validation; this +//! module owns the shared trait, the `Tool` subcommand tree, and the privilege +//! boundary. The binary each tool runs is resolved from the trusted config here, +//! never passed by the caller. mod blockdev; pub mod chroot_jail; mod dmsetup; +pub mod jailer; mod losetup; pub use blockdev::{Blockdev, BlockdevArgs}; @@ -13,15 +15,15 @@ pub use chroot_jail::ChrootJailOp; pub use dmsetup::{DmTable, Dmsetup, DmsetupArgs, ThinMessage}; pub use losetup::{Losetup, LosetupArgs}; -use crate::util::safe_bin::SafeBin; +use crate::config::Config; use crate::util::setuid_privileged::{self, Privileged}; use clap::Subcommand; use serde::Serialize; use thiserror::Error as ThisError; -/// Errors of the dispatch layer: whatever the privilege guard or the chosen tool -/// raises on the way out. (`--bin` and operand validation are handled by clap at -/// parse time, so they never reach here.) +/// Errors of the dispatch layer: an invalid configured binary (`SafeBin`), the +/// privilege guard, or the chosen tool's own failure on the way out. (Operand +/// validation is handled by clap at parse time, so it never reaches here.) #[derive(Debug, ThisError)] pub enum Error { #[error(transparent)] @@ -65,28 +67,23 @@ pub trait IsTool { } } -/// The subcommand tree: one subcommand per tool, each taking its own `--bin` -/// with the tool-specific args flattened in from the submodule. +/// The subcommand tree: one subcommand per tool, with the tool-specific args +/// flattened in from the submodule. The binary each tool runs is not a caller +/// argument - it comes from the root-owned config (see [`Config`]). #[derive(Subcommand)] pub enum Tool { /// Attach/detach loop devices. Losetup { - #[arg(long)] - bin: SafeBin<"losetup">, #[command(flatten)] args: LosetupArgs, }, /// Create/remove device-mapper snapshot devices. Dmsetup { - #[arg(long)] - bin: SafeBin<"dmsetup">, #[command(flatten)] args: DmsetupArgs, }, /// Query a block device's size. Blockdev { - #[arg(long)] - bin: SafeBin<"blockdev">, #[command(flatten)] args: BlockdevArgs, }, @@ -99,13 +96,24 @@ pub enum Tool { impl Tool { /// Dispatch to the selected tool's `run` (or, for `chroot-jail`, its nested - /// op), returning its already-serialized `Value`. The `--bin` is already - /// validated (it is a `SafeBin`, constructed only by its value parser). + /// op), returning its already-serialized `Value`. The binary path is taken + /// from the trusted config and validated (`SafeBin`) here, as the real uid, + /// before any privilege is acquired. pub fn run(self) -> Result { + let config = Config::get(); match self { - Tool::Losetup { bin, args } => Losetup::new(bin.into(), args).run(), - Tool::Dmsetup { bin, args } => Dmsetup::new(bin.into(), args).run(), - Tool::Blockdev { bin, args } => Blockdev::new(bin.into(), args).run(), + Tool::Losetup { args } => { + let bin = config.losetup().map_err(|e| Error::Tool(Box::new(e)))?; + Losetup::new(bin.into(), args).run() + } + Tool::Dmsetup { args } => { + let bin = config.dmsetup().map_err(|e| Error::Tool(Box::new(e)))?; + Dmsetup::new(bin.into(), args).run() + } + Tool::Blockdev { args } => { + let bin = config.blockdev().map_err(|e| Error::Tool(Box::new(e)))?; + Blockdev::new(bin.into(), args).run() + } Tool::ChrootJail { op } => op.run(), } } diff --git a/native/suidhelper/src/util/chroot_jail.rs b/native/suidhelper/src/util/chroot_jail.rs index 70d4ce78..6a618187 100644 --- a/native/suidhelper/src/util/chroot_jail.rs +++ b/native/suidhelper/src/util/chroot_jail.rs @@ -43,6 +43,12 @@ pub enum Error { }, #[error("copy kernel: {0}")] Copy(#[source] io::Error), + #[error("resolving block device {path}: {source}")] + ResolveDev { + path: PathBuf, + #[source] + source: io::Error, + }, } /// An unfilled artifact slot. @@ -190,8 +196,18 @@ pub fn stage_kernel_under( /// `uid:gid`. The device is opened as a verified `SafeFile`, so the /// number comes from a real device node, never a caller-supplied value. fn make_rootfs(chroot: &SafeDir, device: &BlockDev, uid: u32, gid: u32) -> Result<(), Error> { - let dev_path: SafePath = - device.as_ref().to_path_buf().try_into()?; + // `device` is lexically validated as `/dev/loopN` or `/dev/mapper/hyper-*`. + // The dm form is a symlink under `/dev/mapper` — a root-owned `0755` dir an + // unprivileged caller cannot write — so resolving it to its real `/dev/dm-N` + // node cannot be redirected by the caller. We must resolve first: `SafeFile` + // opens `O_NOFOLLOW` (it must never follow an attacker-supplied symlink) and + // would otherwise reject the dm symlink itself. Loop nodes resolve to + // themselves. The re-opened target is still verified `IsBlockDevice`. + let real = std::fs::canonicalize(device.as_ref()).map_err(|source| Error::ResolveDev { + path: device.as_ref().to_path_buf(), + source, + })?; + let dev_path: SafePath = real.try_into()?; let dev = SafeFile::::open(&dev_path, OFlag::O_PATH)?; let rdev = dev.rdev()?; chroot.mknod_block(Path::new(ROOTFS_NAME), rdev, uid, gid)?; diff --git a/native/suidhelper/src/util/safe_bin.rs b/native/suidhelper/src/util/safe_bin.rs index 8f323aa7..91084407 100644 --- a/native/suidhelper/src/util/safe_bin.rs +++ b/native/suidhelper/src/util/safe_bin.rs @@ -1,17 +1,16 @@ -//! A validated `--bin` path. +//! A validated tool-binary path. //! -//! The caller names the binary to run, but it must be the expected tool and a -//! binary only root could have produced. [`SafeBin`] is a newtype whose only -//! constructor runs those checks, so holding one is proof the path was -//! validated. The const string parameter `NAME` is the basename it was validated -//! against - a `SafeBin<"losetup">` can never be passed where a -//! `SafeBin<"dmsetup">` is wanted. Combined with the [`FromStr`] impl (see -//! `tools`), clap validates the path at argument-parse time with no per-tool -//! boilerplate. +//! The path of each device tool (`losetup`, `dmsetup`, `blockdev`) comes from +//! the root-owned config file, never from the unprivileged caller. [`SafeBin`] +//! is a newtype whose only constructor runs the safety checks, so holding one is +//! proof the path was validated. The const string parameter `NAME` is the +//! basename it was validated against - a `SafeBin<"losetup">` can never be +//! passed where a `SafeBin<"dmsetup">` is wanted. //! -//! These checks are what keep this from being arbitrary-root-execution: an -//! unprivileged caller cannot point us at a binary it controls (must be -//! root-owned, not group/other-writable, not a symlink, exact basename). +//! These checks are what keep this from being arbitrary-root-execution: even a +//! mistaken config entry cannot point us at a binary a non-root user controls +//! (must be an absolute path, the exact basename, root-owned, not a symlink, not +//! group/other-writable). use std::ffi::OsStr; use std::fs; @@ -23,14 +22,14 @@ use thiserror::Error as ThisError; #[derive(Debug, ThisError)] pub enum Error { - #[error("--bin must be an absolute path: {0}")] + #[error("binary path must be absolute: {0}")] NotAbsolute(PathBuf), - #[error("--bin basename must be `{expected}`: {got}")] + #[error("binary basename must be `{expected}`: {got}")] Name { expected: &'static str, got: PathBuf, }, - #[error("--bin {path}: {source}")] + #[error("binary {path}: {source}")] Stat { path: PathBuf, #[source] @@ -44,22 +43,17 @@ pub enum Error { Writable(PathBuf), } -/// A `--bin` path validated to have basename `NAME`. The wrapped path is private -/// and the only constructor is the [`FromStr`] impl, so a `SafeBin` value cannot -/// exist without having been checked. +/// A tool-binary path validated to have basename `NAME`. The wrapped path is +/// private and the only constructor is [`SafeBin::from_path`], so a `SafeBin` +/// value cannot exist without having been checked. #[derive(Debug, Clone)] pub struct SafeBin(PathBuf); -// Lets clap validate `--bin` at parse time straight into a `SafeBin`, with -// no per-tool value parser: the const basename is the whole spec. Validates that -// `s` is an absolute path with basename `NAME`, a regular root-owned file no -// non-root user could have written. -impl FromStr for SafeBin { - type Err = Error; - - fn from_str(s: &str) -> Result { - let bin = Path::new(s); - +impl SafeBin { + /// Validate `bin` as the `NAME` tool binary: an absolute path with basename + /// `NAME`, a real (non-symlink) regular file owned by root that no non-root + /// user could have written. These checks are the whole point of the type. + pub fn from_path(bin: &Path) -> Result { if !bin.is_absolute() { return Err(Error::NotAbsolute(bin.to_path_buf())); } @@ -93,6 +87,16 @@ impl FromStr for SafeBin { } } +// Lets a string parse straight into a validated `SafeBin` (used by the +// test suite); delegates to the single `from_path` constructor. +impl FromStr for SafeBin { + type Err = Error; + + fn from_str(s: &str) -> Result { + Self::from_path(Path::new(s)) + } +} + // Read the validated path back out; the "validated" guarantee stays attached to // the `SafeBin` type until this conversion. impl From> for PathBuf { diff --git a/native/suidhelper/src/util/safe_dir.rs b/native/suidhelper/src/util/safe_dir.rs index 388ae838..61808d8a 100644 --- a/native/suidhelper/src/util/safe_dir.rs +++ b/native/suidhelper/src/util/safe_dir.rs @@ -17,8 +17,8 @@ use super::safe_path::SafePath; use nix::dir::{Dir, Type}; use nix::fcntl::{openat, AtFlags, OFlag}; use nix::libc::dev_t; -use nix::sys::stat::{mknodat, Mode, SFlag}; -use nix::unistd::{dup, fchownat, linkat, unlinkat, Gid, Uid, UnlinkatFlags}; +use nix::sys::stat::{fchmod, fchmodat, fstatat, mknodat, FchmodatFlags, FileStat, Mode, SFlag}; +use nix::unistd::{dup, fchown, fchownat, linkat, unlinkat, write, Gid, Uid, UnlinkatFlags}; use std::ffi::OsStr; use std::os::unix::ffi::OsStrExt; use std::os::unix::io::{AsRawFd, FromRawFd, OwnedFd, RawFd}; @@ -33,10 +33,16 @@ pub enum Error { ReadDir(#[source] nix::Error), #[error("unlinkat {name:?}: {source}")] Unlink { name: PathBuf, source: nix::Error }, + #[error("write {name:?}: {source}")] + Write { name: PathBuf, source: nix::Error }, #[error("mknodat {name:?}: {source}")] Mknod { name: PathBuf, source: nix::Error }, #[error("fchownat {name:?}: {source}")] Chown { name: PathBuf, source: nix::Error }, + #[error("fchmodat {name:?}: {source}")] + Chmod { name: PathBuf, source: nix::Error }, + #[error("fstatat {name:?}: {source}")] + Stat { name: PathBuf, source: nix::Error }, #[error("linkat -> {name:?}: {source}")] Link { name: PathBuf, source: nix::Error }, #[error("dup: {0}")] @@ -50,8 +56,11 @@ impl Error { match self { Error::Open { source, .. } | Error::Unlink { source, .. } + | Error::Write { source, .. } | Error::Mknod { source, .. } | Error::Chown { source, .. } + | Error::Chmod { source, .. } + | Error::Stat { source, .. } | Error::Link { source, .. } => Some(*source), Error::ReadDir(source) | Error::Dup(source) => Some(*source), } @@ -133,6 +142,39 @@ impl SafeDir { Ok(unsafe { SafeFile::from_raw_fd(raw) }) } + /// Open existing file `name` write-only (`O_WRONLY|O_NOFOLLOW|O_CLOEXEC`, no + /// `O_CREAT`/`O_TRUNC`) and write `contents`. For kernel pseudo-files such as + /// `cgroup.kill` where a single `write` is the whole API. `O_NOFOLLOW` rejects a + /// symlinked `name`, so the write cannot be redirected out of this pinned dir. + pub fn write_file(&self, name: &Path, contents: &[u8]) -> Result<(), Error> { + let raw = openat( + Some(self.0.as_raw_fd()), + name, + OFlag::O_WRONLY | OFlag::O_NOFOLLOW | OFlag::O_CLOEXEC, + Mode::empty(), + ) + .map_err(|source| Error::Open { + name: name.to_path_buf(), + source, + })?; + // SAFETY: openat just handed us this fd; nobody else owns it. + let fd = unsafe { OwnedFd::from_raw_fd(raw) }; + let mut off = 0; + while off < contents.len() { + match write(&fd, &contents[off..]) { + Ok(0) => break, + Ok(n) => off += n, + Err(source) => { + return Err(Error::Write { + name: name.to_path_buf(), + source, + }) + } + } + } + Ok(()) + } + /// Create a block device node `name` in this directory with the given /// `rdev`, then chown it to `uid:gid`. pub fn mknod_block(&self, name: &Path, rdev: dev_t, uid: u32, gid: u32) -> Result<(), Error> { @@ -180,6 +222,55 @@ impl SafeDir { }) } + /// `fstat` entry `name` relative to this dir's fd without following a final + /// symlink (`AT_SYMLINK_NOFOLLOW`). A symlink stats as itself (`S_IFLNK`), + /// never its target, so a caller inspecting the file type can reject one. + pub fn stat(&self, name: &Path) -> Result { + fstatat(Some(self.0.as_raw_fd()), name, AtFlags::AT_SYMLINK_NOFOLLOW).map_err(|source| { + Error::Stat { + name: name.to_path_buf(), + source, + } + }) + } + + /// `chmod` entry `name` to `mode`. Linux's `fchmodat` has no working + /// no-follow mode (it returns `ENOTSUP`), so this follows a final symlink; + /// call it only after [`stat`](Self::stat) has proven `name` is not a + /// symlink, so the follow is a no-op on a real (non-link) entry. + pub fn chmod(&self, name: &Path, mode: u32) -> Result<(), Error> { + fchmodat( + Some(self.0.as_raw_fd()), + name, + Mode::from_bits_truncate(mode), + FchmodatFlags::FollowSymlink, + ) + .map_err(|source| Error::Chmod { + name: name.to_path_buf(), + source, + }) + } + + /// `fchmod` this directory through its own held fd. Unlike [`chmod`](Self::chmod), + /// which re-resolves a *name*, this targets the fd we already opened + /// `O_NOFOLLOW`, so there is no path component to swap - TOCTOU-safe on the + /// directory itself. + pub fn chmod_self(&self, mode: u32) -> Result<(), Error> { + fchmod(self.0.as_raw_fd(), Mode::from_bits_truncate(mode)).map_err(|source| Error::Chmod { + name: PathBuf::from("."), + source, + }) + } + + /// `fchown` this directory's group through its own held fd, preserving its + /// owner (no uid passed). Same TOCTOU guarantee as [`chmod_self`](Self::chmod_self). + pub fn chgrp_self(&self, gid: u32) -> Result<(), Error> { + fchown(self.0.as_raw_fd(), None, Some(Gid::from_raw(gid))).map_err(|source| Error::Chown { + name: PathBuf::from("."), + source, + }) + } + /// Remove the non-directory entry `name` from this directory. pub fn unlink(&self, name: &Path) -> Result<(), Error> { unlinkat(Some(self.0.as_raw_fd()), name, UnlinkatFlags::NoRemoveDir).map_err(|source| { diff --git a/native/suidhelper/src/util/setuid_privileged.rs b/native/suidhelper/src/util/setuid_privileged.rs index 3c58bd39..e334b115 100644 --- a/native/suidhelper/src/util/setuid_privileged.rs +++ b/native/suidhelper/src/util/setuid_privileged.rs @@ -12,17 +12,20 @@ //! constructor nor a Drop impl can report an error, and silently staying root //! would defeat the point. -use nix::unistd::{getuid, seteuid, Uid}; +use nix::unistd::{getuid, seteuid, setgroups, setresgid, setresuid, Gid, Uid}; use thiserror::Error as ThisError; -/// Failures of the privilege transitions. Both are fatal: if we can't raise we -/// aren't installed setuid root, and if we can't lower we refuse to keep running. +/// Failures of the privilege transitions. All fatal: if we can't raise we aren't +/// installed setuid root, if we can't lower we refuse to keep running, and if we +/// can't seal permanent root we refuse to hand off to the execve target. #[derive(Debug, ThisError)] pub enum Error { #[error("not installed setuid root")] NotSetuidRoot, #[error("failed to drop privileges")] DropPrivileges, + #[error("failed to acquire permanent root for execve handoff")] + PermanentRoot, } /// `.preinit_array` runs before `.init_array` and before any shared-library @@ -76,6 +79,36 @@ impl Privileged { } } +/// Re-acquire full root **permanently** for an `execve` handoff and return — +/// there is deliberately no [`Privileged`] Drop guard, because `execve` replaces +/// the entire process image: nothing of this process survives to run a destructor, +/// and the new image (the firecracker jailer) is responsible for dropping its own +/// privileges. We must hand it a *genuine* root process (all of real, effective +/// and saved uid == 0) so the jailer's own privilege-drop is the real thing. +/// +/// Order matters and each step needs the privilege the previous one preserves: +/// 1. `seteuid(0)` — regain effective root; without it the rest are EPERM. +/// 2. `setresgid(0,0,0)` — set every gid to root *before* touching uids, while +/// we still hold euid 0 (gid changes require privilege). +/// 3. `setgroups([0])` — `drop_to_real` only lowered euid; it never touched the +/// caller's supplementary groups, so the jailer would otherwise inherit them. +/// Reset to just {0} now, while still privileged. +/// 4. `setresuid(0,0,0)` LAST — this seals real+effective+saved uid to root. It +/// goes last because once the saved uid is 0 there is no escape hatch left +/// (which is the point: permanent), and because it must follow the gid/group +/// changes that needed our euid-0 to be permitted. +pub fn become_root_permanently() -> Result<(), Error> { + let root_uid = Uid::from_raw(0); + let root_gid = Gid::from_raw(0); + + seteuid(root_uid).map_err(|_| Error::NotSetuidRoot)?; + setresgid(root_gid, root_gid, root_gid).map_err(|_| Error::PermanentRoot)?; + setgroups(&[root_gid]).map_err(|_| Error::PermanentRoot)?; + setresuid(root_uid, root_uid, root_uid).map_err(|_| Error::PermanentRoot)?; + + Ok(()) +} + impl Drop for Privileged { fn drop(&mut self) { if lower().is_err() { diff --git a/native/suidhelper/tests/config_uid_gid_range.rs b/native/suidhelper/tests/config_uid_gid_range.rs new file mode 100644 index 00000000..1a6f4647 --- /dev/null +++ b/native/suidhelper/tests/config_uid_gid_range.rs @@ -0,0 +1,59 @@ +//! Properties of the `uid_gid_range` configuration field. +//! +//! Contracts under test: +//! - A valid range (min >= 1, min <= max) is always accepted. +//! - Absent range yields the built-in default (900_000, 999_999). +//! - A valid range round-trips through TOML deserialization + uid_gid_range(). +//! - min == 0 is always rejected (uid 0 means root; the jailer must never +//! receive it — it skips its privilege drop when uid == 0). +//! - min > max is always rejected (incoherent range; likely a config typo). + +use hyper_suidhelper::config::{validate_uid_gid_range, Config, LoadingError, UidGidRange}; +use proptest::prelude::*; + +#[test] +fn absent_range_yields_default() { + assert_eq!(Config::default().uid_gid_range(), (900_000, 999_999)); +} + +proptest! { + #[test] + fn valid_range_accepted(min in 1u32.., delta in 0u32..) { + // max = min + delta, saturating so it never wraps past u32::MAX. + let max = min.saturating_add(delta); + let r = UidGidRange { min, max }; + prop_assert!(validate_uid_gid_range(&r).is_ok()); + } + + #[test] + fn valid_range_round_trips_via_toml(min in 1u32.., delta in 0u32..) { + let max = min.saturating_add(delta); + let body = format!( + "work_dir = \"/srv/hyper\"\n[jails]\nuid_gid_range = [{min}, {max}]\n" + ); + let config: Config = toml::from_str(&body).expect("valid TOML"); + prop_assert_eq!(config.uid_gid_range(), (min, max)); + } + + #[test] + fn zero_min_always_rejected(max in 0u32..) { + let r = UidGidRange { min: 0, max }; + let rejected = matches!( + validate_uid_gid_range(&r), + Err(LoadingError::BadUidGidRange { min: 0, .. }) + ); + prop_assert!(rejected); + } + + #[test] + fn min_exceeds_max_always_rejected(max in 0u32..u32::MAX) { + // min = max + 1 is always strictly greater than max and always >= 1. + let min = max + 1; + let r = UidGidRange { min, max }; + let rejected = matches!( + validate_uid_gid_range(&r), + Err(LoadingError::BadUidGidRange { .. }) + ); + prop_assert!(rejected); + } +} diff --git a/native/suidhelper/tests/e2e/argv.rs b/native/suidhelper/tests/e2e/argv.rs index 6ebf997a..25c5981a 100644 --- a/native/suidhelper/tests/e2e/argv.rs +++ b/native/suidhelper/tests/e2e/argv.rs @@ -1,7 +1,7 @@ //! L4: prove the exact argv (and empty env) the helper hands to the child tool — -//! the one thing the design deliberately hides from the caller. We point `--bin` -//! at a root-owned fake that writes its argv+env to a file as JSON, then assert -//! on the reconstructed command line. +//! the one thing the design deliberately hides from the caller. We point the +//! tool's config path at a root-owned fake that writes its argv+env to a file as +//! JSON, then assert on the reconstructed command line. #![cfg(feature = "insecure_test_seams")] use std::fs; @@ -31,9 +31,17 @@ fn install_fake(dir: &Path, basename: &str, record: &Path, stdout_line: &str) -> path // root-owned because this test runs as root } -fn write_root_config(dir: &Path) -> PathBuf { +/// Write a root-owned config that points the named tools at the given (fake) +/// binaries, so the helper resolves each tool's path from config rather than a +/// caller argument. +fn write_root_config(dir: &Path, bins: &[(&str, &Path)]) -> PathBuf { let p = dir.join("config.toml"); - fs::write(&p, "work_dir = \"/srv/hyper\"\n").unwrap(); + // Every key here is a tool name, so they live under the `[tools]` table. + let mut body = String::from("work_dir = \"/srv/hyper\"\n[tools]\n"); + for (key, path) in bins { + body.push_str(&format!("{key} = \"{}\"\n", path.display())); + } + fs::write(&p, body).unwrap(); fs::set_permissions(&p, fs::Permissions::from_mode(0o644)).unwrap(); p } @@ -60,17 +68,15 @@ fn dmsetup_create_snapshot_reconstructs_canonical_table_as_root() { return; } let tmp = tempfile::tempdir().unwrap(); - let cfg = write_root_config(tmp.path()); let rec = tmp.path().join("argv.json"); let bin = install_fake(tmp.path(), "dmsetup", &rec, ""); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); // Deliberately weird inner spacing; the helper must re-render canonically. let out = run( &cfg, &[ "dmsetup", - "--bin", - bin.to_str().unwrap(), "create", "hyper-vm1", "--readonly", @@ -106,21 +112,11 @@ fn dmsetup_remove_retry_toggle_as_root() { return; } let tmp = tempfile::tempdir().unwrap(); - let cfg = write_root_config(tmp.path()); let rec = tmp.path().join("argv.json"); let bin = install_fake(tmp.path(), "dmsetup", &rec, ""); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); - let out = run( - &cfg, - &[ - "dmsetup", - "--bin", - bin.to_str().unwrap(), - "remove", - "--retry", - "hyper-vm1", - ], - ); + let out = run(&cfg, &["dmsetup", "remove", "--retry", "hyper-vm1"]); assert_eq!(out.status.code(), Some(0)); assert_eq!(recorded_argv(&rec), vec!["remove", "--retry", "hyper-vm1"]); } @@ -132,16 +128,14 @@ fn dmsetup_message_create_thin_as_root() { return; } let tmp = tempfile::tempdir().unwrap(); - let cfg = write_root_config(tmp.path()); let rec = tmp.path().join("argv.json"); let bin = install_fake(tmp.path(), "dmsetup", &rec, ""); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); let out = run( &cfg, &[ "dmsetup", - "--bin", - bin.to_str().unwrap(), "message", "hyper-pool", "--message", @@ -156,6 +150,116 @@ fn dmsetup_message_create_thin_as_root() { ); } +#[test] +fn dmsetup_targets_argv_and_parse_as_root() { + if !is_root() { + eprintln!("SKIP dmsetup_targets: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let rec = tmp.path().join("argv.json"); + // Fake prints one `dmsetup targets` row; the helper returns it verbatim. + let bin = install_fake(tmp.path(), "dmsetup", &rec, "snapshot v1.16.0"); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); + + let out = run(&cfg, &["dmsetup", "targets"]); + assert_eq!( + out.status.code(), + Some(0), + "stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); + assert_eq!(recorded_argv(&rec), vec!["targets"]); + let json: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap(); + assert_eq!(json["result"], "targets"); + assert_eq!(json["output"], "snapshot v1.16.0\n"); +} + +#[test] +fn dmsetup_ls_argv_and_parse_as_root() { + if !is_root() { + eprintln!("SKIP dmsetup_ls: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let rec = tmp.path().join("argv.json"); + let bin = install_fake(tmp.path(), "dmsetup", &rec, "hyper-thinpool\\nhyper-rw-abc"); + let cfg = write_root_config(tmp.path(), &[("dmsetup", &bin)]); + + let out = run(&cfg, &["dmsetup", "ls"]); + assert_eq!( + out.status.code(), + Some(0), + "stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); + assert_eq!(recorded_argv(&rec), vec!["ls"]); + let json: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap(); + assert_eq!(json["result"], "listed"); + assert_eq!(json["output"], "hyper-thinpool\nhyper-rw-abc\n"); +} + +#[test] +fn losetup_list_argv_and_parse_as_root() { + if !is_root() { + eprintln!("SKIP losetup_list: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let rec = tmp.path().join("argv.json"); + let bin = install_fake( + tmp.path(), + "losetup", + &rec, + "/dev/loop0 /srv/hyper/scratch/thinpool.meta", + ); + let cfg = write_root_config(tmp.path(), &[("losetup", &bin)]); + + let out = run(&cfg, &["losetup", "list"]); + assert_eq!( + out.status.code(), + Some(0), + "stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); + assert_eq!( + recorded_argv(&rec), + vec![ + "--list", + "--noheadings", + "--raw", + "--output", + "NAME,BACK-FILE" + ] + ); + let json: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap(); + assert_eq!(json["result"], "listed"); + assert_eq!( + json["output"], + "/dev/loop0 /srv/hyper/scratch/thinpool.meta\n" + ); +} + +#[test] +fn dmsetup_rejects_configured_bin_with_wrong_basename_as_root() { + if !is_root() { + eprintln!("SKIP dmsetup_rejects_bin: needs root to own the config file"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + // A real, root-owned system file, but the wrong basename for `dmsetup`. + let cfg = write_root_config(tmp.path(), &[("dmsetup", Path::new("/usr/bin/env"))]); + + let out = run(&cfg, &["dmsetup", "targets"]); + assert_ne!( + out.status.code(), + Some(0), + "a configured binary with the wrong basename must be refused" + ); + let err = String::from_utf8_lossy(&out.stderr); + assert!(err.contains("basename must be"), "stderr: {err}"); +} + #[test] fn blockdev_getsz_argv_and_parse_as_root() { if !is_root() { @@ -163,21 +267,12 @@ fn blockdev_getsz_argv_and_parse_as_root() { return; } let tmp = tempfile::tempdir().unwrap(); - let cfg = write_root_config(tmp.path()); let rec = tmp.path().join("argv.json"); // Fake prints "2048" as the sector count for the helper to parse. let bin = install_fake(tmp.path(), "blockdev", &rec, "2048"); + let cfg = write_root_config(tmp.path(), &[("blockdev", &bin)]); - let out = run( - &cfg, - &[ - "blockdev", - "--bin", - bin.to_str().unwrap(), - "--getsz", - "/dev/loop0", - ], - ); + let out = run(&cfg, &["blockdev", "--getsz", "/dev/loop0"]); assert_eq!( out.status.code(), Some(0), diff --git a/native/suidhelper/tests/e2e/config.rs b/native/suidhelper/tests/e2e/config.rs index cd3ba333..ea36b4e9 100644 --- a/native/suidhelper/tests/e2e/config.rs +++ b/native/suidhelper/tests/e2e/config.rs @@ -3,6 +3,8 @@ //! tempfile instead of /etc/hyper, so tests never touch the real host. #![cfg(feature = "insecure_test_seams")] +use hyper_suidhelper::config::{BinError, Config}; +use hyper_suidhelper::util::safe_bin; use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; @@ -34,17 +36,28 @@ fn write_root_config(dir: &Path, body: &str) -> std::path::PathBuf { p } -/// A missing config file is refused at SafeFile::open with exit code 2. -/// The error is LoadingError::File(ValidationError::Open(ENOENT)), -/// which displays as "open failed: ". +/// A genuinely-absent config file is NOT an error: the helper falls back to the +/// built-in defaults (compiled into this root-owned binary, hence trusted). The +/// default `work_dir` is `/srv/hyper`. Needs root because `sys-test` then +/// acquires privileges to prove it can promote. #[test] -fn missing_config_exits_2() { +fn missing_config_falls_back_to_defaults_as_root() { + if !is_root() { + eprintln!("SKIP missing_config defaults: sys-test needs root"); + return; + } let tmp = tempfile::tempdir().unwrap(); let missing = tmp.path().join("nope.toml"); let out = run_with_config(&missing, &["sys-test"]); - assert_eq!(out.status.code(), Some(2), "missing config must exit 2"); - let err = String::from_utf8_lossy(&out.stderr); - assert!(err.contains("open failed"), "stderr: {err}"); + assert_eq!( + out.status.code(), + Some(0), + "absent config should use defaults; stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); + let json: serde_json::Value = serde_json::from_slice(&out.stdout).expect("stdout is JSON"); + assert_eq!(json["sys_test"], "ok"); + assert_eq!(json["hyper_base"], "/srv/hyper"); } #[test] @@ -115,3 +128,95 @@ fn valid_config_and_setuid_yields_sys_test_ok_as_root() { assert_eq!(json["sys_test"], "ok"); assert_eq!(json["hyper_base"], "/srv/hyper"); } + +#[test] +fn firecracker_unconfigured_when_absent() { + // Config::default() has firecracker == None; the accessor must signal this + // distinctly so callers can report a missing-configuration error rather than + // a safe_bin validation error. + let err = Config::default() + .firecracker() + .expect_err("absent firecracker must return Unconfigured"); + assert!( + matches!(err, BinError::Unconfigured("firecracker")), + "expected Unconfigured(\"firecracker\"), got {err:?}", + ); +} + +#[test] +fn jailer_unconfigured_when_absent() { + let err = Config::default() + .jailer() + .expect_err("absent jailer must return Unconfigured"); + assert!( + matches!(err, BinError::Unconfigured("jailer")), + "expected Unconfigured(\"jailer\"), got {err:?}", + ); +} + +#[test] +fn jailer_basename_mismatch_rejected() { + // The basename check in SafeBin::from_path precedes the stat, so we do not + // need a real file — any absolute path with the wrong leaf name is enough. + let body = "work_dir = \"/srv/hyper\"\n[tools]\njailer = \"/usr/local/bin/not-jailer\"\n"; + let config: Config = toml::from_str(body).unwrap(); + let err = config + .jailer() + .expect_err("wrong-basename jailer path must be rejected"); + assert!( + matches!(err, BinError::Bin(safe_bin::Error::Name { .. })), + "expected a Name error, got {err:?}", + ); +} + +#[test] +fn firecracker_and_jailer_return_ok_when_root_owned_as_root() { + if !is_root() { + eprintln!("SKIP firecracker_jailer_configured: needs root to create root-owned binaries"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let fc = tmp.path().join("firecracker"); + let jr = tmp.path().join("jailer"); + // 0o755: root-owned, not group/other-writable — satisfies SafeBin's checks. + for p in [&fc, &jr] { + fs::write(p, b"#!/bin/sh\n").unwrap(); + fs::set_permissions(p, fs::Permissions::from_mode(0o755)).unwrap(); + } + let body = format!( + "work_dir = \"/srv/hyper\"\n[tools]\nfirecracker = \"{}\"\njailer = \"{}\"\n", + fc.display(), + jr.display(), + ); + let config: Config = toml::from_str(&body).unwrap(); + assert!( + config.firecracker().is_ok(), + "root-owned firecracker with correct basename must be accepted" + ); + assert!( + config.jailer().is_ok(), + "root-owned jailer with correct basename must be accepted" + ); +} + +#[test] +fn bad_uid_gid_range_exits_2_as_root() { + if !is_root() { + eprintln!("SKIP bad_uid_gid_range: needs root to own the config file"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + // min = 0 is the clearest violation: uid 0 is root, which the jailer must + // never receive because it skips its privilege drop when uid == 0. + let p = write_root_config( + tmp.path(), + "work_dir = \"/srv/hyper\"\n[jails]\nuid_gid_range = [0, 100]\n", + ); + let out = run_with_config(&p, &["sys-test"]); + assert_eq!(out.status.code(), Some(2)); + let err = String::from_utf8_lossy(&out.stderr); + assert!( + err.contains("uid_gid_range"), + "expected a uid_gid_range error in stderr, got: {err}", + ); +} diff --git a/native/suidhelper/tests/e2e/jailer.rs b/native/suidhelper/tests/e2e/jailer.rs new file mode 100644 index 00000000..ba45468c --- /dev/null +++ b/native/suidhelper/tests/e2e/jailer.rs @@ -0,0 +1,209 @@ +//! L4 for the jailer handoff: prove the exact argv (and an EMPTY environment) the +//! helper hands to the jailer via `execve`, and that the jailer's exit status +//! propagates. We point the config's `jailer` at a root-owned recorder that +//! writes its argv and its `/proc/self/environ` to files, then exits with a known +//! code. Root-only: `become_root_permanently` requires we are already root. +#![cfg(feature = "insecure_test_seams")] + +use std::fs; +use std::os::unix::fs::PermissionsExt; +use std::path::{Path, PathBuf}; +use std::process::Command; + +const HELPER: &str = env!("CARGO_BIN_EXE_hyper-suidhelper"); +const RECORDER_EXIT: i32 = 7; + +fn is_root() -> bool { + nix::unistd::geteuid().is_root() +} + +fn cat_bin() -> &'static str { + ["/bin/cat", "/usr/bin/cat"] + .into_iter() + .find(|p| Path::new(p).exists()) + .expect("a `cat` binary for the recorder") +} + +/// Install a root-owned recorder named `jailer` that writes its argv (minus +/// argv[0]) as a JSON array to `argv_rec` and copies its `/proc/self/environ` to +/// `env_rec`, then exits `RECORDER_EXIT`. Paths are baked into the script text +/// because the recorder runs with an empty environment and absolute `cat` so it +/// needs no `PATH`. Returns the recorder's absolute path. +fn install_recorder(dir: &Path, argv_rec: &Path, env_rec: &Path) -> PathBuf { + let path = dir.join("jailer"); + let script = format!( + "#!/bin/sh\n\ + (\n printf '['\n sep=''\n for a in \"$@\"; do printf '%s\"%s\"' \"$sep\" \"$a\"; sep=','; done\n printf ']'\n) > '{argv}'\n\ + {cat} /proc/self/environ > '{env}'\n\ + exit {code}\n", + argv = argv_rec.display(), + cat = cat_bin(), + env = env_rec.display(), + code = RECORDER_EXIT, + ); + fs::write(&path, script).unwrap(); + fs::set_permissions(&path, fs::Permissions::from_mode(0o755)).unwrap(); + path +} + +/// A root-owned plain file with basename `firecracker` — the `--exec-file`. It is +/// never executed by us (the jailer would), only validated as a `SafeBin`. +fn install_firecracker(dir: &Path) -> PathBuf { + let path = dir.join("firecracker"); + fs::write(&path, b"#!/bin/true\n").unwrap(); + fs::set_permissions(&path, fs::Permissions::from_mode(0o644)).unwrap(); + path +} + +fn write_root_config(dir: &Path, jailer: &Path, firecracker: &Path) -> PathBuf { + let p = dir.join("config.toml"); + let body = format!( + "work_dir = \"/srv/hyper\"\n[tools]\njailer = \"{}\"\nfirecracker = \"{}\"\n", + jailer.display(), + firecracker.display(), + ); + fs::write(&p, body).unwrap(); + fs::set_permissions(&p, fs::Permissions::from_mode(0o644)).unwrap(); + p +} + +fn run(config: &Path, args: &[&str]) -> std::process::Output { + Command::new(HELPER) + .args(args) + .env_clear() + .env("HYPER_SETUIDHELPER_IS_INSECURE_MODE", "1") + .env("HYPER_SETUIDHELPER_CONFIG_PATH", config) + .output() + .expect("spawn helper") +} + +#[test] +fn execs_jailer_with_canonical_argv_and_empty_env_as_root() { + if !is_root() { + eprintln!("SKIP jailer exec: needs root to become_root_permanently + own the fakes"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let argv_rec = tmp.path().join("argv.json"); + let env_rec = tmp.path().join("environ.bin"); + let jailer = install_recorder(tmp.path(), &argv_rec, &env_rec); + let firecracker = install_firecracker(tmp.path()); + let cfg = write_root_config(tmp.path(), &jailer, &firecracker); + + let out = run( + &cfg, + &[ + "jailer", + "--id", + "vm1", + "--uid", + "900001", + "--gid", + "900002", + "--cgroup", + "memory.max=1048576", + "--cgroup", + "cpu.max=100000 100000", + "--api-sock", + "/api.sock", + ], + ); + + // The jailer's own exit status must propagate through the execve handoff. + assert_eq!( + out.status.code(), + Some(RECORDER_EXIT), + "exit status did not propagate; stderr: {}", + String::from_utf8_lossy(&out.stderr), + ); + + let argv: Vec = + serde_json::from_str(&fs::read_to_string(&argv_rec).expect("recorded argv")).unwrap(); + assert_eq!( + argv, + vec![ + "--id", + "vm1", + "--exec-file", + &firecracker.to_string_lossy(), + "--uid", + "900001", + "--gid", + "900002", + "--chroot-base-dir", + "/srv/hyper/jails", + "--cgroup-version", + "2", + "--parent-cgroup", + "hyper", + "--cgroup", + "memory.max=1048576", + "--cgroup", + "cpu.max=100000 100000", + "--", + "--api-sock", + "/api.sock", + ], + "helper handed the jailer a non-canonical argv", + ); + + // The helper execve's the jailer with an EMPTY envp (see src/tools/jailer.rs): + // once ruid==0 a smuggled LD_PRELOAD would be honored, so nothing of the + // caller's environment may reach the root jailer. The recorder is a `/bin/sh` + // script and the shell *self-sets* `PWD` (and, under bash, `_`/`SHLVL`) on + // startup, so a literally-empty environ is impossible to observe through it. + // We instead prove no CALLER variable survives: the helper is spawned with + // `HYPER_*` config vars in its own environment (see `run`), so their absence + // here is the leak canary - had the helper passed its environment through, + // they would appear alongside `PWD`. + let environ = fs::read(&env_rec).expect("recorded environ"); + let leaked: Vec = environ + .split(|&b| b == 0) + .filter(|entry| !entry.is_empty()) + .map(|entry| String::from_utf8_lossy(entry).into_owned()) + .filter(|entry| { + let key = entry.split('=').next().unwrap_or(""); + !matches!(key, "PWD" | "_" | "SHLVL") + }) + .collect(); + assert!( + leaked.is_empty(), + "caller environment leaked to the jailer (only shell-set PWD/_/SHLVL allowed): {leaked:?}", + ); +} + +#[test] +fn refuses_uid_zero_without_exec_as_root() { + if !is_root() { + eprintln!("SKIP jailer uid 0: needs root"); + return; + } + let tmp = tempfile::tempdir().unwrap(); + let argv_rec = tmp.path().join("argv.json"); + let env_rec = tmp.path().join("environ.bin"); + let jailer = install_recorder(tmp.path(), &argv_rec, &env_rec); + let firecracker = install_firecracker(tmp.path()); + let cfg = write_root_config(tmp.path(), &jailer, &firecracker); + + let out = run( + &cfg, + &[ + "jailer", + "--id", + "vm1", + "--uid", + "0", + "--gid", + "900002", + "--api-sock", + "/api.sock", + ], + ); + + assert_ne!(out.status.code(), Some(0), "uid 0 must be refused"); + assert_eq!(out.status.code(), Some(2), "validation failure exits 2"); + assert!( + !argv_rec.exists(), + "the jailer must never have been exec'd for uid 0", + ); +} diff --git a/native/suidhelper/tests/tools/chroot_jail_grant_api.rs b/native/suidhelper/tests/tools/chroot_jail_grant_api.rs new file mode 100644 index 00000000..e55a679b --- /dev/null +++ b/native/suidhelper/tests/tools/chroot_jail_grant_api.rs @@ -0,0 +1,241 @@ +//! Contracts of the `chroot-jail grant-api` op, driven through the base-injected +//! `grant_api_under` seam so they run unprivileged in a tempdir. The promises +//! under test (refusal contracts first — they are the security boundary): +//! * shape — the socket is accepted iff it is exactly +//! `//root/api.socket` below the jail base; any other depth or a +//! leaf that is not `api.socket` is refused before any chown; +//! * lexical — a `.`/`..`/empty component or a relative path is always rejected +//! before any filesystem access; +//! * type — a regular file or a symlink planted at `api.socket` is refused +//! (`NotASocket`) and left untouched, never chmod'd; only a real socket is +//! granted; +//! * confinement — a symlinked path component is never followed, so the chown +//! can never escape the anchored jail tree (the core TOCTOU guarantee); +//! * pending — a not-yet-created socket (or half-built jail) is `Pending`, not +//! an error, so the controller keeps probing; +//! * grant — a real socket is chowned to the caller and left mode 0660, and +//! its parent `root` dir is opened for the caller's group to traverse +//! (chgrp'd to the caller, chmod'd 0710) so the node can reach the socket. + +use hyper_suidhelper::tools::chroot_jail::grant_api::{grant_api_under, Error, GrantOut}; +use hyper_suidhelper::util::safe_path::ValidationError; +use proptest::prelude::*; +use std::os::unix::fs::{symlink, PermissionsExt}; +use std::os::unix::net::UnixListener; +use std::path::{Path, PathBuf}; +use std::{fs, os::unix::fs::MetadataExt}; + +/// Build the canonical `/exec/id/root` parent dirs and return that dir. +fn make_root(jail: &Path) -> PathBuf { + let root = jail.join("exec").join("id").join("root"); + fs::create_dir_all(&root).unwrap(); + root +} + +#[test] +fn socket_outside_jail_base_is_rejected() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path().join("jail"); + fs::create_dir(&jail).unwrap(); + let outside = tmp.path().join("elsewhere/exec/id/root/api.socket"); + let err = grant_api_under(&jail, &outside).unwrap_err(); + assert!( + matches!(err, Error::SocketPath(ValidationError::NotUnderBase)), + "got {err:?}", + ); +} + +#[test] +fn wrong_leaf_basename_is_rejected() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let bad = jail.join("exec").join("id").join("root").join("evil.sock"); + let err = grant_api_under(jail, &bad).unwrap_err(); + assert!(matches!(err, Error::SocketName(_)), "got {err:?}"); +} + +#[test] +fn too_shallow_is_shape_error() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let bad = jail.join("exec").join("id").join("api.socket"); // missing root/ + let err = grant_api_under(jail, &bad).unwrap_err(); + assert!(matches!(err, Error::SocketShape(_)), "got {err:?}"); +} + +#[test] +fn too_deep_is_shape_error() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let bad = jail + .join("exec") + .join("id") + .join("root") + .join("extra") + .join("api.socket"); + let err = grant_api_under(jail, &bad).unwrap_err(); + assert!(matches!(err, Error::SocketShape(_)), "got {err:?}"); +} + +#[test] +fn dotdot_traversal_is_rejected() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let bad = PathBuf::from(format!("{}/exec/../id/root/api.socket", jail.display())); + let err = grant_api_under(jail, &bad).unwrap_err(); + assert!( + matches!(err, Error::SocketPath(ValidationError::LooseComponents)), + "got {err:?}", + ); +} + +#[test] +fn relative_socket_is_rejected() { + let tmp = tempfile::tempdir().unwrap(); + let err = grant_api_under(tmp.path(), Path::new("exec/id/root/api.socket")).unwrap_err(); + assert!( + matches!(err, Error::SocketPath(ValidationError::NotAbsolute)), + "got {err:?}", + ); +} + +#[test] +fn missing_socket_is_pending() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let root = make_root(jail); + let socket = root.join("api.socket"); // never created + let out = grant_api_under(jail, &socket).expect("missing socket must be Ok(Pending)"); + assert!(matches!(out, GrantOut::Pending), "got {out:?}"); +} + +#[test] +fn missing_jail_tree_is_pending() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let socket = jail.join("exec").join("id").join("root").join("api.socket"); // nothing created + let out = grant_api_under(jail, &socket).expect("half-built jail must be Ok(Pending)"); + assert!(matches!(out, GrantOut::Pending), "got {out:?}"); +} + +// A real socket is granted: chowned to the caller (our own uid/gid, which a +// non-root process is allowed to set on a file it owns) and chmod'd 0660. +#[test] +fn real_socket_is_granted_and_chmod_0660() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let root = make_root(jail); + let socket = root.join("api.socket"); + let _listener = UnixListener::bind(&socket).unwrap(); + fs::set_permissions(&socket, fs::Permissions::from_mode(0o755)).unwrap(); + + let out = grant_api_under(jail, &socket).expect("real socket must grant"); + assert!(matches!(out, GrantOut::Granted), "got {out:?}"); + + let meta = fs::symlink_metadata(&socket).unwrap(); + assert_eq!(meta.mode() & 0o777, 0o660, "socket must be chmod'd 0660"); + assert_eq!(meta.uid(), nix::unistd::getuid().as_raw()); + assert_eq!(meta.gid(), nix::unistd::getgid().as_raw()); + + // The parent `root` dir must also be opened for the caller's group to + // traverse, else the node could not reach the socket: chgrp'd to the caller, + // chmod'd 0710 (owner rwx, group --x, other none). + let root_meta = fs::symlink_metadata(&root).unwrap(); + assert_eq!( + root_meta.mode() & 0o777, + 0o710, + "jail root must be chmod'd 0710 for traversal", + ); + assert_eq!( + root_meta.gid(), + nix::unistd::getgid().as_raw(), + "jail root must be chgrp'd to the caller", + ); +} + +// A regular file planted at api.socket is refused and left untouched (not chmod'd). +#[test] +fn regular_file_at_leaf_is_refused_and_untouched() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let root = make_root(jail); + let imposter = root.join("api.socket"); + fs::write(&imposter, b"not a socket").unwrap(); + fs::set_permissions(&imposter, fs::Permissions::from_mode(0o600)).unwrap(); + + let err = grant_api_under(jail, &imposter).unwrap_err(); + assert!(matches!(err, Error::NotASocket), "got {err:?}"); + assert_eq!( + fs::symlink_metadata(&imposter).unwrap().mode() & 0o777, + 0o600, + "imposter file must not be chmod'd", + ); +} + +// A symlink planted at api.socket is refused: fstatat(AT_SYMLINK_NOFOLLOW) stats +// the link itself (S_IFLNK), so it is never seen as a socket and never followed. +#[test] +fn symlink_at_leaf_is_refused() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let root = make_root(jail); + let target = tmp.path().join("real-target"); + fs::write(&target, b"secret").unwrap(); + let link = root.join("api.socket"); + symlink(&target, &link).unwrap(); + + let err = grant_api_under(jail, &link).unwrap_err(); + assert!(matches!(err, Error::NotASocket), "got {err:?}"); +} + +// A symlinked path component must NOT be followed: the walk fails rather than +// reaching through it, so nothing outside the jail is touched. +#[test] +fn symlinked_component_does_not_escape() { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path().join("jail"); + fs::create_dir(&jail).unwrap(); + + let sentinel = tmp.path().join("sentinel"); + fs::create_dir_all(sentinel.join("id").join("root")).unwrap(); + let outside_socket = sentinel.join("id").join("root").join("api.socket"); + let _listener = UnixListener::bind(&outside_socket).unwrap(); + fs::set_permissions(&outside_socket, fs::Permissions::from_mode(0o700)).unwrap(); + + // `/exec` is a symlink to the external sentinel dir. + symlink(&sentinel, jail.join("exec")).unwrap(); + + let socket = jail.join("exec").join("id").join("root").join("api.socket"); + let _ = grant_api_under(&jail, &socket); // O_NOFOLLOW makes the walk refuse + + assert_eq!( + fs::symlink_metadata(&outside_socket).unwrap().mode() & 0o777, + 0o700, + "grant escaped through a symlinked component", + ); +} + +proptest! { + // For a socket `depth` components below the jail base with leaf `api.socket` + // (target never created), grant_api_under returns Ok(Pending) iff depth == 4 + // (i.e. 3 parents), else SocketShape. The generator emits only plain names so + // the lexical gate never fires and the leaf is always `api.socket`. + #[test] + fn shape_classification( + parents in prop::collection::vec("[a-z][a-z0-9]{0,5}", 1..6) + ) { + let tmp = tempfile::tempdir().unwrap(); + let jail = tmp.path(); + let mut socket = jail.to_path_buf(); + for c in &parents { + socket.push(c); + } + socket.push("api.socket"); + let res = grant_api_under(jail, &socket); + if parents.len() == 3 { + prop_assert!(matches!(res, Ok(GrantOut::Pending)), "depth 3 must be Pending, got {res:?}"); + } else { + prop_assert!(matches!(res, Err(Error::SocketShape(_))), "got {res:?}"); + } + } +} diff --git a/native/suidhelper/tests/tools/chroot_jail_remove.rs b/native/suidhelper/tests/tools/chroot_jail_remove.rs index 93377e25..7d0de542 100644 --- a/native/suidhelper/tests/tools/chroot_jail_remove.rs +++ b/native/suidhelper/tests/tools/chroot_jail_remove.rs @@ -164,6 +164,53 @@ fn symlinked_chroot_component_does_not_escape() { ); } +// cgroup.kill is written "1" before the rmdir: the fix SIGKILLs the subtree +// first. Here cgroup.kill keeps the leaf non-empty so rmdir returns ENOTEMPTY +// (tolerated) and the leaf survives — letting us assert the write happened. +#[test] +fn cgroup_kill_is_written_before_rmdir() { + let tmp = tempfile::tempdir().unwrap(); + let base = tmp.path(); + let leaf = base.join("slice").join("leaf"); + fs::create_dir_all(&leaf).unwrap(); + fs::write(leaf.join("cgroup.kill"), b"0").unwrap(); + + remove_cgroup_under(base, &leaf).expect("write+rmdir must be Ok"); + assert!(leaf.exists(), "non-empty leaf must survive"); + assert_eq!( + fs::read(leaf.join("cgroup.kill")).unwrap(), + b"1", + "cgroup.kill must have been written \"1\"", + ); +} + +// A leaf without a cgroup.kill file (pre-5.14/non-v2 kernel) is tolerated: the +// missing pseudo-file is a no-op and the empty leaf is removed normally. +#[test] +fn missing_cgroup_kill_is_tolerated() { + let tmp = tempfile::tempdir().unwrap(); + let base = tmp.path(); + let leaf = base.join("slice").join("leaf"); + fs::create_dir_all(&leaf).unwrap(); + + remove_cgroup_under(base, &leaf).expect("missing cgroup.kill must be Ok"); + assert!(!leaf.exists(), "empty leaf must be removed"); +} + +// The leaf is already gone but its parent survives: the new openat_dir(leaf) +// ENOENT branch returns Ok without touching the parent. +#[test] +fn missing_leaf_with_present_parent_is_ok() { + let tmp = tempfile::tempdir().unwrap(); + let base = tmp.path(); + let parent = base.join("slice"); + fs::create_dir(&parent).unwrap(); + let leaf = parent.join("leaf"); // never created + + remove_cgroup_under(base, &leaf).expect("missing leaf must be Ok"); + assert!(parent.exists(), "parent must survive"); +} + proptest! { // For a chroot `depth` components below the jail base (target never created), // remove_chroot_under returns Ok iff depth == 2, else ChrootDepth. The diff --git a/native/suidhelper/tests/tools/jailer.rs b/native/suidhelper/tests/tools/jailer.rs new file mode 100644 index 00000000..9ab75447 --- /dev/null +++ b/native/suidhelper/tests/tools/jailer.rs @@ -0,0 +1,162 @@ +//! Refusal contracts for the jailer's pure validators — the security core. A +//! valid input must round-trip to its canonical form; an invalid one must +//! *always* be rejected, never silently accepted. These properties are what stop +//! a compromised BEAM from naming uid 0, a privileged path, a traversal, or a +//! flag through the jailer subcommand. + +use hyper_suidhelper::tools::jailer::{validate_id_number, CgroupSetting, JailSock, VmId}; +use proptest::prelude::*; +use std::str::FromStr; + +proptest! { + /// uid/gid 0 is rejected for EVERY range — a jailer run with uid 0 skips its + /// privilege drop and leaves firecracker running as root. + #[test] + fn id_zero_always_rejected(lo in any::(), span in any::()) { + let hi = lo.saturating_add(span); + prop_assert!(validate_id_number(0, (lo, hi)).is_err()); + } + + /// Any nonzero value inside the (nonempty) range is accepted unchanged. + #[test] + fn id_in_range_nonzero_accepted( + lo in 1u32..=1_000_000, + span in 0u32..=1_000_000, + off in 0u32..=1_000_000, + ) { + let hi = lo + span; + let n = lo + (off % (span + 1)); // off bounded into [0, span] => n in [lo, hi] + prop_assert_eq!(validate_id_number(n, (lo, hi)).ok(), Some(n)); + } + + /// Values just below `lo` (still nonzero) and just above `hi` are rejected. + #[test] + fn id_out_of_range_rejected(lo in 2u32..=1_000_000, span in 0u32..=1_000_000) { + let hi = lo + span; + prop_assert!(validate_id_number(lo - 1, (lo, hi)).is_err()); + if hi < u32::MAX { + prop_assert!(validate_id_number(hi + 1, (lo, hi)).is_err()); + } + } + + /// Every string over the allowed charset/length with a non-dash leader parses + /// and renders back to itself. + #[test] + fn vmid_valid_round_trips(s in "[A-Za-z0-9_][A-Za-z0-9_-]{0,63}") { + prop_assert_eq!(VmId::from_str(&s).unwrap().to_string(), s); + } + + /// A leading dash is always rejected (no flag injection via `--id`). + #[test] + fn vmid_leading_dash_rejected(s in "-[A-Za-z0-9_-]{0,63}") { + prop_assert!(VmId::from_str(&s).is_err()); + } + + /// Any embedded path separator is rejected (no chroot traversal via `--id`). + #[test] + fn vmid_slash_rejected(s in "[A-Za-z0-9_]{0,10}/[A-Za-z0-9_]{0,10}") { + prop_assert!(VmId::from_str(&s).is_err()); + } + + /// Over-length ids (>64) are rejected. + #[test] + fn vmid_too_long_rejected(s in "[A-Za-z][A-Za-z0-9_-]{64,90}") { + prop_assert!(VmId::from_str(&s).is_err()); + } + + /// A valid `memory.max` setting re-renders to exactly `key=value`. + #[test] + fn cgroup_memory_round_trips(s in "memory[.]max=([0-9]{1,20}|max)") { + prop_assert_eq!(CgroupSetting::from_str(&s).unwrap().to_string(), s); + } + + /// A valid `cpu.max` setting re-renders to exactly `key=value`. + #[test] + fn cgroup_cpu_round_trips(s in "cpu[.]max=([0-9]{1,20} [0-9]{1,20}|max [0-9]{1,20})") { + prop_assert_eq!(CgroupSetting::from_str(&s).unwrap().to_string(), s); + } + + /// A single-filename absolute socket path round-trips; `.`/`..` filenames are + /// rejected even though they are within the charset. + #[test] + fn jailsock_single_filename(name in "[A-Za-z0-9_.-]{1,40}") { + let s = format!("/{name}"); + let res = JailSock::from_str(&s); + if name == "." || name == ".." { + prop_assert!(res.is_err()); + } else { + prop_assert_eq!(res.unwrap().to_string(), s); + } + } + + /// A second path component is always rejected (the socket must be a direct + /// child of `/`). + #[test] + fn jailsock_multi_component_rejected( + a in "[A-Za-z0-9_]{1,10}", + b in "[A-Za-z0-9_]{1,10}", + ) { + let s = format!("/{a}/{b}"); + prop_assert!(JailSock::from_str(&s).is_err()); + } +} + +#[test] +fn vmid_rejects_known_bad_shapes() { + for bad in [ + "", // empty + "-leading", // leading dash + "a/b", // separator + "a.b", // dot + "a b", // whitespace + "a\tb", // tab + "a\0b", // NUL + "naïve", // non-ascii + &"x".repeat(65), // too long + ] { + assert!(VmId::from_str(bad).is_err(), "VmId accepted {bad:?}"); + } +} + +#[test] +fn cgroup_rejects_known_bad_shapes() { + for bad in [ + "linear=10", // unknown key + "memory.high=10", // unknown key + "memory.max=", // empty value + "memory.max=12x", // non-digit + "memory.max=1=2", // second '=' + "memory.max", // no '=' + "cpu.max=100000", // missing period field + "cpu.max=100000 100000 5", // extra field + "cpu.max=x 100000", // bad quota + "cpu.max=max max", // bad period + "cpu.max=max", // missing period + ] { + assert!( + CgroupSetting::from_str(bad).is_err(), + "CgroupSetting accepted {bad:?}" + ); + } +} + +#[test] +fn jailsock_rejects_known_bad_shapes() { + for bad in [ + "", // empty + "relative", // not absolute + "/", // no filename + "/a/b", // multi-component + "/../etc", // traversal + "/..", // traversal as whole filename + "/.", // current dir + "/a b", // whitespace + "/a\0b", // NUL + "//x", // empty leading component + ] { + assert!( + JailSock::from_str(bad).is_err(), + "JailSock accepted {bad:?}" + ); + } +} diff --git a/native/suidhelper/tests/util/confinement.rs b/native/suidhelper/tests/util/confinement.rs index 5cffd2b0..f537500e 100644 --- a/native/suidhelper/tests/util/confinement.rs +++ b/native/suidhelper/tests/util/confinement.rs @@ -8,6 +8,7 @@ use hyper_suidhelper::util::safe_file::{ Any, IsBlockDevice, IsRegularFile, OnlyRootWritable, RootOwner, SafeFile, }; use hyper_suidhelper::util::safe_path::{IsAbsolute, SafePath, StrictComponents}; +use nix::errno::Errno; use nix::fcntl::OFlag; use std::fs; use std::os::unix::fs::{symlink, PermissionsExt}; @@ -116,6 +117,42 @@ fn safefile_mode_axis_rejects_group_writable() { assert!(SafeFile::::open(&p, OFlag::O_PATH).is_ok()); } +// write_file writes its bytes to an existing file through the pinned dir fd. +#[test] +fn write_file_writes_bytes_to_existing_file() { + let tmp = tempfile::tempdir().unwrap(); + let f = tmp.path().join("f"); + fs::write(&f, b"x").unwrap(); + + let dir = SafeDir::open(&safe(tmp.path())).unwrap(); + dir.write_file(Path::new("f"), b"1").unwrap(); + assert_eq!(fs::read(&f).unwrap(), b"1"); +} + +// write_file refuses a symlinked target (O_NOFOLLOW) and leaves it untouched: a +// write cannot be redirected through a symlink out of the pinned dir. +#[test] +fn write_file_refuses_symlinked_target_and_leaves_it_unchanged() { + let tmp = tempfile::tempdir().unwrap(); + let target = tmp.path().join("target"); + fs::write(&target, b"keep").unwrap(); + symlink(&target, tmp.path().join("link")).unwrap(); + + let dir = SafeDir::open(&safe(tmp.path())).unwrap(); + assert!(dir.write_file(Path::new("link"), b"1").is_err()); + assert_eq!(fs::read(&target).unwrap(), b"keep"); +} + +// write_file on a missing name fails with ENOENT (no O_CREAT), so the caller can +// treat an absent pseudo-file as a no-op. +#[test] +fn write_file_missing_returns_enoent() { + let tmp = tempfile::tempdir().unwrap(); + let dir = SafeDir::open(&safe(tmp.path())).unwrap(); + let err = dir.write_file(Path::new("nope"), b"1").unwrap_err(); + assert_eq!(err.errno(), Some(Errno::ENOENT)); +} + // SafeFile::open refuses a symlinked final component (O_NOFOLLOW). #[test] fn safefile_open_rejects_final_symlink() { diff --git a/native/suidhelper/tests/util/safe_bin.rs b/native/suidhelper/tests/util/safe_bin.rs index 950e06b7..e7522439 100644 --- a/native/suidhelper/tests/util/safe_bin.rs +++ b/native/suidhelper/tests/util/safe_bin.rs @@ -1,5 +1,5 @@ -//! `SafeBin` is what stops `--bin` from pointing the helper at an -//! attacker-controlled binary it would then run as root. The constructor demands +//! `SafeBin` is what stops a configured path from pointing the helper at +//! an attacker-controlled binary it would then run as root. The constructor demands //! an absolute path, exact basename, a real (non-symlink) regular file owned by //! root and not group/other-writable. These assert the refusal axes; the symlink //! axis is root-independent, the owner axis is asserted both ways. diff --git a/test/hyper/node/fire_vmm/jailer_test.exs b/test/hyper/node/fire_vmm/jailer_test.exs new file mode 100644 index 00000000..d2fff4a2 --- /dev/null +++ b/test/hyper/node/fire_vmm/jailer_test.exs @@ -0,0 +1,83 @@ +defmodule Hyper.Node.FireVMM.JailerTest do + @moduledoc """ + Examples for `Hyper.Node.FireVMM.Jailer.command/1`. + + Load-bearing invariant: the BEAM must never place a privileged binary path + (firecracker, jailer) or lifecycle flags owned by the suidhelper (`--exec-file`, + `--chroot-base-dir`, `--cgroup-version`, `--parent-cgroup`, `--`) in the args + it hands to the helper. The helper derives those from its trusted config. + """ + + use ExUnit.Case, async: false + + alias Hyper.Node.FireVMM + alias Hyper.Node.FireVMM.Jailer + + @vm_id "vmtest01" + + # Stub the cached TOML so firecracker/jailer resolve to dummy paths without + # requiring /etc/hyper/config.toml on the test host. async: false because the + # cache is global state. + setup do + Hyper.Cfg.Toml.put_cache(%{ + "tools" => %{ + "firecracker" => "/usr/local/bin/firecracker", + "jailer" => "/usr/local/bin/jailer" + } + }) + + on_exit(fn -> Hyper.Cfg.Toml.reload() end) + end + + defp micro_opts do + %FireVMM.Opts{ + vm_id: @vm_id, + uid: 900_001, + gid: 900_001, + type: :micro, + arch: :x86_64, + mutable: nil, + kernel: "/srv/hyper/redist/vmlinux/vmlinux-x86_64-6.1", + boot_args: nil + } + end + + test "binary is the suid helper" do + assert Jailer.command(micro_opts()).binary == Hyper.Cfg.Tools.suidhelper() + end + + test "args start with the jailer subcommand" do + %{args: [first | _]} = Jailer.command(micro_opts()) + assert first == "jailer" + end + + test "args contain --id, --uid, --gid with the opts values" do + %{args: args} = Jailer.command(micro_opts()) + assert "--id" in args + assert @vm_id in args + assert "--uid" in args + assert "--gid" in args + assert "900001" in args + end + + test "args end with --api-sock /api.socket" do + %{args: args} = Jailer.command(micro_opts()) + assert Enum.take(args, -2) == ["--api-sock", "/api.socket"] + end + + test "args do not contain privileged flags owned by the suidhelper" do + %{args: args} = Jailer.command(micro_opts()) + refute "--exec-file" in args + refute "--chroot-base-dir" in args + refute "--cgroup-version" in args + refute "--parent-cgroup" in args + refute "--" in args + end + + test "args include --cgroup cpu.max and memory.max for :micro type" do + %{args: args} = Jailer.command(micro_opts()) + assert "--cgroup" in args + assert Enum.any?(args, &String.starts_with?(&1, "cpu.max=")) + assert Enum.any?(args, &String.starts_with?(&1, "memory.max=")) + end +end diff --git a/test/hyper/node/reaper/plan_properties_test.exs b/test/hyper/node/reaper/plan_properties_test.exs new file mode 100644 index 00000000..6720a608 --- /dev/null +++ b/test/hyper/node/reaper/plan_properties_test.exs @@ -0,0 +1,78 @@ +defmodule Hyper.Node.Reaper.PlanPropertiesTest do + @moduledoc """ + Safety laws every `Hyper.Node.Reaper.Plan` decision must obey, generated over a + small shared id alphabet so live / cgroup-leaf / rw sets actually overlap: + + * a live vm_id is NEVER a reap candidate (the union of liveness sources is + removed from the orphan set); + * only an orphan seen on two consecutive ticks is reaped (`confirm/2` reaps a + subset of both current and last, and carries `current` forward); + * `hyper-thinpool` / `hyper-img-*` / any non-`hyper-rw-*` name never yields a + candidate, and `Mutable.dm_name(id)` round-trips back to exactly `id` (so a + future id-scheme change that breaks the strip fails loudly here). + """ + use ExUnit.Case, async: true + use ExUnitProperties + + alias Hyper.Node.Img.Mutable + alias Hyper.Node.Reaper.Plan + + # A deliberately tiny alphabet so generated live/leaf/rw id sets collide often, + # exercising the difference and intersection rather than always being disjoint. + defp id, do: member_of(~w(a b c d e)) + + defp id_set, do: map(list_of(id()), &MapSet.new/1) + + defp id_list, do: list_of(id()) + + property "a live vm_id is never a reap candidate" do + check all( + live <- id_set(), + leaves <- id_list(), + rw <- id_list() + ) do + orphans = Plan.orphans(live, leaves, rw) + assert MapSet.disjoint?(orphans, live) + end + end + + property "only twice-seen orphans are reaped; current is carried forward" do + check all( + current <- id_set(), + last <- id_set() + ) do + {reap, next} = Plan.confirm(current, last) + + assert MapSet.subset?(reap, current) + assert MapSet.subset?(reap, last) + assert next == current + end + end + + property "rw_ids excludes thinpool, img, and non-rw junk" do + safe_dm = + member_of([ + "hyper-thinpool", + "hyper-img-abc-0", + "hyper-img-deadbeef-3", + "unrelated-device", + "cryptroot" + ]) + + check all(names <- list_of(safe_dm)) do + assert Plan.rw_ids(names) == [] + end + end + + property "Mutable.dm_name/1 round-trips through rw_ids for a real vm_id" do + check all( + id <- + map( + binary(min_length: 1, max_length: 16), + &("v" <> Base.encode32(&1, padding: false, case: :lower)) + ) + ) do + assert Plan.rw_ids([Mutable.dm_name(id)]) == [id] + end + end +end diff --git a/test/hyper/node/reaper/plan_test.exs b/test/hyper/node/reaper/plan_test.exs new file mode 100644 index 00000000..0021db98 --- /dev/null +++ b/test/hyper/node/reaper/plan_test.exs @@ -0,0 +1,65 @@ +defmodule Hyper.Node.Reaper.PlanTest do + use ExUnit.Case, async: true + + alias Hyper.Node.Reaper.Plan + + defp set(ids), do: MapSet.new(ids) + + describe "orphans/3" do + test "a cgroup-leaf-only orphan is a candidate" do + assert Plan.orphans(set([]), ["dead"], []) == set(["dead"]) + end + + test "a dm-only orphan is a candidate" do + assert Plan.orphans(set([]), [], ["dead"]) == set(["dead"]) + end + + test "an id seen in both sources is a single candidate" do + assert Plan.orphans(set([]), ["dead"], ["dead"]) == set(["dead"]) + end + + test "an id present in live is never a candidate, even if it also has resources" do + assert Plan.orphans(set(["alive"]), ["alive"], ["alive"]) == set([]) + end + + test "only the non-live ids survive as candidates" do + assert Plan.orphans(set(["alive"]), ["alive", "dead"], ["alive", "gone"]) == + set(["dead", "gone"]) + end + end + + describe "confirm/2 two-strike grace" do + test "first tick reaps nothing (last is empty) but remembers the candidates" do + current = set(["x", "y"]) + {reap, next} = Plan.confirm(current, set([])) + + assert reap == set([]) + assert next == current + end + + test "second tick reaps the still-orphan ids" do + {_, last} = Plan.confirm(set(["x", "y"]), set([])) + {reap, _next} = Plan.confirm(set(["x", "y"]), last) + + assert reap == set(["x", "y"]) + end + + test "an id orphaned tick1 but live/absent tick2 is not reaped" do + {_, last} = Plan.confirm(set(["x"]), set([])) + {reap, _next} = Plan.confirm(set([]), last) + + assert reap == set([]) + end + + test "an id new in tick2 is not reaped (only one strike)" do + {_, last} = Plan.confirm(set(["x"]), set([])) + {reap, _next} = Plan.confirm(set(["x", "fresh"]), last) + + assert reap == set(["x"]) + end + end + + test "rw_ids/1 strips the hyper-rw- prefix and ignores thinpool/img names" do + assert Plan.rw_ids(["hyper-thinpool", "hyper-img-abc-0", "hyper-rw-vabc"]) == ["vabc"] + end +end diff --git a/test/hyper/suid_helper/dmsetup_properties_test.exs b/test/hyper/suid_helper/dmsetup_properties_test.exs index 73858fc6..a7e49d81 100644 --- a/test/hyper/suid_helper/dmsetup_properties_test.exs +++ b/test/hyper/suid_helper/dmsetup_properties_test.exs @@ -73,4 +73,17 @@ defmodule Hyper.SuidHelper.DmsetupPropertiesTest do assert Dmsetup.parse_targets(out) == MapSet.new(targets) end end + + property "parse_names recovers the device name from each `dmsetup ls` row" do + check all(names <- uniq_list_of(dev(), min_length: 1, max_length: 6)) do + # `dmsetup ls` rows are "\t(major:minor)"; order is preserved. + out = Enum.map_join(names, "\n", fn n -> "#{n}\t(254:0)" end) + assert Dmsetup.parse_names(out) == names + end + end + + test "parse_names reads the empty-table sentinel as no devices" do + assert Dmsetup.parse_names("No devices found\n") == [] + assert Dmsetup.parse_names("") == [] + end end diff --git a/test/hyper/suid_helper/losetup_test.exs b/test/hyper/suid_helper/losetup_test.exs new file mode 100644 index 00000000..4cdc6f0f --- /dev/null +++ b/test/hyper/suid_helper/losetup_test.exs @@ -0,0 +1,37 @@ +defmodule Hyper.SuidHelper.LosetupTest do + @moduledoc """ + `parse_list/1` turns `losetup --list --output NAME,BACK-FILE` rows into + `{device, backing_file}` pairs. The reclaim pass relies on it to recognise loop + devices backing Hyper's files, so the edges that matter are: a loop with no + backing file (skipped, nothing to reclaim by file) and a `(deleted)` backing + suffix (kept, so the data-dir prefix still matches). + """ + use ExUnit.Case, async: true + + alias Hyper.SuidHelper.Losetup + + test "pairs device with backing file and skips rows that have no backing file" do + out = """ + /dev/loop0 /srv/hyper/scratch/thinpool.meta + /dev/loop1 /srv/hyper/layers/blob + /dev/loop2 + """ + + assert Losetup.parse_list(out) == [ + {"/dev/loop0", "/srv/hyper/scratch/thinpool.meta"}, + {"/dev/loop1", "/srv/hyper/layers/blob"} + ] + end + + test "keeps a `(deleted)` backing suffix so data-dir prefix matching still works" do + out = "/dev/loop0 /srv/hyper/scratch/thinpool.data (deleted)\n" + + assert Losetup.parse_list(out) == [ + {"/dev/loop0", "/srv/hyper/scratch/thinpool.data (deleted)"} + ] + end + + test "an empty listing yields no pairs" do + assert Losetup.parse_list("") == [] + end +end diff --git a/test/hyper/vm/id_test.exs b/test/hyper/vm/id_test.exs new file mode 100644 index 00000000..5f82fb1e --- /dev/null +++ b/test/hyper/vm/id_test.exs @@ -0,0 +1,20 @@ +defmodule Hyper.Vm.IdTest do + @moduledoc """ + The charset contract of `Hyper.Vm.Id.generate/0`. The load-bearing invariant is + the refusal property: a generated id is *always* strictly alphanumeric, so it + can never carry a char that firecracker (`_`) or dm/jailer names (`-`) reject. + """ + + use ExUnit.Case, async: true + use ExUnitProperties + + alias Hyper.Vm.Id + + property "generate/0 produces a `v`-prefixed, strictly alphanumeric id" do + check all(_ <- StreamData.constant(nil)) do + id = Id.generate() + assert id =~ ~r/\A[A-Za-z0-9]+\z/ + assert String.starts_with?(id, "v") + end + end +end