diff --git a/docs/cookbook/intro.md b/docs/cookbook/intro.md
index a0030d35..04500feb 100644
--- a/docs/cookbook/intro.md
+++ b/docs/cookbook/intro.md
@@ -13,19 +13,16 @@ of the aforementioned systems.
The absolute best way to understand `Hyper` and how it works is to play around
with it.
-## Getting Started
+#### Auto-redistributed
-The absolute best way to get started with `Hyper` is to play with it.
+`umoci` and the guest `vmlinux` kernels are downloaded, checksum-verified, and
+managed by Hyper itself; you do not install them.
-### Requirements
-
-Hyper requires the following software be installed on each node running it:
-
- - [`skopeo`](https://github.com/containers/skopeo)
- - [`e2fsprogs`](https://github.com/tytso/e2fsprogs)
-
-Hyper has more runtime dependencies, but they are automatically redistributed
-by Hyper.
+`firecracker` and `jailer` are not auto-downloaded. Install them with
+`mix firecracker.install [--prefix
]` (default prefix `/opt/firecracker`),
+which downloads the pinned v1.16.0 release, places the binaries at
+`/firecracker` and `/jailer`, and prints the `/etc/hyper/config.toml`
+snippet to paste in.
### Installation
@@ -33,31 +30,24 @@ by Hyper.
### Configuration
-Running `Hyper` is involved and requires a large number of pre-requisites. The
-configuration of `:hyper` can be done by creating a `config :hyper` entry in
-your `config.exs`. Refer to the given snippet for details on each
-configuration.
+Almost all host configuration — `work_dir`, the `[tools]` binary paths
+(`firecracker`, `jailer`, `dmsetup`, ...), and the `[jails]` table (`cgroup`,
+`uid_gid_range`) — lives in `/etc/hyper/config.toml` (the single source of
+truth shared with the setuid helper, shown above), and every node-local path
+(`jails`, `socks`, `scratch`, `layers`) is derived from `work_dir`. None of it is
+repeated in `config :hyper`.
+
+The node's own tool paths (`skopeo`, `mke2fs`, `umoci`, `suidhelper`) now live in
+the `[tools]` table of `/etc/hyper/config.toml` alongside the privileged binaries,
+so `config :hyper` holds only the per-architecture guest kernels (each with a
+default, so the block may be omitted):
```elixir
config :hyper,
- # TODO(markovejnovic): Remove this after it gets auto-downloaded.
- jailer_bin: "/opt/firecracker/jailer-v1.16.0-x86_64",
- # TODO(markovejnovic): Remove this after it gets auto-downloaded.
- firecracker_bin: "/opt/firecracker/firecracker-v1.16.0-x86_64",
- # You must create a parent cgroup on your system. Continue reading for
- # further details.
- cgroup_parent: "hyper",
- # TODO(markovejnovic): Merge these directories into one.
- jailer_chroot_base: "/srv/hyper/jails",
- socket_dir: "/srv/hyper/socks",
- scratch_dir: "/srv/hyper/scratch",
- # Hyper requires that each VM you pass
- uid_gid_range: {900_000, 999_999},
- layer_dir: "/srv/hyper/layers"
+ # Per-architecture guest kernel images placed on the host.
+ vmlinux: %{x86_64: "/srv/hyper/redist/vmlinux/vmlinux-x86_64"}
```
-
-
### Usage
diff --git a/lib/hyper.ex b/lib/hyper.ex
index fb0a626e..9b70f131 100644
--- a/lib/hyper.ex
+++ b/lib/hyper.ex
@@ -18,7 +18,7 @@ defmodule Hyper do
@spec create_vm(Hyper.Vm.Spec.t()) :: {:ok, Hyper.Vm.t()} | {:error, term()}
def create_vm(%Hyper.Vm.Spec{} = spec) do
with {:ok, arch} <- resolve_arch(spec.arch) do
- vm_id = gen_vm_id()
+ vm_id = Hyper.Vm.Id.generate()
spec = %{spec | arch: arch}
instance_spec = Hyper.Vm.Instance.spec(spec.type)
@@ -34,17 +34,13 @@ defmodule Hyper do
end
end
- @doc "Generate a fresh VM id (url-safe base64, dm-name compatible)."
- @spec gen_vm_id() :: Hyper.Vm.id()
- def gen_vm_id, do: Base.url_encode64(:crypto.strong_rand_bytes(9), padding: false)
-
@spec resolve_arch(Hyper.Vm.Instance.arch() | nil) ::
{:ok, Hyper.Vm.Instance.arch()} | {:error, term()}
defp resolve_arch(nil), do: Sys.Arch.current()
defp resolve_arch(arch), do: {:ok, arch}
@doc "Cluster-wide: which node currently runs `vm_id`? `nil` if unknown."
- @spec whereis(Hyper.Vm.id()) :: node() | nil
+ @spec whereis(Hyper.Vm.Id.t()) :: node() | nil
def whereis(vm_id), do: Hyper.Cluster.Routing.whereis(vm_id)
@doc """
@@ -57,7 +53,7 @@ defmodule Hyper do
died with its host, so "unknown" is the truthful answer. Only `:erpc`'s own
transport failures are swallowed; a genuine fault in the lookup still raises.
"""
- @spec id(Hyper.Vm.t()) :: Hyper.Vm.id() | nil
+ @spec id(Hyper.Vm.t()) :: Hyper.Vm.Id.t() | nil
def id(pid) when is_pid(pid) do
:erpc.call(node(pid), Hyper.Cluster.Routing, :id_for, [pid])
catch
diff --git a/lib/hyper/cluster/routing.ex b/lib/hyper/cluster/routing.ex
index c7802595..9491a132 100644
--- a/lib/hyper/cluster/routing.ex
+++ b/lib/hyper/cluster/routing.ex
@@ -28,8 +28,28 @@ defmodule Hyper.Cluster.Routing do
@spec via(term()) :: {:via, module(), {atom(), term()}}
def via(key), do: {:via, Horde.Registry, {@name, key}}
+ @doc """
+ Register the calling process under `key` from inside its own `init`.
+
+ Prefer this over starting a process with a `{:via, Horde.Registry, _}` name.
+ OTP's post-start name check (`gen:get_proc_name`) calls `whereis_name`
+ immediately after the synchronous `register`, but Horde materialises the name
+ into its local ETS only asynchronously, via the DeltaCRDT diff loop. Under
+ registry churn that read loses the race and OTP aborts startup with
+ `{:process_not_registered_via, Horde.Registry}`. Registering from within
+ `init` carries no such self-check, while leaving the name cluster-resolvable
+ through `via/1` once the diff propagates (callers already tolerate that lag).
+ """
+ @spec register_self(term()) :: :ok | {:error, {:already_registered, pid()}}
+ def register_self(key) do
+ case Horde.Registry.register(@name, key, nil) do
+ {:ok, _pid} -> :ok
+ {:error, {:already_registered, _pid}} = err -> err
+ end
+ end
+
@doc "Which node currently runs `vm_id`? `nil` if unknown."
- @spec whereis(Hyper.Vm.id()) :: node() | nil
+ @spec whereis(Hyper.Vm.Id.t()) :: node() | nil
@decorate with_span("Hyper.Cluster.Routing.whereis", include: [:vm_id])
def whereis(vm_id) do
case Horde.Registry.lookup(@name, {vm_id, :supervisor}) do
@@ -43,7 +63,7 @@ defmodule Hyper.Cluster.Routing do
replica via a registry match spec; intended to be called on the node that owns
`pid` (see `Hyper.id/1`).
"""
- @spec id_for(pid()) :: Hyper.Vm.id() | nil
+ @spec id_for(pid()) :: Hyper.Vm.Id.t() | nil
@decorate with_span("Hyper.Cluster.Routing.id_for")
def id_for(pid) when is_pid(pid) do
case Horde.Registry.select(@name, [
@@ -55,7 +75,7 @@ defmodule Hyper.Cluster.Routing do
end
@doc "Every VM the cluster currently knows about, paired with the node it runs on."
- @spec all() :: [{Hyper.Vm.id(), node()}]
+ @spec all() :: [{Hyper.Vm.Id.t(), node()}]
@decorate with_span("Hyper.Cluster.Routing.all")
def all do
@name
diff --git a/lib/hyper/cluster/scheduler.ex b/lib/hyper/cluster/scheduler.ex
index 1412a0b2..4877658c 100644
--- a/lib/hyper/cluster/scheduler.ex
+++ b/lib/hyper/cluster/scheduler.ex
@@ -16,6 +16,8 @@ defmodule Hyper.Cluster.Scheduler do
alias Hyper.Vm.Instance.Spec
alias Unit.Information
+ require Logger
+
use OpenTelemetryDecorator
@type layer_sizes :: [{Hyper.Layer.id(), Unit.Information.t()}]
@@ -45,8 +47,15 @@ defmodule Hyper.Cluster.Scheduler do
|> candidates(layers)
|> Enum.reduce_while({:error, :no_capacity}, fn node, acc ->
case attempt.(node) do
- {:ok, result} -> {:halt, {:ok, {node, result}}}
- {:error, _reason} -> {:cont, acc}
+ {:ok, result} ->
+ {:halt, {:ok, {node, result}}}
+
+ {:error, reason} ->
+ # The candidate fit the snapshot but refused at confirmation time.
+ # Log the real reason: otherwise an actual boot failure on the only
+ # candidate is indistinguishable from genuine `:no_capacity`.
+ Logger.warning("scheduler: #{inspect(node)} refused placement: #{inspect(reason)}")
+ {:cont, acc}
end
end)
end
diff --git a/lib/hyper/grpc/codec.ex b/lib/hyper/grpc/codec.ex
index 650040a1..9da622a0 100644
--- a/lib/hyper/grpc/codec.ex
+++ b/lib/hyper/grpc/codec.ex
@@ -85,15 +85,15 @@ defmodule Hyper.Grpc.Codec do
end
@doc "Convert a domain result to an outbound response message, or an error to `GRPC.RPCError`."
- @spec to_grpc({:created, Hyper.Vm.id(), node()}) :: CreateVmResponse.t()
+ @spec to_grpc({:created, Hyper.Vm.Id.t(), node()}) :: CreateVmResponse.t()
def to_grpc({:created, vm_id, node}) when is_binary(vm_id),
do: %CreateVmResponse{vm_id: vm_id, node: to_string(node)}
- @spec to_grpc({:located, Hyper.Vm.id(), node()}) :: GetVmResponse.t()
+ @spec to_grpc({:located, Hyper.Vm.Id.t(), node()}) :: GetVmResponse.t()
def to_grpc({:located, vm_id, node}),
do: %GetVmResponse{vm_id: vm_id, node: to_string(node)}
- @spec to_grpc({:vms, [{Hyper.Vm.id(), node()}]}) :: ListVmsResponse.t()
+ @spec to_grpc({:vms, [{Hyper.Vm.Id.t(), node()}]}) :: ListVmsResponse.t()
def to_grpc({:vms, vms}),
do: %ListVmsResponse{vms: Enum.map(vms, &vm/1)}
@@ -117,7 +117,7 @@ defmodule Hyper.Grpc.Codec do
def to_grpc({:exit, {:nodedown, _}}), do: rpc_error(:machine_unreachable)
def to_grpc({:exit, reason}), do: rpc_error({:stop_failed, reason})
- @spec vm({Hyper.Vm.id(), node()}) :: Vm.t()
+ @spec vm({Hyper.Vm.Id.t(), node()}) :: Vm.t()
defp vm({vm_id, node}), do: %Vm{vm_id: vm_id, node: to_string(node)}
@spec instance_type(instance_enum()) :: {:ok, Hyper.Vm.Instance.t()}
diff --git a/lib/hyper/img/db/lease.ex b/lib/hyper/img/db/lease.ex
index 7c737406..979b5f84 100644
--- a/lib/hyper/img/db/lease.ex
+++ b/lib/hyper/img/db/lease.ex
@@ -49,7 +49,7 @@ defmodule Hyper.Img.Db.Lease do
Upserts on `(node_id, vm_id)` - the same call both takes a fresh lease and
heartbeats a live one.
"""
- @spec bump(Hyper.Img.id(), Hyper.Vm.id(), Unit.Time.t()) ::
+ @spec bump(Hyper.Img.id(), Hyper.Vm.Id.t(), Unit.Time.t()) ::
{:ok, %__MODULE__{}} | {:error, Ecto.Changeset.t()}
@decorate with_span("Hyper.Img.Db.Lease.bump", include: [:image_id, :vm_id])
def bump(image_id, vm_id, ttl) do
@@ -72,7 +72,7 @@ defmodule Hyper.Img.Db.Lease do
Release the lease issued to the given node_id and the given vm_id. Since each VM only ever uses
one image, it is not necessary to specify the image id.
"""
- @spec release(Hyper.Vm.id()) :: :ok
+ @spec release(Hyper.Vm.Id.t()) :: :ok
@decorate with_span("Hyper.Img.Db.Lease.release", include: [:vm_id])
def release(vm_id) do
query = from(l in __MODULE__, where: l.node_id == ^to_string(node()) and l.vm_id == ^vm_id)
diff --git a/lib/hyper/node.ex b/lib/hyper/node.ex
index a5e0f63b..4f172e88 100644
--- a/lib/hyper/node.ex
+++ b/lib/hyper/node.ex
@@ -26,6 +26,11 @@ defmodule Hyper.Node do
real-time monitors backing the soft budget (`Hyper.Node.Budget.Soft`).
Lives here, not at the application root, because both are per-node and only
meaningful while this node hosts VMs.
+
+ * `Hyper.Node.Reaper` - a periodic, liveness-aware GC for per-VM host
+ resources (orphaned firecracker cgroups and `hyper-rw-*` dm volumes) stranded
+ by an unclean death whose vm_id never reboots. Started last so the VM
+ supervisor it consults for liveness is already up.
"""
use Supervisor
@@ -40,8 +45,14 @@ defmodule Hyper.Node do
def start_link(opts \\ []) do
case test_system() do
- :ok -> Supervisor.start_link(__MODULE__, opts, name: __MODULE__)
- {:error, reason} -> {:error, reason}
+ :ok ->
+ # Clear any dm/loop devices a previous unclean shutdown left behind,
+ # before the device-owning children start and collide with them.
+ :ok = Hyper.Node.Reclaim.run()
+ Supervisor.start_link(__MODULE__, opts, name: __MODULE__)
+
+ {:error, reason} ->
+ {:error, reason}
end
end
@@ -49,10 +60,16 @@ defmodule Hyper.Node do
def init(_opts) do
children = [
Hyper.Node.Users,
+ # Layer owns Hyper.Node.Layer.Registry, which Budget.Advertiser queries
+ # (via Hyper.Node.Layer.active/0) as it advertises on init - so Layer must
+ # be up first.
+ Hyper.Node.Layer,
Hyper.Node.Budget.Supervisor,
{DynamicSupervisor, name: @vm_sup, strategy: :one_for_one},
- Hyper.Node.Layer,
- Hyper.Node.Img
+ Hyper.Node.Img,
+ # Last child: :one_for_one starts in order, so the VM supervisor and Img are
+ # up before the reaper's first tick can read their liveness.
+ Hyper.Node.Reaper
]
Supervisor.init(children, strategy: :one_for_one)
@@ -63,7 +80,7 @@ defmodule Hyper.Node do
layer, resolve the kernel, and start the VM supervisor. The uid is freed and
the mutable layer torn down automatically when the VM supervisor dies.
"""
- @spec start_image_vm(Hyper.Vm.id(), Hyper.Vm.Spec.t()) :: {:ok, pid()} | {:error, term()}
+ @spec start_image_vm(Hyper.Vm.Id.t(), Hyper.Vm.Spec.t()) :: {:ok, pid()} | {:error, term()}
@decorate with_span("Hyper.Node.start_image_vm", include: [:vm_id, :spec])
def start_image_vm(vm_id, %Hyper.Vm.Spec{} = spec) do
with {:ok, uid} <- Users.claim(),
@@ -89,7 +106,7 @@ defmodule Hyper.Node do
end
@doc false
- @spec build_opts(Hyper.Vm.id(), Hyper.Vm.Spec.t(), Users.id(), pid(), Path.t()) ::
+ @spec build_opts(Hyper.Vm.Id.t(), Hyper.Vm.Spec.t(), Users.id(), pid(), Path.t()) ::
FireVMM.Opts.t()
def build_opts(vm_id, %Hyper.Vm.Spec{} = spec, uid, mutable, kernel) do
%FireVMM.Opts{
@@ -144,10 +161,11 @@ defmodule Hyper.Node do
@spec test_system :: :ok | {:error, term()}
def test_system do
with {:ok, _} <- Hyper.Cfg.Budget.load(),
- :ok <- Hyper.Node.FireVMM.Provider.ensure_installed(),
+ :ok <- check_firecracker_bins(),
:ok <- Hyper.Node.FireVMM.VmLinux.Provider.ensure_installed(),
:ok <- Hyper.Node.Vmlinux.test_system(),
:ok <- Hyper.Img.OciLoader.Umoci.ensure_installed(),
+ :ok <- Hyper.Img.OciLoader.test_system(),
:ok <- Hyper.Node.Users.test_system(),
:ok <- Hyper.Node.Layer.Repo.test_system(),
:ok <- Hyper.SuidHelper.test_system(),
@@ -157,6 +175,24 @@ defmodule Hyper.Node do
end
end
+ @spec check_firecracker_bins ::
+ :ok
+ | {:error, {:firecracker_bin_missing | :jailer_bin_missing, Path.t()}}
+ | {:error, :firecracker_not_configured | :jailer_not_configured}
+ defp check_firecracker_bins do
+ with {:fc, {:ok, fc}} <- {:fc, Hyper.Cfg.Tools.firecracker_configured()},
+ {:jail, {:ok, jail}} <- {:jail, Hyper.Cfg.Tools.jailer_configured()} do
+ cond do
+ not Sys.Posix.executable?(fc) -> {:error, {:firecracker_bin_missing, fc}}
+ not Sys.Posix.executable?(jail) -> {:error, {:jailer_bin_missing, jail}}
+ true -> :ok
+ end
+ else
+ {:fc, :error} -> {:error, :firecracker_not_configured}
+ {:jail, :error} -> {:error, :jailer_not_configured}
+ end
+ end
+
@spec check_helper_base(Path.t()) ::
:ok | {:error, {:suid_helper_base_mismatch, Path.t(), Path.t()}}
defp check_helper_base(base) do
diff --git a/lib/hyper/node/fire_vmm.ex b/lib/hyper/node/fire_vmm.ex
index 7e5603b2..22b069ac 100644
--- a/lib/hyper/node/fire_vmm.ex
+++ b/lib/hyper/node/fire_vmm.ex
@@ -34,7 +34,7 @@ defmodule Hyper.Node.FireVMM do
defstruct [:vm_id, :uid, :gid, :type, :arch, :mutable, :kernel, :boot_args]
@type t :: %__MODULE__{
- vm_id: Hyper.Vm.id(),
+ vm_id: Hyper.Vm.Id.t(),
uid: Hyper.Node.Users.id(),
gid: Hyper.Node.Users.id(),
type: Hyper.Vm.Instance.t(),
@@ -47,14 +47,15 @@ defmodule Hyper.Node.FireVMM do
@spec start_link(Opts.t()) :: Supervisor.on_start()
def start_link(opts) do
- Supervisor.start_link(__MODULE__, opts, name: via(opts.vm_id))
+ Supervisor.start_link(__MODULE__, opts)
end
+ @spec child_spec(Opts.t()) :: Supervisor.child_spec()
def child_spec(opts) do
# Keyed by VM id and :transient so a cleanly-stopped VM is not rebooted by
# the node-level DynamicSupervisor.
%{
- vm_id: {__MODULE__, opts.vm_id},
+ id: {__MODULE__, opts.vm_id},
start: {__MODULE__, :start_link, [opts]},
type: :supervisor,
restart: :transient
@@ -63,18 +64,26 @@ defmodule Hyper.Node.FireVMM do
@impl true
def init(opts) do
- children = [
- # Client must be registered before Core: Core starts the State machine,
- # which calls Client.run while waiting for the daemon's API. Client depends
- # only on vm_id (an independent peer), so it has no reverse dependency.
- {Client, %Client.Opts{vm_id: opts.vm_id}},
- {Core, opts}
- ]
+ # Self-register the cluster routing entry here rather than via a start name;
+ # see `Hyper.Cluster.Routing.register_self/1`. A fresh random vm_id never
+ # collides, so `:already_registered` only happens against a stale dead
+ # incarnation - decline the start and let the supervisor retry clean.
+ case Hyper.Cluster.Routing.register_self({opts.vm_id, :supervisor}) do
+ :ok ->
+ children = [
+ # Client must be registered before Core: Core starts the State machine,
+ # which calls Client.run while waiting for the daemon's API. Client
+ # depends only on vm_id (an independent peer), so no reverse dependency.
+ {Client, %Client.Opts{vm_id: opts.vm_id}},
+ {Core, opts}
+ ]
- Supervisor.init(children, strategy: :one_for_one)
- end
+ Supervisor.init(children, strategy: :one_for_one)
- defp via(vm_id), do: Hyper.Cluster.Routing.via({vm_id, :supervisor})
+ {:error, _} ->
+ :ignore
+ end
+ end
@doc "Test whether the system can run firecracker VMMs."
@spec test_system() :: :ok | {:error, term()}
diff --git a/lib/hyper/node/fire_vmm/chroot_jail.ex b/lib/hyper/node/fire_vmm/chroot_jail.ex
index edaa4f1d..cab7364c 100644
--- a/lib/hyper/node/fire_vmm/chroot_jail.ex
+++ b/lib/hyper/node/fire_vmm/chroot_jail.ex
@@ -27,7 +27,7 @@ defmodule Hyper.Node.FireVMM.ChrootJail do
`uid:gid`), and return `cold` with its kernel + rootfs paths rewritten to their
in-jail equivalents. Fails the boot if either artifact cannot be staged.
"""
- @spec stage(Hyper.Vm.id(), non_neg_integer(), non_neg_integer(), Cold.t()) ::
+ @spec stage(Hyper.Vm.Id.t(), non_neg_integer(), non_neg_integer(), Cold.t()) ::
{:ok, Cold.t()} | {:error, term()}
@decorate with_span("Hyper.Node.FireVMM.ChrootJail.stage", include: [:vm_id])
def stage(vm_id, uid, gid, %Cold{} = cold) do
diff --git a/lib/hyper/node/fire_vmm/client.ex b/lib/hyper/node/fire_vmm/client.ex
index c6901ae2..c41b5f1d 100644
--- a/lib/hyper/node/fire_vmm/client.ex
+++ b/lib/hyper/node/fire_vmm/client.ex
@@ -55,18 +55,15 @@ defmodule Hyper.Node.FireVMM.Client do
@type t :: %__MODULE__{socket_path: Path.t()}
end
+ # Prod path (vm_id, no explicit name) starts unnamed and self-registers in
+ # `init` - see `Hyper.Cluster.Routing.register_self/1`. A `:name` override
+ # (test stand-ins) is honoured as a plain local name and skips registration.
@spec start_link(Opts.t()) :: GenServer.on_start()
def start_link(%Opts{} = opts) do
- name =
- case opts.name do
- nil when not is_nil(opts.vm_id) -> via(opts.vm_id)
- other -> other
- end
-
- GenServer.start_link(__MODULE__, opts, gen_opts(name))
+ GenServer.start_link(__MODULE__, opts, gen_opts(opts.name))
end
- @spec via(Hyper.Vm.id()) :: GenServer.name()
+ @spec via(Hyper.Vm.Id.t()) :: GenServer.name()
def via(vm_id), do: Hyper.Cluster.Routing.via({vm_id, :client})
# Cap on a single Firecracker API call.
@@ -79,12 +76,26 @@ defmodule Hyper.Node.FireVMM.Client do
end
@impl true
- @spec init(Opts.t()) :: {:ok, State.t()}
+ @spec init(Opts.t()) :: {:ok, State.t()} | {:stop, {:already_registered, pid()}}
def init(%Opts{} = opts) do
- socket_path = opts.socket_path || Jailer.host_socket(opts.vm_id)
- {:ok, %State{socket_path: socket_path}}
+ with :ok <- register(opts) do
+ socket_path = opts.socket_path || Jailer.host_socket(opts.vm_id)
+ {:ok, %State{socket_path: socket_path}}
+ end
+ end
+
+ # Register cluster-wide under {vm_id, :client} on the prod path. With an
+ # explicit name (test stand-in), the name is the local registration, so skip.
+ @spec register(Opts.t()) :: :ok | {:stop, {:already_registered, pid()}}
+ defp register(%Opts{name: nil, vm_id: vm_id}) when not is_nil(vm_id) do
+ case Hyper.Cluster.Routing.register_self({vm_id, :client}) do
+ :ok -> :ok
+ {:error, reason} -> {:stop, reason}
+ end
end
+ defp register(%Opts{}), do: :ok
+
@impl true
def handle_call({:run, op_fun}, _from, %State{socket_path: socket_path} = state) do
{:reply, op_fun.(socket_path: socket_path), state}
diff --git a/lib/hyper/node/fire_vmm/core.ex b/lib/hyper/node/fire_vmm/core.ex
index 8ccf6180..34830b54 100644
--- a/lib/hyper/node/fire_vmm/core.ex
+++ b/lib/hyper/node/fire_vmm/core.ex
@@ -18,8 +18,9 @@ defmodule Hyper.Node.FireVMM.Core do
* firecracker crash -> the `Daemon` child exits; both restart; `Daemon`
resets the stale jail and relaunches, and the fresh controller cold-boots.
- `MuonTrap` kills the OS process when its port closes (teardown or BEAM death),
- so no firecracker process outlives the supervisor.
+ `Daemon` guarantees firecracker is dead on teardown via the helper's
+ `cgroup.kill` (MuonTrap's port-close kill misses the setsid'd firecracker), so
+ no firecracker process outlives a graceful supervisor shutdown.
"""
use Supervisor
@@ -28,9 +29,12 @@ defmodule Hyper.Node.FireVMM.Core do
alias Hyper.Node.FireVMM.Daemon
alias Hyper.Node.FireVMM.State
+ # Started unnamed: nothing resolves the core by name (it is addressed as a
+ # child of `Hyper.Node.FireVMM`), so it needs no registry entry - and avoids a
+ # needless racy Horde registration at startup.
@spec start_link(FireVMM.Opts.t()) :: Supervisor.on_start()
def start_link(opts) do
- Supervisor.start_link(__MODULE__, opts, name: Hyper.Cluster.Routing.via({opts.vm_id, :core}))
+ Supervisor.start_link(__MODULE__, opts)
end
@impl true
diff --git a/lib/hyper/node/fire_vmm/daemon.ex b/lib/hyper/node/fire_vmm/daemon.ex
index 7cbc159c..882a6cca 100644
--- a/lib/hyper/node/fire_vmm/daemon.ex
+++ b/lib/hyper/node/fire_vmm/daemon.ex
@@ -3,26 +3,37 @@ defmodule Hyper.Node.FireVMM.Daemon do
The jailed firecracker OS process for one microVM, as a static child of
`Hyper.Node.FireVMM.Core`.
- Lifecycle is supervisor-owned. On every (re)start it first resets any stale
- jail left by a prior incarnation - the firecracker jailer refuses to reuse an
- existing chroot - then builds the jailer command and runs it under
- `MuonTrap.Daemon`, which kills the OS process when its port closes (controller
- crash, container teardown, or BEAM death). So no firecracker process outlives
- the supervisor, and `Core`'s `:one_for_all` restarting this child (e.g. after a
- firecracker crash) cleanly cold-boots against a fresh jail.
-
- The supervised process *is* the `MuonTrap.Daemon` - `start_link/1` does the
- reset, then delegates and returns that pid.
+ A `trap_exit` GenServer that owns firecracker's lifetime end to end:
+
+ * on every (re)start it resets any stale jail left by a prior incarnation —
+ the firecracker jailer refuses to reuse an existing chroot — then launches
+ the jailer under a linked `MuonTrap.Daemon`. The supervised process is
+ `hyper-suidhelper jailer ...`, which `execve`s into the jailer (same pid).
+ * if firecracker exits, the linked `MuonTrap.Daemon` exits and this server
+ stops with that reason, so `Core`'s `:one_for_all` cold-boots the pair.
+ * on teardown it **guarantees firecracker is dead**: `MuonTrap`'s port-close
+ kills by process group, but the jailer `setsid`s firecracker into its own
+ session, so it escapes that kill and would leak (holding the cgroup, the
+ rootfs dm device, loop devices). `terminate/2` therefore runs the helper's
+ `cgroup.kill` teardown (`ChrootJail.remove`), which SIGKILLs the whole leaf
+ cgroup regardless of session. The same call on (re)start cleans up after a
+ prior incarnation the BEAM could not (a SIGKILL'd node leaves no
+ `terminate/2`); the periodic `Hyper.Node.Reaper` is the final backstop.
"""
+ use GenServer
+ use OpenTelemetryDecorator
+
alias Hyper.Node.FireVMM.{Jailer, Opts}
alias Hyper.SuidHelper
alias Unit.Time
- use OpenTelemetryDecorator
+ require Logger
@shutdown_timeout Time.s(5)
+ defstruct [:opts, :muontrap]
+
@spec child_spec(Opts.t()) :: Supervisor.child_spec()
def child_spec(%Opts{} = opts) do
%{
@@ -34,20 +45,82 @@ defmodule Hyper.Node.FireVMM.Daemon do
}
end
- @doc """
- Reset the VM's stale jail, then launch the jailer under `MuonTrap.Daemon` and
- return its pid. Fails (so the supervisor retries) if the reset cannot run.
- """
- @spec start_link(Opts.t()) :: {:ok, pid()} | {:error, term()}
- @decorate with_span("Hyper.Node.FireVMM.Daemon.start_link", include: [:id])
- def start_link(%Opts{vm_id: id} = opts) do
- with :ok <- SuidHelper.ChrootJail.remove(Jailer.chroot_dir(id), Jailer.cgroup_dir(id)) do
- cmd = Jailer.command(opts)
-
- case MuonTrap.Daemon.start_link(cmd.binary, cmd.args, []) do
- {:ok, pid} -> {:ok, pid}
- {:error, _} = err -> err
- end
+ @spec start_link(Opts.t()) :: GenServer.on_start()
+ def start_link(%Opts{} = opts) do
+ GenServer.start_link(__MODULE__, opts)
+ end
+
+ @impl true
+ @decorate with_span("Hyper.Node.FireVMM.Daemon.init", include: [:id])
+ def init(%Opts{vm_id: id} = opts) do
+ # Trap exits so the linked MuonTrap's exit reaches `handle_info` (not a silent
+ # link kill) and so `terminate/2` runs on supervisor shutdown.
+ Process.flag(:trap_exit, true)
+
+ with :ok <- reset_stale_jail(id),
+ {:ok, muontrap} <- launch(opts) do
+ {:ok, %__MODULE__{opts: opts, muontrap: muontrap}}
+ else
+ {:error, reason} -> {:stop, reason}
+ end
+ end
+
+ # firecracker (the linked MuonTrap.Daemon) exited: stop with its reason so
+ # `Core`'s `:one_for_all` discards the controller too and cold-boots the pair.
+ @impl true
+ def handle_info({:EXIT, muontrap, reason}, %__MODULE__{muontrap: muontrap} = state) do
+ {:stop, reason, state}
+ end
+
+ def handle_info(_msg, state), do: {:noreply, state}
+
+ # Guarantee firecracker is dead and its jail cleared. MuonTrap cannot kill the
+ # setsid'd firecracker; the helper's `cgroup.kill` can. Best-effort: a failure
+ # here is logged, and the `Reaper` will retry, but it must not crash teardown.
+ @impl true
+ @decorate with_span("Hyper.Node.FireVMM.Daemon.terminate", include: [:id])
+ def terminate(_reason, %__MODULE__{opts: %Opts{vm_id: id}}) do
+ case clear_jail(id) do
+ :ok ->
+ :ok
+
+ {:error, reason} ->
+ Logger.error("vm #{id}: teardown failed to clear jail: #{inspect(reason)}")
+ end
+ end
+
+ @spec reset_stale_jail(Hyper.Vm.Id.t()) :: :ok | {:error, term()}
+ defp reset_stale_jail(id), do: clear_jail(id)
+
+ @spec clear_jail(Hyper.Vm.Id.t()) :: :ok | {:error, term()}
+ defp clear_jail(id) do
+ SuidHelper.ChrootJail.remove(Jailer.chroot_dir(id), Jailer.cgroup_dir(id))
+ end
+
+ @spec launch(Opts.t()) :: {:ok, pid()} | {:error, term()}
+ defp launch(%Opts{vm_id: id} = opts) do
+ cmd = Jailer.command(opts)
+
+ # Surface what the jailed process actually does: `log_output` routes the
+ # helper/jailer/firecracker stdout+stderr (guest serial console included)
+ # to the Logger, and `exit_status_to_reason` turns MuonTrap's opaque
+ # `:error_exit_status` into `{:firecracker_exited, status}` so a crash
+ # report names the real exit code instead of hiding it.
+ daemon_opts = [
+ log_output: :info,
+ log_prefix: "vm #{id} firecracker: ",
+ stderr_to_stdout: true,
+ exit_status_to_reason: &{:firecracker_exited, &1}
+ ]
+
+ case MuonTrap.Daemon.start_link(cmd.binary, cmd.args, daemon_opts) do
+ {:ok, pid} ->
+ Logger.info("vm #{id}: jailer launched under MuonTrap (#{inspect(pid)})")
+ {:ok, pid}
+
+ {:error, reason} = err ->
+ Logger.error("vm #{id}: jailer failed to launch: #{inspect(reason)}")
+ err
end
end
end
diff --git a/lib/hyper/node/fire_vmm/jailer.ex b/lib/hyper/node/fire_vmm/jailer.ex
index 381f9d23..3b6deb23 100644
--- a/lib/hyper/node/fire_vmm/jailer.ex
+++ b/lib/hyper/node/fire_vmm/jailer.ex
@@ -1,27 +1,26 @@
defmodule Hyper.Node.FireVMM.Jailer do
@moduledoc """
- Builds the firecracker
- [jailer](https://github.com/firecracker-microvm/firecracker/blob/main/docs/jailer.md)
- command for one VM.
+ Builds the `hyper-suidhelper jailer` command for one VM.
- The jailer sets up the chroot, namespaces, cgroup (via `Hyper.Vm.Instance`
- flags) and drops privileges, then exec's firecracker. We run the jailer (not
- firecracker directly) under `MuonTrap.Daemon`; MuonTrap only supervises the OS
- process, the jailer owns isolation.
+ The BEAM does not invoke the jailer directly. Instead it calls the setuid helper
+ with the `jailer` subcommand; the helper reads the firecracker binary path, chroot
+ base, parent cgroup, and cgroup version from its trusted `/etc/hyper/config.toml`,
+ re-acquires root, and `execve`s the jailer (same pid, so `MuonTrap.Daemon` keeps
+ supervising it).
+
+ This means the BEAM passes only untrusted-origin values: `--id`, `--uid`, `--gid`,
+ repeated `--cgroup KEY=VALUE`, and `--api-sock`. The helper derives and validates
+ everything else; it also inserts the `--` separator between its own flags and
+ firecracker's flags.
Because firecracker is chrooted to `///root`, the API
- socket it opens at `/api.socket` lives at `host_socket` on the host - that's the
+ socket it opens at `/api.socket` lives at `host_socket` on the host — that's the
path the controller connects to.
-
- Host config: paths are derived from `config :hyper, work_dir: ...`. The
- firecracker + jailer binaries are installed under `/redist/firecracker`
- by `Hyper.Node.FireVMM.Provider`; the chroot base is `/jails`.
"""
use OpenTelemetryDecorator
alias Hyper.Node.FireVMM
- alias Hyper.Node.FireVMM.Provider
alias Hyper.Vm.Instance
# firecracker's API socket path *inside* the chroot.
@@ -36,6 +35,8 @@ defmodule Hyper.Node.FireVMM.Jailer do
first failure.
"""
+ alias Hyper.Cfg.{Dirs, Jails}
+
@doc "Run every pre-requisite check, halting at the first failure."
@spec run() :: :ok | {:error, term()}
def run do
@@ -61,7 +62,7 @@ defmodule Hyper.Node.FireVMM.Jailer do
end
defp parent_cgroup_present do
- if Sys.Linux.Cgroup.V2.named_exists?(Hyper.Cfg.Jails.cgroup()),
+ if Sys.Linux.Cgroup.V2.named_exists?(Jails.cgroup()),
do: :ok,
else: {:error, :missing_parent_cgroup}
end
@@ -77,7 +78,7 @@ defmodule Hyper.Node.FireVMM.Jailer do
end
defp chroot_writable do
- case Sys.Posix.ensure_writable_dir(Hyper.Cfg.Dirs.chroot_base()) do
+ case Sys.Posix.ensure_writable_dir(Dirs.chroot_base()) do
{:ok} -> :ok
{:error, reason} -> {:error, {:chroot_base_unavailable, reason}}
end
@@ -92,26 +93,11 @@ defmodule Hyper.Node.FireVMM.Jailer do
@spec command(FireVMM.Opts.t()) :: t()
def command(opts) do
args =
- [
- "--id",
- opts.vm_id,
- "--exec-file",
- Provider.firecracker_bin(),
- "--uid",
- to_string(opts.uid),
- "--gid",
- to_string(opts.gid),
- "--chroot-base-dir",
- Hyper.Cfg.Dirs.chroot_base(),
- "--cgroup-version",
- "2",
- "--parent-cgroup",
- Hyper.Cfg.Jails.cgroup()
- ] ++
+ ["jailer", "--id", opts.vm_id, "--uid", to_string(opts.uid), "--gid", to_string(opts.gid)] ++
cgroup_flags(opts.type) ++
- ["--", "--api-sock", "/" <> @jail_socket]
+ ["--api-sock", "/" <> @jail_socket]
- %{binary: Provider.jailer_bin(), args: args, host_socket: host_socket(opts.vm_id)}
+ %{binary: Hyper.Cfg.Tools.suidhelper(), args: args, host_socket: host_socket(opts.vm_id)}
end
# Find the appropriate jailer cgroup flags for the given instance type.
@@ -125,25 +111,41 @@ defmodule Hyper.Node.FireVMM.Jailer do
end
@doc "Host path of the VM's per-VM jail dir (`//`)."
- @spec chroot_dir(Hyper.Vm.id()) :: Path.t()
+ @spec chroot_dir(Hyper.Vm.Id.t()) :: Path.t()
def chroot_dir(id) do
Path.join([Hyper.Cfg.Dirs.chroot_base(), exec_name(), id])
end
@doc "Host path of the VM's chroot root (`///root`)."
- @spec chroot_root(Hyper.Vm.id()) :: Path.t()
+ @spec chroot_root(Hyper.Vm.Id.t()) :: Path.t()
def chroot_root(id) do
Path.join(chroot_dir(id), "root")
end
@doc """
- Host path of the VM's cgroup leaf (`/sys/fs/cgroup///`), the
- cgroup the jailer creates for firecracker. Reconstructed (the jailer owns its
+ Host path of the cgroup dir holding every VM's leaf (`/sys/fs/cgroup/`).
+ Its immediate subdir names are vm_ids - the leaves the jailer creates for
+ firecracker - so listing it (directories only; the dir also holds cgroup control
+ files) enumerates the cgroup leaves on this host.
+
+ Note the cgroup hierarchy has NO `` level: the jailer (cgroup v2,
+ `--parent-cgroup `) places firecracker directly at `/`,
+ unlike the chroot (`//`). Confirmed via
+ `/proc//cgroup` = `0:://`.
+ """
+ @spec cgroup_parent_dir() :: Path.t()
+ def cgroup_parent_dir do
+ Path.join("/sys/fs/cgroup", Hyper.Cfg.Jails.cgroup())
+ end
+
+ @doc """
+ Host path of the VM's cgroup leaf (`/sys/fs/cgroup//`), the cgroup
+ the jailer creates for firecracker. Reconstructed (the jailer owns its
placement) so a relaunch can clear the stale leaf left by a prior incarnation.
"""
- @spec cgroup_dir(Hyper.Vm.id()) :: Path.t()
+ @spec cgroup_dir(Hyper.Vm.Id.t()) :: Path.t()
def cgroup_dir(id) do
- Path.join(["/sys/fs/cgroup", Hyper.Cfg.Jails.cgroup(), exec_name(), id])
+ Path.join(cgroup_parent_dir(), id)
end
@doc """
@@ -153,7 +155,7 @@ defmodule Hyper.Node.FireVMM.Jailer do
derive it independently and are guaranteed to agree. We do not control where
the jailer places the socket, so the path is reconstructed here.
"""
- @spec host_socket(Hyper.Vm.id()) :: Path.t()
+ @spec host_socket(Hyper.Vm.Id.t()) :: Path.t()
def host_socket(id) do
Path.join([
Hyper.Cfg.Dirs.chroot_base(),
@@ -164,5 +166,5 @@ defmodule Hyper.Node.FireVMM.Jailer do
])
end
- defp exec_name, do: Path.basename(Provider.firecracker_bin())
+ defp exec_name, do: Path.basename(Hyper.Cfg.Tools.firecracker())
end
diff --git a/lib/hyper/node/fire_vmm/provider.ex b/lib/hyper/node/fire_vmm/provider.ex
deleted file mode 100644
index 0730c588..00000000
--- a/lib/hyper/node/fire_vmm/provider.ex
+++ /dev/null
@@ -1,92 +0,0 @@
-defmodule Hyper.Node.FireVMM.Provider do
- @moduledoc """
- Installs the firecracker release for the current architecture into
- `Hyper.Cfg.Dirs.firecracker_install_dir/0` (`/redist/firecracker`).
-
- `ensure_installed/0` is idempotent: if the binaries are already present and
- executable it returns `:ok` without touching the network. Otherwise it fetches
- the official firecracker release tarball for the detected architecture via
- `Redist.Targz` (download, SHA-256 verify, extract). The archive is
- extracted as-is - the binaries live under `release-v-/` exactly as
- firecracker ships them, and `firecracker_bin/0` / `jailer_bin/0` resolve those
- paths.
- """
-
- alias Redist.Targz
-
- @downloads %{
- x86_64: %{
- url:
- "https://github.com/firecracker-microvm/firecracker/releases/download/v1.16.0/firecracker-v1.16.0-x86_64.tgz",
- sha256: "bd04e26952d4e158085778c6230a0b383d2619c319182e27eaa9d61a212e92d6",
- firecracker: "release-v1.16.0-x86_64/firecracker-v1.16.0-x86_64",
- jailer: "release-v1.16.0-x86_64/jailer-v1.16.0-x86_64"
- },
- aarch64: %{
- url:
- "https://github.com/firecracker-microvm/firecracker/releases/download/v1.16.0/firecracker-v1.16.0-aarch64.tgz",
- sha256: "531c713cdbc37d4b8bc2533d851aabc0267096afa1768086a37672abb668efd7",
- firecracker: "release-v1.16.0-aarch64/firecracker-v1.16.0-aarch64",
- jailer: "release-v1.16.0-aarch64/jailer-v1.16.0-aarch64"
- }
- }
-
- @doc "Absolute path to the installed firecracker binary."
- @spec firecracker_bin() :: Path.t()
- def firecracker_bin, do: bin_path(:firecracker)
-
- @doc "Absolute path to the installed jailer binary."
- @spec jailer_bin() :: Path.t()
- def jailer_bin, do: bin_path(:jailer)
-
- @doc "Ensure the firecracker release is installed for this node."
- @spec ensure_installed() :: :ok | {:error, term()}
- def ensure_installed do
- with {:ok, arch} <- Sys.Arch.current() do
- dl = Map.fetch!(@downloads, arch)
-
- case check_install(dl) do
- :ok -> :ok
- {:error, :not_installed} -> install(dl)
- {:error, :bad_install} -> reinstall(dl)
- end
- end
- end
-
- # `:ok` if `dl`'s version-specific binaries are present and executable;
- # `{:error, :not_installed}` if the install dir is empty/absent; otherwise
- # `{:error, :bad_install}` - something is there but it's the wrong version,
- # partial, or corrupt, which we cannot fix in place because `Targz` keeps
- # existing files. The remedy is to wipe and reinstall.
- @spec check_install(map()) :: :ok | {:error, :not_installed | :bad_install}
- defp check_install(dl) do
- fc = Path.join(install_dir(), dl.firecracker)
- jail = Path.join(install_dir(), dl.jailer)
-
- cond do
- Sys.Posix.executable?(fc) and Sys.Posix.executable?(jail) ->
- :ok
-
- File.dir?(install_dir()) and File.ls!(install_dir()) != [] ->
- {:error, :bad_install}
-
- true ->
- {:error, :not_installed}
- end
- end
-
- defp install(dl), do: Targz.install(dl.url, dl.sha256, install_dir())
-
- defp reinstall(dl) do
- _ = File.rm_rf!(install_dir())
- install(dl)
- end
-
- defp bin_path(key) do
- {:ok, arch} = Sys.Arch.current()
- dl = Map.fetch!(@downloads, arch)
- Path.join(install_dir(), Map.fetch!(dl, key))
- end
-
- defp install_dir, do: Hyper.Cfg.Dirs.firecracker_install_dir()
-end
diff --git a/lib/hyper/node/fire_vmm/state.ex b/lib/hyper/node/fire_vmm/state.ex
index b390965c..eab53db5 100644
--- a/lib/hyper/node/fire_vmm/state.ex
+++ b/lib/hyper/node/fire_vmm/state.ex
@@ -39,12 +39,13 @@ defmodule Hyper.Node.FireVMM.State do
}
@enforce_keys [:opts]
- defstruct [:opts, :spec, :boot_deadline]
+ defstruct [:opts, :spec, :boot_deadline, api_granted: false]
@type t :: %State{
opts: Opts.t(),
spec: BootSpec.Cold.t() | nil,
- boot_deadline: integer() | nil
+ boot_deadline: integer() | nil,
+ api_granted: boolean()
}
# How long to wait for the daemon's API to come up before failing the boot.
@@ -54,11 +55,14 @@ defmodule Hyper.Node.FireVMM.State do
%{id: __MODULE__, start: {__MODULE__, :start_link, [opts]}}
end
- def start_link(%Opts{vm_id: id} = opts) do
- :gen_statem.start_link(via(id), __MODULE__, opts, [])
+ # Started unnamed; the controller self-registers under `{id, :state}` from
+ # `init` (see `Hyper.Cluster.Routing.register_self/1`). `stop/1` still resolves
+ # it cluster-wide through `via/1`.
+ def start_link(%Opts{} = opts) do
+ :gen_statem.start_link(__MODULE__, opts, [])
end
- @spec stop(Hyper.Vm.id()) :: :ok
+ @spec stop(Hyper.Vm.Id.t()) :: :ok
def stop(id) do
:gen_statem.call(via(id), :stop)
end
@@ -72,12 +76,21 @@ defmodule Hyper.Node.FireVMM.State do
# The daemon is already (being) started by `Core` as our sibling. Read the root
# device off the per-VM mutable layer, resolve the boot spec, set the readiness
# deadline, and start probing the API.
- def init(%Opts{mutable: mutable, kernel: kernel, boot_args: boot_args, type: type} = opts) do
- spec = BootSpec.resolve(boot_source(kernel, Mutable.blk_path(mutable), boot_args), type)
- deadline = System.monotonic_time(:millisecond) + Time.as_ms(@ready_timeout)
- data = %State{opts: opts, spec: spec, boot_deadline: deadline}
-
- {:ok, :awaiting_api, data, [{:state_timeout, 0, :probe}]}
+ def init(
+ %Opts{vm_id: id, mutable: mutable, kernel: kernel, boot_args: boot_args, type: type} =
+ opts
+ ) do
+ case Hyper.Cluster.Routing.register_self({id, :state}) do
+ :ok ->
+ spec = BootSpec.resolve(boot_source(kernel, Mutable.blk_path(mutable), boot_args), type)
+ deadline = System.monotonic_time(:millisecond) + Time.as_ms(@ready_timeout)
+ data = %State{opts: opts, spec: spec, boot_deadline: deadline}
+
+ {:ok, :awaiting_api, data, [{:state_timeout, 0, :probe}]}
+
+ {:error, reason} ->
+ {:stop, reason}
+ end
end
# Assemble the `Hyper.Vm.source()` BootSpec expects from the resolved kernel +
@@ -110,31 +123,81 @@ defmodule Hyper.Node.FireVMM.State do
@moduledoc "Poll the (already-launched) daemon's API socket, then advance to `:configuring`."
alias Hyper.Firecracker.Api.{InstanceInfo, Operations}
- alias Hyper.Node.FireVMM.{Client, Opts}
+ alias Hyper.Node.FireVMM.{Client, Jailer, Opts}
+ alias Hyper.SuidHelper.ChrootJail
alias Unit.Time
+ require Logger
+
# How often to probe the daemon's API while waiting for it.
@probe_interval Time.ms(50)
- # Poll the daemon's API until it answers, then configure. Give up if the
- # readiness deadline passes first.
+ # Hand the jailed API socket to the node user, then poll the daemon's API
+ # until it answers and advance to `:configuring`. Give up if the readiness
+ # deadline passes first. The grant must happen before the probe: firecracker
+ # creates the socket owned by the per-VM uid, so the unprivileged controller
+ # gets EACCES on connect until the helper chowns it to us.
def handle(:state_timeout, :probe, %{opts: %Opts{vm_id: id}} = data) do
- case Client.run(Client.via(id), &Operations.describe_instance/1) do
- {:ok, %InstanceInfo{}} ->
- {:next_state, :configuring, data, [{:state_timeout, 0, :configure}]}
-
- {:error, _reason} ->
- if System.monotonic_time(:millisecond) >= data.boot_deadline do
- {:stop, {:shutdown, {:boot_failed, :daemon_unready}}, data}
- else
- {:keep_state_and_data, [{:state_timeout, Time.as_ms(@probe_interval), :probe}]}
+ case ensure_api_granted(id, data) do
+ {:cont, data} ->
+ case Client.run(Client.via(id), &Operations.describe_instance/1) do
+ {:ok, %InstanceInfo{}} ->
+ {:next_state, :configuring, data, [{:state_timeout, 0, :configure}]}
+
+ {:error, reason} ->
+ keep_probing(id, data, reason)
end
+
+ {:wait, data, reason} ->
+ keep_probing(id, data, reason)
end
end
def handle({:call, from}, :stop, data) do
{:next_state, :stopping, data, [{:reply, from, :ok}]}
end
+
+ # Ensure the jailed API socket has been handed to the node user. Idempotent
+ # once granted (we record it in `data` so we ask the helper only once).
+ # `:socket_pending` means firecracker has not created the socket yet, so we
+ # keep waiting; a hard error is logged but also tolerated until the deadline
+ # (the probe that follows would fail with EACCES anyway and drive the stop).
+ @spec ensure_api_granted(Hyper.Vm.Id.t(), State.t()) ::
+ {:cont, State.t()} | {:wait, State.t(), term()}
+ defp ensure_api_granted(_id, %{api_granted: true} = data), do: {:cont, data}
+
+ defp ensure_api_granted(id, data) do
+ case ChrootJail.grant_api(Jailer.host_socket(id)) do
+ :ok ->
+ {:cont, %{data | api_granted: true}}
+
+ {:error, :socket_pending} ->
+ {:wait, data, :socket_pending}
+
+ {:error, reason} ->
+ Logger.warning("vm #{id}: grant-api failed: #{inspect(reason)}")
+ {:wait, data, {:grant_api, reason}}
+ end
+ end
+
+ # Keep waiting for readiness, re-arming the probe timer, unless the deadline
+ # has lapsed - then fail the boot, surfacing `reason` rather than swallowing
+ # it into a bare `:daemon_unready`. A persistent failure here points at the
+ # host->jail socket (path or, more often, the grant/permission step above).
+ @spec keep_probing(Hyper.Vm.Id.t(), State.t(), term()) ::
+ {:keep_state, State.t(), list()} | {:stop, term(), State.t()}
+ defp keep_probing(id, data, reason) do
+ if System.monotonic_time(:millisecond) >= data.boot_deadline do
+ Logger.warning(
+ "vm #{id}: firecracker API not reachable before deadline; " <>
+ "last probe error: #{inspect(reason)}"
+ )
+
+ {:stop, {:shutdown, {:boot_failed, {:daemon_unready, reason}}}, data}
+ else
+ {:keep_state, data, [{:state_timeout, Time.as_ms(@probe_interval), :probe}]}
+ end
+ end
end
defmodule Configuring do
@@ -145,8 +208,12 @@ defmodule Hyper.Node.FireVMM.State do
alias Hyper.Firecracker.Api.{InstanceActionInfo, Operations}
alias Hyper.Node.FireVMM.{BootSpec, ChrootJail, Client, Opts}
+ require Logger
+
# Stage boot artifacts into the chroot, then issue the pre-boot config and
- # start the guest.
+ # start the guest. Both failure paths end the boot via a supervisor restart,
+ # so log the reason here - otherwise it vanishes into the `:one_for_all`
+ # cycle and the VM just appears to relaunch for no visible cause.
def handle(
:state_timeout,
:configure,
@@ -155,11 +222,17 @@ defmodule Hyper.Node.FireVMM.State do
case ChrootJail.stage(id, uid, gid, spec) do
{:ok, jailed_spec} ->
case apply_spec(id, jailed_spec) do
- :ok -> {:next_state, :running, data}
- {:error, reason} -> {:stop, {:shutdown, {:boot_failed, reason}}, data}
+ :ok ->
+ Logger.info("vm #{id}: configured, guest starting")
+ {:next_state, :running, data}
+
+ {:error, reason} ->
+ Logger.error("vm #{id}: boot failed applying config: #{inspect(reason)}")
+ {:stop, {:shutdown, {:boot_failed, reason}}, data}
end
{:error, reason} ->
+ Logger.error("vm #{id}: boot failed staging jail: #{inspect(reason)}")
{:stop, {:shutdown, {:boot_failed, {:staging, reason}}}, data}
end
end
@@ -174,7 +247,7 @@ defmodule Hyper.Node.FireVMM.State do
# Cold boot, issued through the Client and aborting at the first error:
# machine-config -> boot-source -> each drive -> each NIC -> InstanceStart.
- @spec apply_spec(Hyper.Vm.id(), BootSpec.Cold.t()) :: :ok | {:error, term()}
+ @spec apply_spec(Hyper.Vm.Id.t(), BootSpec.Cold.t()) :: :ok | {:error, term()}
@decorate with_span("Hyper.Node.FireVMM.State.Configuring.apply_spec", include: [:id])
defp apply_spec(id, %BootSpec.Cold{} = cold) do
via = Client.via(id)
diff --git a/lib/hyper/node/img.ex b/lib/hyper/node/img.ex
index 320d4a7f..53bf2ae6 100644
--- a/lib/hyper/node/img.ex
+++ b/lib/hyper/node/img.ex
@@ -53,7 +53,7 @@ defmodule Hyper.Node.Img do
end
@doc "Create a per-VM mutable layer for `vm_id` over `img_id`."
- @spec create_mutable(Hyper.Img.id(), Hyper.Vm.id()) :: {:ok, pid()} | {:error, term()}
+ @spec create_mutable(Hyper.Img.id(), Hyper.Vm.Id.t()) :: {:ok, pid()} | {:error, term()}
def create_mutable(img_id, vm_id) do
case DynamicSupervisor.start_child(
@mutable_sup,
@@ -74,7 +74,7 @@ defmodule Hyper.Node.Img do
Serve `img` to `vm_id` for the duration of `callable`, holding a DB lease on the
image (and transitively its whole blob chain) the whole time.
"""
- @spec with_image(Hyper.Img.id(), Hyper.Vm.id(), (-> result)) :: result | {:error, term()}
+ @spec with_image(Hyper.Img.id(), Hyper.Vm.Id.t(), (-> result)) :: result | {:error, term()}
when result: var
def with_image(img, vm_id, callable) do
with_image_lease(img, vm_id, callable)
@@ -84,7 +84,8 @@ defmodule Hyper.Node.Img do
# even if `callable` raises. A background task re-bumps the lease for the whole
# run, so a long-lived VM never lets its claim lapse. If the lease cannot be
# taken, returns the error and never runs `callable`.
- @spec with_image_lease(Hyper.Img.id(), Hyper.Vm.id(), (-> result)) :: result | {:error, term()}
+ @spec with_image_lease(Hyper.Img.id(), Hyper.Vm.Id.t(), (-> result)) ::
+ result | {:error, term()}
when result: var
defp with_image_lease(img, vm_id, callable) do
ttl = Db.Lease.default_ttl()
@@ -104,7 +105,7 @@ defmodule Hyper.Node.Img do
# Re-bump the lease forever at 1/3 of the TTL, until killed. Runs in a task for the
# lifetime of `callable`; transient bump failures are swallowed so a DB hiccup
# can't tear down the VM - the next tick retries.
- @spec heartbeat(Hyper.Img.id(), Hyper.Vm.id(), Unit.Time.t()) :: no_return()
+ @spec heartbeat(Hyper.Img.id(), Hyper.Vm.Id.t(), Unit.Time.t()) :: no_return()
defp heartbeat(img, vm_id, ttl) do
Process.sleep(div(Unit.Time.as_ms(ttl), 3))
diff --git a/lib/hyper/node/img/mutable.ex b/lib/hyper/node/img/mutable.ex
index 57de460b..07f06269 100644
--- a/lib/hyper/node/img/mutable.ex
+++ b/lib/hyper/node/img/mutable.ex
@@ -29,7 +29,7 @@ defmodule Hyper.Node.Img.Mutable do
@enforce_keys [:img_id, :vm_id]
defstruct [:img_id, :vm_id]
- @type t :: %__MODULE__{img_id: Hyper.Img.id(), vm_id: Hyper.Vm.id()}
+ @type t :: %__MODULE__{img_id: Hyper.Img.id(), vm_id: Hyper.Vm.Id.t()}
end
defmodule State do
@@ -119,6 +119,12 @@ defmodule Hyper.Node.Img.Mutable do
@impl true
def handle_info(:idle_timeout, state), do: {:noreply, state}
+ @impl true
+ # Each privileged command runs through `System.cmd`, which links a transient
+ # port to this process; because we trap exits (for `terminate/2` teardown),
+ # the port's normal close is delivered here. Ignore it.
+ def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state}
+
@impl true
def terminate(_reason, state) do
# Destroy the thin volume, then release the image (its monitor on us also
@@ -129,7 +135,7 @@ defmodule Hyper.Node.Img.Mutable do
end
@doc false
- @spec dm_name(Hyper.Vm.id()) :: String.t()
+ @spec dm_name(Hyper.Vm.Id.t()) :: String.t()
def dm_name(vm_id), do: "hyper-rw-#{sanitize(vm_id)}"
defp sanitize(id), do: String.replace(id, ~r/[^A-Za-z0-9._-]/, "_")
diff --git a/lib/hyper/node/img/server.ex b/lib/hyper/node/img/server.ex
index 9587049c..24760465 100644
--- a/lib/hyper/node/img/server.ex
+++ b/lib/hyper/node/img/server.ex
@@ -128,6 +128,12 @@ defmodule Hyper.Node.Img.Server do
{:noreply, state}
end
+ @impl true
+ # Each privileged command runs through `System.cmd`, which links a transient
+ # port to this process; because we trap exits (for `terminate/2` teardown),
+ # the port's normal close is delivered here. Ignore it.
+ def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state}
+
@impl true
def terminate(_reason, %State{dm_names: dm_names}) do
# Remove top-down (a snapshot's origin is the device below it). Layers are
diff --git a/lib/hyper/node/img/thin_pool.ex b/lib/hyper/node/img/thin_pool.ex
index 7a50f7ec..dbab44c7 100644
--- a/lib/hyper/node/img/thin_pool.ex
+++ b/lib/hyper/node/img/thin_pool.ex
@@ -94,6 +94,13 @@ defmodule Hyper.Node.Img.ThinPool do
{:reply, :ok, id_free(state, id)}
end
+ @impl true
+ # Each privileged command runs through `System.cmd`, which links a transient
+ # port to this process; because we trap exits (for `terminate/2` teardown),
+ # the port's normal close is delivered here. Ignore it. An abnormal exit
+ # carries a non-`:normal` reason and falls through to the default handler.
+ def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state}
+
@impl true
def terminate(_reason, state) do
_ = SuidHelper.Dmsetup.remove(@pool_name)
diff --git a/lib/hyper/node/layer/server.ex b/lib/hyper/node/layer/server.ex
index 79e70491..d7249a74 100644
--- a/lib/hyper/node/layer/server.ex
+++ b/lib/hyper/node/layer/server.ex
@@ -119,6 +119,12 @@ defmodule Hyper.Node.Layer.Server do
{:noreply, state}
end
+ @impl true
+ # Each privileged command runs through `System.cmd`, which links a transient
+ # port to this process; because we trap exits (for `terminate/2` teardown),
+ # the port's normal close is delivered here. Ignore it.
+ def handle_info({:EXIT, _port, :normal}, state), do: {:noreply, state}
+
@impl true
def terminate(_reason, %State{blk_path: blk_path}) do
case SuidHelper.Losetup.detach(blk_path) do
diff --git a/lib/hyper/node/reaper.ex b/lib/hyper/node/reaper.ex
new file mode 100644
index 00000000..479a1726
--- /dev/null
+++ b/lib/hyper/node/reaper.ex
@@ -0,0 +1,166 @@
+defmodule Hyper.Node.Reaper do
+ @moduledoc """
+ Per-node periodic, liveness-aware garbage collector for per-VM host resources
+ that an unclean BEAM death can strand: a firecracker cgroup leaf and a
+ `hyper-rw-` dm volume whose owning processes' `terminate/2` never ran and
+ whose vm_id never reboots (so `Hyper.Node.Reclaim`, which runs once at boot, and
+ the relaunch-time cleanup in the FireVMM path, never get a chance to clear it).
+
+ Liveness is the whole point. The reaper consults two independent sources of
+ truth for "this vm is alive" (`Plan.orphans/3` removes their union from the
+ candidate set) and only ever touches `hyper-rw-*` dm names and per-VM cgroup
+ leaves - never `hyper-thinpool`, `hyper-img-*`, or a live VM's resources. A
+ candidate must also survive two consecutive ticks (`Plan.confirm/2`) before it
+ is reaped, so a VM caught mid-boot (resources present, not yet registered) is
+ given a grace tick rather than destroyed.
+
+ The decision logic lives in the pure `Hyper.Node.Reaper.Plan`; this module is a
+ thin I/O adapter that gathers the inputs, calls the plan, and executes the
+ best-effort, idempotent removals.
+ """
+
+ use GenServer
+ use OpenTelemetryDecorator
+ require Logger
+
+ alias Hyper.Cluster.Routing
+ alias Hyper.Node.FireVMM.Jailer
+ alias Hyper.Node.Img
+ alias Hyper.Node.Reaper.{Config, Plan}
+ alias Hyper.SuidHelper.{ChrootJail, Dmsetup}
+
+ @vm_sup Hyper.Node.VMSupervisor
+
+ defstruct [:config, last_orphans: MapSet.new()]
+
+ @type t :: %__MODULE__{config: Config.t(), last_orphans: MapSet.t(String.t())}
+
+ @spec start_link(keyword()) :: GenServer.on_start()
+ def start_link(opts \\ []) do
+ GenServer.start_link(__MODULE__, opts, name: __MODULE__)
+ end
+
+ @impl true
+ def init(opts) do
+ config = Keyword.get(opts, :config) || Config.load()
+
+ if config.enabled do
+ schedule(config)
+ {:ok, %__MODULE__{config: config}}
+ else
+ Logger.info("reaper: disabled by config; not starting")
+ :ignore
+ end
+ end
+
+ @impl true
+ def handle_info(:tick, state) do
+ state = sweep(state)
+ schedule(state.config)
+ {:noreply, state}
+ end
+
+ # Ignore any unexpected message (port noise, stale timers) rather than crashing.
+ def handle_info(_msg, state), do: {:noreply, state}
+
+ @spec schedule(Config.t()) :: reference()
+ defp schedule(config) do
+ Process.send_after(self(), :tick, Unit.Time.as_ms(config.interval))
+ end
+
+ @spec sweep(t()) :: t()
+ @decorate with_span("Hyper.Node.Reaper.sweep")
+ defp sweep(%__MODULE__{} = state) do
+ live = gather_live()
+ leaves = list_cgroup_leaves()
+ rw = Plan.rw_ids(list_rw_dm())
+
+ current = Plan.orphans(live, leaves, rw)
+ {to_reap, next} = Plan.confirm(current, state.last_orphans)
+
+ Enum.each(to_reap, &reap_one/1)
+ %{state | last_orphans: next}
+ end
+
+ # Over-counting "live" only defers a reap (safe); under-counting destroys a live
+ # VM (catastrophic). So union two independent liveness sources: the local VM
+ # supervisor's children and the cluster routing table's view of this node.
+ @spec gather_live() :: MapSet.t(String.t())
+ defp gather_live, do: MapSet.union(local_live(), routed_live())
+
+ @spec local_live() :: MapSet.t(String.t())
+ defp local_live do
+ case Process.whereis(@vm_sup) do
+ nil ->
+ MapSet.new()
+
+ _pid ->
+ @vm_sup
+ |> DynamicSupervisor.which_children()
+ |> Enum.map(fn {_, child, _, _} -> child end)
+ |> Enum.filter(&is_pid/1)
+ |> Enum.map(&Routing.id_for/1)
+ |> Enum.reject(&is_nil/1)
+ |> MapSet.new()
+ end
+ end
+
+ @spec routed_live() :: MapSet.t(String.t())
+ defp routed_live do
+ for {id, node} <- Routing.all(), node == node(), into: MapSet.new(), do: id
+ end
+
+ @spec list_cgroup_leaves() :: [String.t()]
+ defp list_cgroup_leaves do
+ parent = Jailer.cgroup_parent_dir()
+
+ case File.ls(parent) do
+ {:ok, names} ->
+ # The parent holds the per-VM leaf directories alongside cgroup control
+ # files (`cgroup.procs`, `cgroup.controllers`, ...); only the directories
+ # are vm_id leaves.
+ Enum.filter(names, &File.dir?(Path.join(parent, &1)))
+
+ {:error, :enoent} ->
+ []
+
+ {:error, reason} ->
+ Logger.warning("reaper: could not list cgroup leaves: #{inspect(reason)}")
+ []
+ end
+ end
+
+ @spec list_rw_dm() :: [String.t()]
+ defp list_rw_dm do
+ case Dmsetup.list() do
+ {:ok, names} ->
+ names
+
+ {:error, reason} ->
+ Logger.warning("reaper: could not list dm devices: #{inspect(reason)}")
+ []
+ end
+ end
+
+ @spec reap_one(String.t()) :: :ok
+ @decorate with_span("Hyper.Node.Reaper.reap_one", include: [:id])
+ defp reap_one(id) do
+ Logger.warning("reaper: reaping orphan vm #{id}")
+
+ log_result(
+ "chroot/cgroup",
+ id,
+ ChrootJail.remove(Jailer.chroot_dir(id), Jailer.cgroup_dir(id))
+ )
+
+ log_result("dm volume", id, Dmsetup.remove(Img.Mutable.dm_name(id)))
+ :ok
+ end
+
+ @spec log_result(String.t(), String.t(), :ok | {:error, term()}) :: :ok
+ defp log_result(_what, _id, :ok), do: :ok
+
+ defp log_result(what, id, {:error, reason}) do
+ Logger.warning("reaper: removing #{what} for #{id} failed: #{inspect(reason)}")
+ end
+end
diff --git a/lib/hyper/node/reaper/config.ex b/lib/hyper/node/reaper/config.ex
new file mode 100644
index 00000000..5c0da184
--- /dev/null
+++ b/lib/hyper/node/reaper/config.ex
@@ -0,0 +1,35 @@
+defmodule Hyper.Node.Reaper.Config do
+ @moduledoc """
+ Configuration for the per-node resource reaper (`Hyper.Node.Reaper`).
+
+ Every field has a default, so configuration is optional - set only what you want
+ to change. Durations are `Unit.Time` values, so (like `Hyper.Img.Db.Gc.Config`)
+ overrides belong in `config/runtime.exs`:
+
+ config :hyper, Hyper.Node.Reaper.Config,
+ enabled: true,
+ interval: Unit.Time.s(30)
+
+ Set `enabled: false` to turn the reaper off entirely - it then never starts.
+
+ ## Fields
+
+ * `enabled` - run the reaper at all (default `true`).
+ * `interval` - rest between reap ticks (default `60s`). The two-strike grace
+ means an orphan is reaped at most one interval after it is first seen.
+ """
+
+ @type t :: %__MODULE__{
+ enabled: boolean(),
+ interval: Unit.Time.t()
+ }
+
+ defstruct enabled: true,
+ interval: Unit.Time.s(60)
+
+ @doc "Build the config from app env, filling any unset field with its default."
+ @spec load() :: t()
+ def load do
+ struct!(__MODULE__, Application.get_env(:hyper, __MODULE__, []))
+ end
+end
diff --git a/lib/hyper/node/reaper/plan.ex b/lib/hyper/node/reaper/plan.ex
new file mode 100644
index 00000000..17b6dde9
--- /dev/null
+++ b/lib/hyper/node/reaper/plan.ex
@@ -0,0 +1,32 @@
+defmodule Hyper.Node.Reaper.Plan do
+ @moduledoc """
+ Pure reap-decision core for `Hyper.Node.Reaper`. No I/O. Every safety invariant
+ is a property of these functions: a live vm_id is never a candidate, only an
+ orphan seen on two consecutive ticks is reaped, and only `hyper-rw-*` dm names
+ yield candidates (so `hyper-thinpool` / `hyper-img-*` can never be reaped).
+ """
+
+ @rw_prefix "hyper-rw-"
+
+ @doc "vm_ids of orphan rw-volumes from raw `dmsetup ls` names (only `hyper-rw-*`)."
+ @spec rw_ids([String.t()]) :: [String.t()]
+ def rw_ids(dm_names) do
+ for name <- dm_names,
+ String.starts_with?(name, @rw_prefix),
+ do: String.replace_prefix(name, @rw_prefix, "")
+ end
+
+ @doc "Candidate orphans this tick: (cgroup leaves ∪ rw vm_ids) minus the live set."
+ @spec orphans(MapSet.t(String.t()), [String.t()], [String.t()]) :: MapSet.t(String.t())
+ def orphans(live, cgroup_leaves, rw_ids) do
+ cgroup_leaves
+ |> MapSet.new()
+ |> MapSet.union(MapSet.new(rw_ids))
+ |> MapSet.difference(live)
+ end
+
+ @doc "Two-strike grace: reap only ids that were also orphans last tick. Returns {to_reap, next_last}."
+ @spec confirm(MapSet.t(String.t()), MapSet.t(String.t())) ::
+ {MapSet.t(String.t()), MapSet.t(String.t())}
+ def confirm(current, last), do: {MapSet.intersection(current, last), current}
+end
diff --git a/lib/hyper/node/reclaim.ex b/lib/hyper/node/reclaim.ex
new file mode 100644
index 00000000..a358e0c0
--- /dev/null
+++ b/lib/hyper/node/reclaim.ex
@@ -0,0 +1,99 @@
+defmodule Hyper.Node.Reclaim do
+ @moduledoc """
+ Boot-time reclamation of device-mapper and loop devices orphaned by an unclean
+ shutdown (SIGKILL or `:erlang.halt`, where the owning GenServers' `terminate/2`
+ never ran to tear them down).
+
+ Hyper names every dm device it creates with a `hyper-` prefix (`hyper-thinpool`,
+ `hyper-rw-`, `hyper-img--`), so this removes exactly those - never an
+ operator's unrelated dm devices. Removal is leaf-first: a device still open by
+ another (the pool under a thin volume, a snapshot under the next in its chain)
+ refuses until its dependents are gone, so leftovers are retried until a pass
+ removes nothing new. Loop devices backing files under Hyper's data dirs are then
+ detached (the dm devices that held them are gone by that point).
+
+ Entirely best-effort: every failure is logged and boot continues. It runs once,
+ before any device-owning GenServer starts, so the freshly-booting node never
+ collides with its own previous instance's leftovers.
+ """
+
+ alias Hyper.SuidHelper.{Dmsetup, Losetup}
+
+ require Logger
+
+ @dm_prefix "hyper-"
+
+ @spec run() :: :ok
+ def run do
+ reclaim_dm()
+ reclaim_loops()
+ :ok
+ end
+
+ defp reclaim_dm do
+ case Dmsetup.list() do
+ {:ok, names} ->
+ case Enum.filter(names, &String.starts_with?(&1, @dm_prefix)) do
+ [] ->
+ :ok
+
+ stale ->
+ Logger.warning(
+ "reclaim: removing #{length(stale)} stale dm device(s): #{inspect(stale)}"
+ )
+
+ remove_dm(stale)
+ end
+
+ {:error, reason} ->
+ Logger.warning("reclaim: could not list dm devices: #{inspect(reason)}")
+ end
+ end
+
+ @spec remove_dm([String.t()]) :: :ok
+ defp remove_dm([]), do: :ok
+
+ defp remove_dm(names) do
+ {failed, removed_any?} =
+ Enum.reduce(names, {[], false}, fn name, {failed, any?} ->
+ case Dmsetup.remove(name) do
+ :ok -> {failed, true}
+ {:error, _} -> {[name | failed], any?}
+ end
+ end)
+
+ cond do
+ failed == [] -> :ok
+ # A pass made progress: a retry may now clear the devices that were still
+ # held by the ones just removed.
+ removed_any? -> remove_dm(failed)
+ true -> Logger.error("reclaim: could not remove dm devices: #{inspect(failed)}")
+ end
+ end
+
+ defp reclaim_loops do
+ case Losetup.list() do
+ {:ok, pairs} ->
+ for {dev, backing} <- pairs, under_data_dirs?(backing) do
+ case Losetup.detach(dev) do
+ :ok ->
+ :ok
+
+ {:error, reason} ->
+ Logger.warning("reclaim: could not detach #{dev} (#{backing}): #{inspect(reason)}")
+ end
+ end
+
+ :ok
+
+ {:error, reason} ->
+ Logger.warning("reclaim: could not list loop devices: #{inspect(reason)}")
+ end
+ end
+
+ @spec under_data_dirs?(Path.t()) :: boolean()
+ defp under_data_dirs?(path) do
+ String.starts_with?(path, Hyper.Cfg.Dirs.scratch_dir() <> "/") or
+ String.starts_with?(path, Hyper.Cfg.Dirs.layer_dir() <> "/")
+ end
+end
diff --git a/lib/hyper/suid_helper.ex b/lib/hyper/suid_helper.ex
index 1a11992a..74dd73bf 100644
--- a/lib/hyper/suid_helper.ex
+++ b/lib/hyper/suid_helper.ex
@@ -16,7 +16,7 @@ defmodule Hyper.SuidHelper do
self-test and reports the base path it was compiled against.
"""
- alias Hyper.SuidHelper.{Blockdev, Dmsetup, Expected, Losetup}
+ alias Hyper.SuidHelper.{Dmsetup, Expected}
use OpenTelemetryDecorator
@@ -51,18 +51,20 @@ defmodule Hyper.SuidHelper do
end
@doc """
- Check that the setuid helper and every tool it execs are usable on this
- machine: the helper binary is present, is the build this release expects
- (`verify_version/0`), then each tool submodule's own check.
+ Check that the setuid helper is usable on this machine: the helper binary is
+ present, is the build this release expects (`verify_version/0`), and the kernel
+ exposes the device-mapper targets we need (`Dmsetup.test_system/0`, which also
+ exercises the helper's configured `dmsetup` binary).
+
+ The `losetup`/`blockdev` binaries are validated by the helper the first time
+ each is used; their paths live in the helper's own config, not here.
"""
@spec test_system() :: :ok | {:error, term()}
@decorate with_span("Hyper.SuidHelper.test_system")
def test_system do
with :ok <- helper_present(),
- :ok <- verify_version(),
- :ok <- Losetup.test_system(),
- :ok <- Dmsetup.test_system() do
- Blockdev.test_system()
+ :ok <- verify_version() do
+ Dmsetup.test_system()
end
end
diff --git a/lib/hyper/suid_helper/blockdev.ex b/lib/hyper/suid_helper/blockdev.ex
index be9bc39e..760fae84 100644
--- a/lib/hyper/suid_helper/blockdev.ex
+++ b/lib/hyper/suid_helper/blockdev.ex
@@ -11,18 +11,9 @@ defmodule Hyper.SuidHelper.Blockdev do
@spec device_sectors(Path.t()) :: {:ok, pos_integer()} | {:error, err()}
@decorate with_span("Hyper.SuidHelper.Blockdev.device_sectors", include: [:path])
def device_sectors(path) do
- case SuidHelper.exec(["blockdev", "--bin", Hyper.Cfg.Tools.blockdev(), "--getsz", path]) do
+ case SuidHelper.exec(["blockdev", "--getsz", path]) do
{:ok, %{"sectors" => n}} -> {:ok, n}
{:error, _} = err -> err
end
end
-
- @doc "Check the blockdev binary is present."
- @spec test_system() :: :ok | {:error, :blockdev_not_found}
- @decorate with_span("Hyper.SuidHelper.Blockdev.test_system")
- def test_system do
- if System.find_executable(Hyper.Cfg.Tools.blockdev()),
- do: :ok,
- else: {:error, :blockdev_not_found}
- end
end
diff --git a/lib/hyper/suid_helper/chroot_jail.ex b/lib/hyper/suid_helper/chroot_jail.ex
index fdc0f1ce..cf8370c0 100644
--- a/lib/hyper/suid_helper/chroot_jail.ex
+++ b/lib/hyper/suid_helper/chroot_jail.ex
@@ -57,4 +57,25 @@ defmodule Hyper.SuidHelper.ChrootJail do
{:error, _} = err -> err
end
end
+
+ @doc """
+ Hand the firecracker API `socket` to the node user so the unprivileged
+ controller can `connect()` to it. The jailer drops firecracker to a per-VM
+ uid/gid and chroots it, so the socket it creates is owned by that per-VM id and
+ the node (a different uid) gets `EACCES` on connect. The helper chowns just
+ that one socket to its caller (the node user) and chmods it `0660`, leaving the
+ rest of the per-VM isolation intact.
+
+ Returns `{:error, :socket_pending}` while firecracker has not yet created the
+ socket, so the caller can keep waiting.
+ """
+ @spec grant_api(Path.t()) :: :ok | {:error, :socket_pending} | {:error, err()}
+ @decorate with_span("Hyper.SuidHelper.ChrootJail.grant_api", include: [:socket])
+ def grant_api(socket) do
+ case SuidHelper.exec(["chroot-jail", "grant-api", "--socket", socket]) do
+ {:ok, %{"result" => "granted"}} -> :ok
+ {:ok, %{"result" => "pending"}} -> {:error, :socket_pending}
+ {:error, _} = err -> err
+ end
+ end
end
diff --git a/lib/hyper/suid_helper/dmsetup.ex b/lib/hyper/suid_helper/dmsetup.ex
index c8dabc88..1be0bbfe 100644
--- a/lib/hyper/suid_helper/dmsetup.ex
+++ b/lib/hyper/suid_helper/dmsetup.ex
@@ -59,14 +59,7 @@ defmodule Hyper.SuidHelper.Dmsetup do
@spec remove(String.t()) :: :ok | {:error, err()}
@decorate with_span("Hyper.SuidHelper.Dmsetup.remove", include: [:name])
def remove(name) do
- case SuidHelper.exec([
- "dmsetup",
- "--bin",
- Hyper.Cfg.Tools.dmsetup(),
- "remove",
- "--retry",
- name
- ]) do
+ case SuidHelper.exec(["dmsetup", "remove", "--retry", name]) do
{:ok, _} -> :ok
{:error, _} = err -> err
end
@@ -76,39 +69,31 @@ defmodule Hyper.SuidHelper.Dmsetup do
@spec message(String.t(), String.t()) :: :ok | {:error, err()}
@decorate with_span("Hyper.SuidHelper.Dmsetup.message", include: [:name, :message])
def message(name, message) do
- argv =
- ["dmsetup", "--bin", Hyper.Cfg.Tools.dmsetup(), "message", name, "--message", message]
-
- case SuidHelper.exec(argv) do
+ case SuidHelper.exec(["dmsetup", "message", name, "--message", message]) do
{:ok, _} -> :ok
{:error, _} = err -> err
end
end
@doc """
- Check the dmsetup binary is present and the kernel exposes the dm targets we
- use (snapshot, thin, thin-pool).
+ Verify the kernel exposes the dm targets we use (snapshot, thin, thin-pool).
+
+ Routes through the setuid helper: `dmsetup targets` opens `/dev/mapper/control`,
+ which needs root, and the BEAM runs unprivileged. The helper validates its
+ configured `dmsetup` binary before running it, so a missing or unsafe binary
+ surfaces here too.
"""
@spec test_system() :: :ok | {:error, term()}
@decorate with_span("Hyper.SuidHelper.Dmsetup.test_system")
def test_system do
- if System.find_executable(Hyper.Cfg.Tools.dmsetup()),
- do: test_targets(),
- else: {:error, :dmsetup_not_found}
- end
-
- @doc "Verify the kernel exposes the dm targets we use (snapshot, thin, thin-pool)."
- @spec test_targets() :: :ok | {:error, term()}
- @decorate with_span("Hyper.SuidHelper.Dmsetup.test_targets")
- def test_targets do
- case System.cmd(Hyper.Cfg.Tools.dmsetup(), ["targets"], stderr_to_stdout: true) do
- {out, 0} ->
+ case SuidHelper.exec(["dmsetup", "targets"]) do
+ {:ok, %{"output" => out}} ->
have = parse_targets(out)
missing = Enum.reject(@required_targets, &MapSet.member?(have, &1))
if missing == [], do: :ok, else: {:error, {:missing_dm_targets, missing}}
- {out, code} ->
- {:error, {:dmsetup_targets_failed, code, String.trim(out)}}
+ {:error, {code, msg}} ->
+ {:error, {:dmsetup_targets_failed, code, msg}}
end
end
@@ -122,6 +107,32 @@ defmodule Hyper.SuidHelper.Dmsetup do
|> MapSet.new()
end
+ @doc "Names of every device-mapper device currently present on this host."
+ @spec list() :: {:ok, [String.t()]} | {:error, err()}
+ @decorate with_span("Hyper.SuidHelper.Dmsetup.list")
+ def list do
+ case SuidHelper.exec(["dmsetup", "ls"]) do
+ {:ok, %{"output" => out}} -> {:ok, parse_names(out)}
+ {:error, _} = err -> err
+ end
+ end
+
+ @doc false
+ @spec parse_names(String.t()) :: [String.t()]
+ def parse_names(out) do
+ case String.trim(out) do
+ # `dmsetup ls` prints this sentinel (not a device row) when there are none.
+ "No devices found" ->
+ []
+
+ _ ->
+ out
+ |> String.split("\n", trim: true)
+ |> Enum.map(&(&1 |> String.split() |> List.first()))
+ |> Enum.reject(&is_nil/1)
+ end
+ end
+
@doc false
@spec snapshot_table(Path.t(), Path.t(), pos_integer(), pos_integer()) :: String.t()
def snapshot_table(origin_dev, cow_dev, sectors, chunk_sectors) do
@@ -145,9 +156,7 @@ defmodule Hyper.SuidHelper.Dmsetup do
# create flags (e.g. `--readonly`). Returns the `/dev/mapper/` path.
@spec create(String.t(), String.t(), [String.t()]) :: {:ok, Path.t()} | {:error, err()}
defp create(name, table, flags) do
- argv =
- ["dmsetup", "--bin", Hyper.Cfg.Tools.dmsetup(), "create", name] ++
- flags ++ ["--table", table]
+ argv = ["dmsetup", "create", name] ++ flags ++ ["--table", table]
case SuidHelper.exec(argv) do
{:ok, %{"device" => dev}} -> {:ok, dev}
diff --git a/lib/hyper/suid_helper/losetup.ex b/lib/hyper/suid_helper/losetup.ex
index 9ee9191a..3abf4fba 100644
--- a/lib/hyper/suid_helper/losetup.ex
+++ b/lib/hyper/suid_helper/losetup.ex
@@ -11,7 +11,7 @@ defmodule Hyper.SuidHelper.Losetup do
@spec attach_ro(Path.t()) :: {:ok, Path.t()} | {:error, err()}
@decorate with_span("Hyper.SuidHelper.Losetup.attach_ro", include: [:path])
def attach_ro(path) do
- case SuidHelper.exec(["losetup", "--bin", Hyper.Cfg.Tools.losetup(), "attach", path]) do
+ case SuidHelper.exec(["losetup", "attach", path]) do
{:ok, %{"device" => dev}} -> {:ok, dev}
{:error, _} = err -> err
end
@@ -21,14 +21,7 @@ defmodule Hyper.SuidHelper.Losetup do
@spec attach_rw(Path.t()) :: {:ok, Path.t()} | {:error, err()}
@decorate with_span("Hyper.SuidHelper.Losetup.attach_rw", include: [:path])
def attach_rw(path) do
- case SuidHelper.exec([
- "losetup",
- "--bin",
- Hyper.Cfg.Tools.losetup(),
- "attach",
- "--rw",
- path
- ]) do
+ case SuidHelper.exec(["losetup", "attach", "--rw", path]) do
{:ok, %{"device" => dev}} -> {:ok, dev}
{:error, _} = err -> err
end
@@ -38,18 +31,34 @@ defmodule Hyper.SuidHelper.Losetup do
@spec detach(Path.t()) :: :ok | {:error, err()}
@decorate with_span("Hyper.SuidHelper.Losetup.detach", include: [:dev])
def detach(dev) do
- case SuidHelper.exec(["losetup", "--bin", Hyper.Cfg.Tools.losetup(), "detach", dev]) do
+ case SuidHelper.exec(["losetup", "detach", dev]) do
{:ok, _} -> :ok
{:error, _} = err -> err
end
end
- @doc "Check the losetup binary is present."
- @spec test_system() :: :ok | {:error, :losetup_not_found}
- @decorate with_span("Hyper.SuidHelper.Losetup.test_system")
- def test_system do
- if System.find_executable(Hyper.Cfg.Tools.losetup()),
- do: :ok,
- else: {:error, :losetup_not_found}
+ @doc "Currently-attached loop devices as `{device, backing_file}` pairs."
+ @spec list() :: {:ok, [{Path.t(), Path.t()}]} | {:error, err()}
+ @decorate with_span("Hyper.SuidHelper.Losetup.list")
+ def list do
+ case SuidHelper.exec(["losetup", "list"]) do
+ {:ok, %{"output" => out}} -> {:ok, parse_list(out)}
+ {:error, _} = err -> err
+ end
+ end
+
+ @doc false
+ @spec parse_list(String.t()) :: [{Path.t(), Path.t()}]
+ def parse_list(out) do
+ out
+ |> String.split("\n", trim: true)
+ |> Enum.flat_map(fn line ->
+ # `NAME BACK-FILE` rows; a loop with no backing file has only one column
+ # (nothing for us to reclaim by file), so skip it.
+ case String.split(line, " ", parts: 2, trim: true) do
+ [dev, backing] -> [{dev, String.trim(backing)}]
+ _ -> []
+ end
+ end)
end
end
diff --git a/lib/hyper/vm.ex b/lib/hyper/vm.ex
index 0b99da8d..3001e41c 100644
--- a/lib/hyper/vm.ex
+++ b/lib/hyper/vm.ex
@@ -10,7 +10,6 @@ defmodule Hyper.Vm do
@dialyzer {:nowarn_function, [fast_fork: 1, fork: 1]}
@type t :: pid()
- @type id :: String.t()
@typedoc """
What a VM boots from: explicit, already-jail-visible artifact paths for a cold
diff --git a/lib/hyper/vm/id.ex b/lib/hyper/vm/id.ex
new file mode 100644
index 00000000..7c43d29e
--- /dev/null
+++ b/lib/hyper/vm/id.ex
@@ -0,0 +1,27 @@
+defmodule Hyper.Vm.Id do
+ @moduledoc """
+ A microVM id and its generator.
+
+ An id is a `v` prefix followed by lowercase base32 of 10 random bytes, charset
+ `[a-z2-7]` - alphanumeric only, no `-`, `_`, or other punctuation. That charset
+ is the intersection of three independent constraints the id must satisfy at
+ once:
+
+ * firecracker rejects `_` in an instance id (`InvalidInstanceId`);
+ * dm/jailer names must not start with `-`;
+ * registry keys and chroot path components stay trivially safe.
+ """
+
+ @type t :: String.t()
+
+ @doc """
+ Generate a fresh, random VM id (see the module doc for the charset contract).
+
+ The previous base64url encoding emitted `-` and `_`, so it could produce ids
+ firecracker refused at boot (`Invalid char (_)`).
+ """
+ @spec generate() :: t()
+ def generate do
+ "v" <> Base.encode32(:crypto.strong_rand_bytes(10), padding: false, case: :lower)
+ end
+end
diff --git a/lib/mix/tasks/firecracker.install.ex b/lib/mix/tasks/firecracker.install.ex
new file mode 100644
index 00000000..38bb7ad1
--- /dev/null
+++ b/lib/mix/tasks/firecracker.install.ex
@@ -0,0 +1,148 @@
+defmodule Mix.Tasks.Firecracker.Install do
+ @shortdoc "Download, verify, and install the pinned Firecracker release"
+ @moduledoc """
+ Downloads, verifies, and installs the pinned Firecracker release (v1.16.0)
+ for the current CPU architecture.
+
+ mix firecracker.install [--prefix DIR]
+
+ Steps performed:
+
+ 1. Detects the CPU architecture (`x86_64` or `aarch64`).
+ 2. Downloads the release tarball and verifies its SHA-256 checksum.
+ 3. Extracts the tarball, then copies the binaries to `/firecracker`
+ and `/jailer` using the **bare basenames** `firecracker` and
+ `jailer`. The setuid helper validates binaries via `SafeBin<"firecracker">`
+ and `SafeBin<"jailer">`, which match on basename only — version-stamped
+ names such as `firecracker-v1.16.0-x86_64` are rejected unconditionally.
+ 4. Marks both binaries executable (`0o755`).
+ 5. Prints the `/etc/hyper/config.toml` snippet the operator needs to paste.
+
+ This task installs **unprivileged** binaries and prints configuration.
+ Privilege at runtime is handled by `hyper-suidhelper` (the setuid helper).
+ This task does **not** setuid `firecracker` or `jailer`. Install and setuid
+ the helper separately with `mix suidhelper.install`.
+
+ ## Options
+
+ * `--prefix DIR` — installation directory (default: `/opt/firecracker`).
+
+ ## Security requirements
+
+ After installing, ensure:
+
+ * The binaries are root-owned and **not** group- or world-writable.
+ The suidhelper refuses binaries with loose permissions.
+ * `/etc/hyper/config.toml` is root-owned with mode `0644`.
+ """
+
+ use Mix.Task
+
+ @version "1.16.0"
+ @default_prefix "/opt/firecracker"
+
+ @impl Mix.Task
+ @spec run([String.t()]) :: :ok
+ def run(argv) do
+ {opts, _rest, _invalid} = OptionParser.parse(argv, strict: [prefix: :string])
+ prefix = Keyword.get(opts, :prefix, @default_prefix)
+
+ arch = detect_arch!()
+
+ case Application.ensure_all_started(:req) do
+ {:ok, _} -> :ok
+ {:error, {reason, app}} -> Mix.raise("Cannot start HTTP client #{app}: #{inspect(reason)}")
+ end
+
+ install!(release_for(arch), prefix)
+ print_config(prefix)
+ end
+
+ defp detect_arch! do
+ case Sys.Arch.current() do
+ {:ok, arch} ->
+ arch
+
+ {:error, {:unsupported_arch, raw}} ->
+ Mix.raise(
+ "Unsupported CPU architecture #{inspect(raw)}; " <>
+ "Firecracker supports x86_64 and aarch64."
+ )
+ end
+ end
+
+ defp release_for(:x86_64) do
+ %{
+ url:
+ "https://github.com/firecracker-microvm/firecracker/releases/download/" <>
+ "v#{@version}/firecracker-v#{@version}-x86_64.tgz",
+ sha256: "bd04e26952d4e158085778c6230a0b383d2619c319182e27eaa9d61a212e92d6",
+ firecracker_path: "release-v#{@version}-x86_64/firecracker-v#{@version}-x86_64",
+ jailer_path: "release-v#{@version}-x86_64/jailer-v#{@version}-x86_64"
+ }
+ end
+
+ defp release_for(:aarch64) do
+ %{
+ url:
+ "https://github.com/firecracker-microvm/firecracker/releases/download/" <>
+ "v#{@version}/firecracker-v#{@version}-aarch64.tgz",
+ sha256: "531c713cdbc37d4b8bc2533d851aabc0267096afa1768086a37672abb668efd7",
+ firecracker_path: "release-v#{@version}-aarch64/firecracker-v#{@version}-aarch64",
+ jailer_path: "release-v#{@version}-aarch64/jailer-v#{@version}-aarch64"
+ }
+ end
+
+ defp install!(
+ %{url: url, sha256: sha256, firecracker_path: fc_rel, jailer_path: jailer_rel},
+ prefix
+ ) do
+ extract_dir = Path.join(prefix, ".firecracker-extract")
+
+ Mix.shell().info("Downloading Firecracker v#{@version} from #{url} ...")
+
+ case Redist.Targz.install(url, sha256, extract_dir) do
+ :ok -> :ok
+ {:error, reason} -> Mix.raise("Download from #{url} failed: #{inspect(reason)}")
+ end
+
+ dst_fc = Path.join(prefix, "firecracker")
+ dst_jailer = Path.join(prefix, "jailer")
+
+ # The release ships version-stamped names; copy to bare basenames so SafeBin
+ # validation passes. The helper matches on basename, not full path.
+ File.cp!(Path.join(extract_dir, fc_rel), dst_fc)
+ File.cp!(Path.join(extract_dir, jailer_rel), dst_jailer)
+ File.chmod!(dst_fc, 0o755)
+ File.chmod!(dst_jailer, 0o755)
+ _ = File.rm_rf!(extract_dir)
+
+ Mix.shell().info("Installed #{dst_fc}")
+ Mix.shell().info("Installed #{dst_jailer}")
+ end
+
+ defp print_config(prefix) do
+ fc = Path.join(prefix, "firecracker")
+ jailer = Path.join(prefix, "jailer")
+
+ # This task runs unprivileged, so the binaries land owned by the invoking
+ # user. The suidhelper's SafeBin refuses any binary not owned by root and not
+ # free of group/other write bits, so the operator MUST chown/chmod them or
+ # every jailer launch fails closed. Print the exact commands rather than a
+ # vague "ensure root-owned".
+ Mix.shell().info("""
+
+ Almost done. Run these as root so the setuid helper will accept the binaries
+ (it refuses any jailer/firecracker not owned by root):
+
+ sudo chown root:root #{fc} #{jailer}
+ sudo chmod 0755 #{fc} #{jailer}
+
+ Then add to /etc/hyper/config.toml (file: root-owned, mode 0644):
+
+ [tools]
+ firecracker = "#{fc}"
+ jailer = "#{jailer}"
+ """)
+ end
+end
diff --git a/lib/mix/tasks/suidhelper.install.ex b/lib/mix/tasks/suidhelper.install.ex
new file mode 100644
index 00000000..2085de3c
--- /dev/null
+++ b/lib/mix/tasks/suidhelper.install.ex
@@ -0,0 +1,93 @@
+defmodule Mix.Tasks.Suidhelper.Install do
+ @shortdoc "Build, stamp, and install the setuid helper"
+ @moduledoc """
+ Builds, stamps, and installs the Rust setuid helper.
+
+ mix suidhelper.install
+
+ Two steps:
+
+ 1. `cargo xtask stamp` in `native/suidhelper` builds the release binary and
+ writes its BLAKE3 self-checksum into `.note.sum` (the same step the
+ `:suidhelper_stamp` compiler runs).
+ 2. The stamped binary is copied setuid-root (mode `4755`) to
+ `/usr/local/bin/hyper-suidhelper`.
+
+ The copy needs root, but Mix runs every subprocess in its own session with no
+ controlling terminal (`erl_child_setup` calls `setsid`), so a nested `sudo`
+ cannot open `/dev/tty` to prompt for a password. This task therefore only runs
+ `sudo` itself when it is already non-interactive (`sudo -n` succeeds, e.g.
+ `NOPASSWD` or a usable cached credential). Otherwise it prints the exact
+ privileged command for you to run in your own terminal.
+
+ This is the privileged counterpart to `mix suidhelper.stamp`, which stamps
+ only. `cargo` and the helper's toolchain (see
+ `native/suidhelper/rust-toolchain.toml`) must be installed.
+ """
+
+ use Mix.Task
+
+ @helper_dir "native/suidhelper"
+ @source Path.join(@helper_dir, "target/release/hyper-suidhelper")
+ # Must match `Hyper.Config`'s default `suid_helper` path and the xtask's
+ # `INSTALL_PATH`: a `PATH` location the unprivileged node can exec.
+ @install_path "/usr/local/bin/hyper-suidhelper"
+
+ @impl Mix.Task
+ def run(argv) do
+ stamp!(argv)
+ install_privileged()
+ end
+
+ defp stamp!(argv) do
+ case System.cmd("cargo", ["xtask", "stamp" | argv],
+ cd: @helper_dir,
+ into: IO.stream(:stdio, :line)
+ ) do
+ {_, 0} ->
+ :ok
+
+ {_, _} ->
+ Mix.raise("""
+ `cargo xtask stamp` failed building the suidhelper.
+
+ Ensure `cargo` and the helper's toolchain (see #{@helper_dir}/rust-toolchain.toml)
+ are installed.
+ """)
+ end
+ end
+
+ defp install_privileged do
+ if passwordless_sudo?() do
+ Mix.shell().info("Installing #{@source} -> #{@install_path} (setuid root)")
+
+ case System.cmd("sudo", install_argv(), into: IO.stream(:stdio, :line)) do
+ {_, 0} -> Mix.shell().info("installed #{@install_path} (setuid root)")
+ {_, _} -> Mix.raise(manual_instructions())
+ end
+ else
+ Mix.shell().info(manual_instructions())
+ end
+ end
+
+ # `sudo -n true` exits 0 only when sudo can run without prompting. With no
+ # controlling terminal a cached `tty_tickets` credential is invisible, so this
+ # is true essentially only under `NOPASSWD` -- exactly the case where the
+ # nested `sudo install` below can succeed.
+ defp passwordless_sudo? do
+ match?({_, 0}, System.cmd("sudo", ["-n", "true"], stderr_to_stdout: true))
+ end
+
+ defp install_argv,
+ do: ["install", "-o", "root", "-g", "root", "-m", "4755", @source, @install_path]
+
+ defp manual_instructions do
+ """
+
+ The binary is built and stamped, but installing it setuid-root needs a
+ password and `sudo` has no terminal to prompt on here. Run the copy yourself:
+
+ sudo #{Enum.join(install_argv(), " ")}
+ """
+ end
+end
diff --git a/mix.exs b/mix.exs
index 7ccae729..fe3f48f4 100644
--- a/mix.exs
+++ b/mix.exs
@@ -24,6 +24,10 @@ defmodule Hyper.MixProject do
# Cache the PLTs in a stable, gitignored dir so CI can cache them.
plt_local_path: "priv/plts",
plt_core_path: "priv/plts",
+ # `:mix` is needed so the Mix tasks under `lib/mix/tasks` (which call
+ # `Mix.raise/1`, `Mix.shell/0`, and implement the `Mix.Task` behaviour)
+ # resolve instead of tripping `unknown_function`.
+ plt_add_apps: [:mix],
# Verify @specs against actual returns, and flag ignored return values.
flags: [:unmatched_returns, :extra_return, :missing_return]
]
diff --git a/native/suidhelper/Cargo.toml b/native/suidhelper/Cargo.toml
index f5b4b1b7..be6a449c 100644
--- a/native/suidhelper/Cargo.toml
+++ b/native/suidhelper/Cargo.toml
@@ -48,6 +48,10 @@ path = "tests/util/safe_bin.rs"
name = "tools_dmsetup_parsers"
path = "tests/tools/dmsetup_parsers.rs"
+[[test]]
+name = "tools_jailer"
+path = "tests/tools/jailer.rs"
+
[[test]]
name = "util_confinement"
path = "tests/util/confinement.rs"
@@ -56,6 +60,10 @@ path = "tests/util/confinement.rs"
name = "tools_chroot_jail_remove"
path = "tests/tools/chroot_jail_remove.rs"
+[[test]]
+name = "tools_chroot_jail_grant_api"
+path = "tests/tools/chroot_jail_grant_api.rs"
+
[[test]]
name = "util_chroot_jail"
path = "tests/util/chroot_jail.rs"
@@ -68,14 +76,22 @@ path = "tests/e2e/config.rs"
name = "e2e_argv"
path = "tests/e2e/argv.rs"
+[[test]]
+name = "e2e_jailer"
+path = "tests/e2e/jailer.rs"
+
[[test]]
name = "e2e_chroot_jail"
path = "tests/e2e/chroot_jail.rs"
+[[test]]
+name = "config_uid_gid_range"
+path = "tests/config_uid_gid_range.rs"
+
[dependencies]
clap = { version = "4", features = ["derive"] }
hyper-suidhelper-meta = { path = "meta" }
-nix = { version = "0.29", features = ["user", "fs", "dir"] }
+nix = { version = "0.29", features = ["user", "fs", "dir", "process"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
thiserror = "1"
diff --git a/native/suidhelper/src/config.rs b/native/suidhelper/src/config.rs
index 425ba386..41b00a75 100644
--- a/native/suidhelper/src/config.rs
+++ b/native/suidhelper/src/config.rs
@@ -1,6 +1,14 @@
// SPDX-License-Identifier: AGPL-3.0-only
//! Runtime host configuration, read from a single root-owned TOML file.
+//!
+//! ## UID/GID range divergence
+//!
+//! Elixir keeps a default `{900_000, 999_999}` that governs which UIDs the node
+//! hands *out*; this helper reads `[jails] uid_gid_range` from config.toml to
+//! decide which UIDs it *accepts* (default `{900_000, 999_999}` when the key is
+//! absent). Operators narrowing the range must set **both**.
+use crate::util::safe_bin::{self, SafeBin};
use crate::util::safe_file::{self, IsRegularFile, OnlyRootWritable, RootOwner, SafeFile};
use crate::util::safe_path::{self, IsAbsolute, SafePath, StrictComponents};
use nix::fcntl::OFlag;
@@ -21,11 +29,56 @@ pub enum LoadingError {
Malformed(PathBuf),
#[error("work_dir in {0:?} must be an absolute path")]
Relative(PathBuf),
+ #[error("uid_gid_range.min must be >= 1 and <= max (got min={min}, max={max})")]
+ BadUidGidRange { min: u32, max: u32 },
+}
+
+/// Error returned by config accessors for tool binaries derived from config.
+#[derive(Debug, Error)]
+pub enum BinError {
+ #[error("required binary `{0}` is not configured in /etc/hyper/config.toml")]
+ Unconfigured(&'static str),
+ #[error(transparent)]
+ Bin(#[from] safe_bin::Error),
}
const CONFIG_PATHSTR: &str = "/etc/hyper/config.toml";
const INSECURE_CONFIG_PATH_ENV: &str = "HYPER_SETUIDHELPER_CONFIG_PATH";
+/// UID/GID allocation band, read from `[jails] uid_gid_range` in config.toml as
+/// a two-element `[min, max]` array. Controls which UIDs the helper *accepts*
+/// from the BEAM — see module docs.
+#[derive(Debug, Clone, Copy, Deserialize)]
+#[serde(from = "[u32; 2]")]
+pub struct UidGidRange {
+ pub min: u32,
+ pub max: u32,
+}
+
+impl From<[u32; 2]> for UidGidRange {
+ fn from([min, max]: [u32; 2]) -> Self {
+ Self { min, max }
+ }
+}
+
+// Band defaults match Elixir's `compile_env` allocation defaults so that an
+// unconfigured helper and an unconfigured node agree out of the box.
+const DEFAULT_UID_GID: (u32, u32) = (900_000, 999_999);
+
+/// Validate a uid_gid_range value. A present range where min==0 or min>max is
+/// treated as a config trust violation — fatal at load, consistent with the
+/// "present but untrusted" model. Exposed so tests can verify the contract
+/// without touching the file system.
+pub fn validate_uid_gid_range(r: &UidGidRange) -> Result<(), LoadingError> {
+ if r.min == 0 || r.min > r.max {
+ return Err(LoadingError::BadUidGidRange {
+ min: r.min,
+ max: r.max,
+ });
+ }
+ Ok(())
+}
+
/// The config file path. In production this is the fixed `/etc/hyper/config.toml`.
/// Only in INSECURE TEST MODE (both gates open) may an env var redirect it — the
/// secure arm is always the hardcoded path, so a release build cannot be steered.
@@ -44,13 +97,103 @@ fn config_path() -> PathBuf {
#[derive(Debug, Clone, Deserialize)]
pub struct Config {
work_dir: PathBuf,
+ #[serde(default)]
+ tools: Tools,
+ #[serde(default)]
+ jails: Jails,
+}
+
+/// The `[jails]` table: how the helper places and confines each VM jail.
+///
+/// `cgroup` is the parent cgroup the jailer nests every VM beneath (default
+/// `"hyper"`). `uid_gid_range` is the `[min, max]` band of UIDs/GIDs the helper
+/// accepts from the BEAM; absent means the built-in default. A missing `[jails]`
+/// table, or any missing key within it, falls back to these defaults.
+#[derive(Debug, Clone, Deserialize)]
+#[serde(default)]
+pub struct Jails {
+ cgroup: String,
+ uid_gid_range: Option,
+}
+
+impl Default for Jails {
+ fn default() -> Self {
+ Self {
+ cgroup: default_parent_cgroup(),
+ uid_gid_range: None,
+ }
+ }
+}
+
+/// Paths to the external binaries the helper runs, the `[tools]` table.
+///
+/// The device tools (`dmsetup`, `losetup`, `blockdev`) carry built-in defaults;
+/// `firecracker` and `jailer` have none and must be configured before any VM can
+/// launch — their absence surfaces as [`BinError::Unconfigured`] at use time, not
+/// at load. Every path is validated as a root-owned, correctly-named [`SafeBin`]
+/// when accessed, never at parse time (the file is read unprivileged). A missing
+/// `[tools]` table, or any missing key within it, falls back to these defaults.
+#[derive(Debug, Clone, Deserialize)]
+#[serde(default)]
+pub struct Tools {
+ dmsetup: PathBuf,
+ losetup: PathBuf,
+ blockdev: PathBuf,
+ firecracker: Option,
+ jailer: Option,
+}
+
+impl Default for Tools {
+ fn default() -> Self {
+ Self {
+ dmsetup: default_dmsetup(),
+ losetup: default_losetup(),
+ blockdev: default_blockdev(),
+ firecracker: None,
+ jailer: None,
+ }
+ }
+}
+
+// The default data root. Must match the Elixir node's `@dev_work_dir`, which it
+// uses when the same config file is absent, so both sides agree (see
+// `Hyper.Node.check_helper_base`).
+fn default_work_dir() -> PathBuf {
+ PathBuf::from("/srv/hyper")
+}
+
+fn default_dmsetup() -> PathBuf {
+ PathBuf::from("/usr/sbin/dmsetup")
+}
+
+fn default_losetup() -> PathBuf {
+ PathBuf::from("/usr/sbin/losetup")
+}
+
+fn default_blockdev() -> PathBuf {
+ PathBuf::from("/usr/sbin/blockdev")
+}
+
+fn default_parent_cgroup() -> String {
+ // Must match Elixir node's `@parent_cgroup`; operators need to keep them in sync.
+ "hyper".into()
+}
+
+impl Default for Config {
+ fn default() -> Self {
+ Self {
+ work_dir: default_work_dir(),
+ tools: Tools::default(),
+ jails: Jails::default(),
+ }
+ }
}
impl Config {
/// The process-wide config, loaded once (and forced unprivileged by
- /// [`Config::init`]). A load failure is fatal: the helper cannot safely
- /// operate without a trusted data root, so it prints the error and exits
- /// rather than guessing a default.
+ /// [`Config::init`]). An absent file yields the built-in defaults; a
+ /// *present but untrusted* file (wrong owner/mode, malformed) is fatal —
+ /// the helper prints the error and exits rather than trusting it.
pub fn get() -> &'static Config {
LazyLock::force(&CONFIG)
}
@@ -58,7 +201,7 @@ impl Config {
/// Force the config to load now. Call this once at the very start of `main`,
/// after privileges have already been dropped (the `.preinit_array` entry in
/// `setuid_privileged` runs before `main`), so the file is never first read
- /// lazily from inside a `Privileged` scope - i.e. it is guaranteed to be read
+ /// lazily from inside a `Privileged` scope — i.e. it is guaranteed to be read
/// as the real uid, not as root.
pub fn init() {
let _ = Self::get();
@@ -74,6 +217,60 @@ impl Config {
self.work_dir.join("jails")
}
+ /// The validated `dmsetup` binary the helper will run.
+ pub fn dmsetup(&self) -> Result, safe_bin::Error> {
+ SafeBin::from_path(&self.tools.dmsetup)
+ }
+
+ /// The validated `losetup` binary the helper will run.
+ pub fn losetup(&self) -> Result, safe_bin::Error> {
+ SafeBin::from_path(&self.tools.losetup)
+ }
+
+ /// The validated `blockdev` binary the helper will run.
+ pub fn blockdev(&self) -> Result, safe_bin::Error> {
+ SafeBin::from_path(&self.tools.blockdev)
+ }
+
+ /// The Firecracker VMM binary, validated as root-owned and correctly named.
+ /// Errors [`BinError::Unconfigured`] when absent from config — an operator
+ /// must set `[tools] firecracker` before any VM can be launched.
+ pub fn firecracker(&self) -> Result, BinError> {
+ self.tools
+ .firecracker
+ .as_deref()
+ .ok_or(BinError::Unconfigured("firecracker"))
+ .and_then(|p| SafeBin::from_path(p).map_err(BinError::Bin))
+ }
+
+ /// The Firecracker jailer binary, validated as root-owned and correctly named.
+ /// Errors [`BinError::Unconfigured`] when absent from config — an operator
+ /// must set `[tools] jailer` before any VM can be launched.
+ pub fn jailer(&self) -> Result, BinError> {
+ self.tools
+ .jailer
+ .as_deref()
+ .ok_or(BinError::Unconfigured("jailer"))
+ .and_then(|p| SafeBin::from_path(p).map_err(BinError::Bin))
+ }
+
+ /// The jailer `--parent-cgroup` value, from `[jails] cgroup`. Defaults to
+ /// `"hyper"`, matching the Elixir node's default.
+ pub fn parent_cgroup(&self) -> &str {
+ &self.jails.cgroup
+ }
+
+ /// The UID/GID band the helper accepts from the BEAM. Defaults to
+ /// `(900_000, 999_999)` when the key is absent (matching Elixir's defaults).
+ /// A present range with min==0 or min>max is rejected at load time by
+ /// [`Config::safe_load`], so this accessor is always total.
+ pub fn uid_gid_range(&self) -> (u32, u32) {
+ self.jails
+ .uid_gid_range
+ .map(|r| (r.min, r.max))
+ .unwrap_or(DEFAULT_UID_GID)
+ }
+
/// Read, ownership-check, parse, and validate the config file. See the module
/// docs for the trust model.
pub fn safe_load() -> Result {
@@ -82,7 +279,18 @@ impl Config {
let safe_path: SafePath = path.clone().try_into()?;
let file: SafeFile =
- SafeFile::open(&safe_path, OFlag::O_RDONLY)?;
+ match SafeFile::open(&safe_path, OFlag::O_RDONLY) {
+ Ok(file) => file,
+ // A genuinely-absent file means "use the built-in defaults": those
+ // are compiled into this root-owned binary, so they are trusted. Any
+ // OTHER failure — a present but wrong-owner/mode file, an I/O error —
+ // stays fatal, because it is a signal (someone put an untrusted file
+ // there), not an absence.
+ Err(safe_file::ValidationError::Open(nix::errno::Errno::ENOENT)) => {
+ return Ok(Self::default())
+ }
+ Err(e) => return Err(e.into()),
+ };
let body = std::io::read_to_string(std::fs::File::from(file.into_owned_fd()))
.map_err(|_| LoadingError::Unreadable(path.clone()))?;
@@ -92,6 +300,11 @@ impl Config {
if !config.work_dir.is_absolute() {
return Err(LoadingError::Relative(path));
}
+
+ if let Some(r) = &config.jails.uid_gid_range {
+ validate_uid_gid_range(r)?;
+ }
+
Ok(config)
}
}
diff --git a/native/suidhelper/src/main.rs b/native/suidhelper/src/main.rs
index 617fd6d5..11b08319 100644
--- a/native/suidhelper/src/main.rs
+++ b/native/suidhelper/src/main.rs
@@ -15,6 +15,7 @@
use clap::{Parser, Subcommand};
use hyper_suidhelper::config;
+use hyper_suidhelper::tools::jailer::{self, JailerArgs};
use hyper_suidhelper::tools::Tool;
use hyper_suidhelper::util::setuid_privileged::{self, Privileged};
use serde::Serialize;
@@ -37,6 +38,9 @@ enum Command {
Tool(Tool),
/// Check the helper is correctly installed (can promote to root).
SysTest,
+ /// Validate the caller's args, become root, and `execve` the firecracker
+ /// jailer in our place. Prints nothing on success (the image is replaced).
+ Jailer(JailerArgs),
/// Print the build version and BLAKE3 checksum of this binary.
Version,
}
@@ -101,12 +105,21 @@ fn main() {
config::Config::init();
// Each command yields a serializable value (errors stringified to unify); we
- // render the final JSON line here.
+ // render the final JSON line here. The jailer is the exception: it `execve`s
+ // in place, so on success it never returns and never emits JSON, and on
+ // failure it reports to stderr and exits before reaching the JSON pipeline.
let output = match command {
Command::Tool(tool) => tool.run().map(Output::Tool).map_err(|e| e.to_string()),
Command::SysTest => SysTest::perform()
.map(Output::SysTest)
.map_err(|e| e.to_string()),
+ Command::Jailer(args) => {
+ let e = jailer::run(args).expect_err("jailer::run only returns on error");
+ eprintln!("{e}");
+ // _exit bypasses atexit handlers; safe because we are permanently root
+ // at this point and must not run any cleanup registered before escalation.
+ unsafe { nix::libc::_exit(2) }
+ }
Command::Version => unreachable!("handled above"),
};
diff --git a/native/suidhelper/src/tools/chroot_jail/grant_api.rs b/native/suidhelper/src/tools/chroot_jail/grant_api.rs
new file mode 100644
index 00000000..4ec94a21
--- /dev/null
+++ b/native/suidhelper/src/tools/chroot_jail/grant_api.rs
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+//! `chroot-jail grant-api`: hand the firecracker API socket to the node user so
+//! the unprivileged controller can `connect()` to it.
+//!
+//! The jailer drops firecracker to a per-VM uid/gid and chroots it; firecracker
+//! then creates its API socket at `/root/api.socket` owned by that per-VM
+//! id. Connecting a unix socket needs *write* permission on the node, so the
+//! node user (a different uid) gets `EACCES`. This op chowns just that one
+//! socket to the helper's CALLER — `getuid()`/`getgid()`, which inside the
+//! privileged scope are the real (caller) ids while euid is 0 — and chmods it
+//! `0660`. The node thus connects as owner, and humans added to the node's
+//! group connect via the group bit.
+//!
+//! That alone is not enough: the jailer leaves `/root` as `0700` owned by
+//! the per-VM uid, and connecting needs *search* (`+x`) on every ancestor, so
+//! the node cannot even traverse into `root` to reach the (now its own) socket.
+//! So this op also opens just that one directory to the caller's group: it keeps
+//! the per-VM uid as owner (firecracker still needs it), chgrps `root` to the
+//! caller's gid, and chmods it `0710` — owner `rwx`, group `--x` (traverse, not
+//! list), other none. Per-VM isolation is otherwise untouched: only this socket
+//! and its immediate parent's group/mode move, nothing else in the jail, and
+//! unrelated users stay locked out.
+//!
+//! Security: the socket path is validated as a `SafePath` and reached by an
+//! `O_NOFOLLOW` walk from `JAIL_BASE`, so a symlinked component cannot redirect
+//! the chown outside the jail, and every op is fd-relative on the pinned `root`
+//! dir fd, never by re-resolved name. The leaf must be exactly `api.socket`
+//! `//root` below the base, and `fstatat(AT_SYMLINK_NOFOLLOW)` must
+//! report a *socket* — a regular file or symlink planted at that name is an
+//! attack and is refused, never chmod'd. A missing socket (`ENOENT`, anywhere on
+//! the path) is `Pending`, not an error: firecracker has not created it yet, so
+//! the controller keeps probing.
+
+use crate::config::Config;
+use crate::tools::IsTool;
+use crate::util::safe_dir::{self, SafeDir};
+use crate::util::safe_path::{self, IsAbsolute, SafePath, StrictComponents};
+use clap::Args;
+use nix::errno::Errno;
+use nix::sys::stat::SFlag;
+use nix::unistd::{getgid, getuid};
+use serde::Serialize;
+use std::path::{Path, PathBuf};
+use thiserror::Error as ThisError;
+
+/// The fixed in-jail socket name firecracker opens (mirrors the Elixir
+/// `Hyper.Node.FireVMM.Jailer` `@jail_socket`).
+const SOCKET_NAME: &str = "api.socket";
+
+/// The socket sits at `///root/api.socket`: three parent
+/// components (``, ``, `root`) before the leaf.
+const SOCKET_PARENT_DEPTH: usize = 3;
+
+/// Mode handed to the node: owner+group read/write, no world access.
+const SOCKET_MODE: u32 = 0o660;
+
+/// Mode set on the jail `root` dir so the node's group can *traverse* it to
+/// reach the socket: owner `rwx` (the per-VM uid, unchanged), group `--x`
+/// (traverse, not list), other none.
+const JAIL_ROOT_MODE: u32 = 0o710;
+
+type LexicalPath = SafePath;
+
+#[derive(Debug, ThisError)]
+pub enum Error {
+ #[error("--socket path: {0}")]
+ SocketPath(#[source] safe_path::ValidationError),
+ #[error("--socket must be exactly //root/api.socket below JAIL_BASE: {0:?}")]
+ SocketShape(PathBuf),
+ #[error("--socket leaf must be {SOCKET_NAME:?}: {0:?}")]
+ SocketName(PathBuf),
+ #[error("walking to the jail root: {0}")]
+ Walk(#[source] safe_dir::Error),
+ #[error("api.socket is not a socket (or is a symlink); refusing to touch it")]
+ NotASocket,
+ #[error("statting the socket: {0}")]
+ Stat(#[source] safe_dir::Error),
+ #[error("chowning the socket to the caller: {0}")]
+ Chown(#[source] safe_dir::Error),
+ #[error("chmoding the socket: {0}")]
+ Chmod(#[source] safe_dir::Error),
+ #[error("chgrp-ing the jail root dir to the caller: {0}")]
+ ChgrpRoot(#[source] safe_dir::Error),
+ #[error("chmoding the jail root dir for traversal: {0}")]
+ ChmodRoot(#[source] safe_dir::Error),
+}
+
+#[derive(Args)]
+pub struct GrantApiArgs {
+ /// Host path of the firecracker API socket, shape
+ /// ///root/api.socket.
+ #[arg(long)]
+ socket: PathBuf,
+}
+
+#[derive(Debug, Serialize)]
+#[serde(tag = "result", rename_all = "snake_case")]
+pub enum GrantOut {
+ /// The socket was handed to the caller (chowned + chmoded).
+ Granted,
+ /// The socket does not exist yet; the caller should keep waiting.
+ Pending,
+}
+
+/// Run the `grant-api` op in its own privileged scope (returns its serialized `Value`).
+pub fn run(args: GrantApiArgs) -> Result {
+ GrantApi { args }.run()
+}
+
+struct GrantApi {
+ args: GrantApiArgs,
+}
+
+impl IsTool for GrantApi {
+ type Args = GrantApiArgs;
+ type Output = GrantOut;
+ type RunT = Result;
+
+ fn run_privileged(&self) -> Self::RunT {
+ grant_api_under(&Config::get().jail_base(), &self.args.socket)
+ }
+
+ fn parse(&self, res: Self::RunT) -> Result> {
+ Ok(res?)
+ }
+}
+
+/// Hand `socket` (`///root/api.socket`) to the helper's
+/// caller, fd-relative after an `O_NOFOLLOW` walk from `jail_base`. Returns
+/// `Pending` if any path component or the socket itself is not yet present.
+pub fn grant_api_under(jail_base: &Path, socket: &Path) -> Result {
+ let path: LexicalPath = socket.to_path_buf().try_into().map_err(Error::SocketPath)?;
+ let (parents, leaf) = path.relative_to(jail_base).map_err(Error::SocketPath)?;
+ if parents.len() != SOCKET_PARENT_DEPTH {
+ return Err(Error::SocketShape(socket.to_path_buf()));
+ }
+ if leaf != Path::new(SOCKET_NAME) {
+ return Err(Error::SocketName(socket.to_path_buf()));
+ }
+
+ let Some(root) = walk(jail_base.to_path_buf(), &parents)? else {
+ return Ok(GrantOut::Pending); // jail not fully created yet
+ };
+
+ let leaf = Path::new(SOCKET_NAME);
+ let stat = match root.stat(leaf) {
+ Ok(stat) => stat,
+ Err(e) if e.errno() == Some(Errno::ENOENT) => return Ok(GrantOut::Pending),
+ Err(e) => return Err(Error::Stat(e)),
+ };
+ // `stat` used AT_SYMLINK_NOFOLLOW, so a symlink reports as S_IFLNK and fails
+ // this check too: only a real socket is accepted, anything else is refused.
+ if stat.st_mode & SFlag::S_IFMT.bits() != SFlag::S_IFSOCK.bits() {
+ return Err(Error::NotASocket);
+ }
+
+ root.chown(leaf, getuid().as_raw(), getgid().as_raw())
+ .map_err(Error::Chown)?;
+ root.chmod(leaf, SOCKET_MODE).map_err(Error::Chmod)?;
+
+ // Open `root` itself to the caller's group so the node can traverse into it
+ // to reach the socket (the jailer leaves it 0700 / per-VM uid). Owner stays
+ // the per-VM uid; only the group and mode move. Operate on the pinned `root`
+ // fd (already opened `O_NOFOLLOW`), never by name - TOCTOU-safe.
+ root.chgrp_self(getgid().as_raw())
+ .map_err(Error::ChgrpRoot)?;
+ root.chmod_self(JAIL_ROOT_MODE).map_err(Error::ChmodRoot)?;
+ Ok(GrantOut::Granted)
+}
+
+/// Open `base` and walk `parents` from it (`O_NOFOLLOW` each step). Returns
+/// `Ok(None)` if `base` or any parent is not yet present (`ENOENT`), so the
+/// caller can treat a half-built jail as `Pending` rather than an error.
+fn walk(base: PathBuf, parents: &[PathBuf]) -> Result