From 612a74ab2ce2b3c9221b67e480537b83877c8615 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Mon, 11 May 2026 14:08:29 +0200 Subject: [PATCH 01/13] virtio/fs: move init binary blob into its own crate Move the init binary build script and include_bytes!() from the devices crate into a new init-blob crate. The passthrough modules reference the binary as init_blob::INIT_BINARY instead of using include_bytes! directly. build.rs based on code from https://github.com/containers/libkrun/pull/593. Co-authored-by: Geoffrey Goodman Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- Cargo.lock | 5 ++++ Cargo.toml | 1 + src/devices/Cargo.toml | 3 ++- .../src/virtio/fs/linux/passthrough.rs | 2 +- .../src/virtio/fs/macos/passthrough.rs | 2 +- src/init-blob/Cargo.toml | 11 ++++++++ src/{devices => init-blob}/build.rs | 25 +++++++++++++------ src/init-blob/src/lib.rs | 1 + 8 files changed, 40 insertions(+), 10 deletions(-) create mode 100644 src/init-blob/Cargo.toml rename src/{devices => init-blob}/build.rs (75%) create mode 100644 src/init-blob/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index c066dd4eb..f97077330 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -563,6 +563,10 @@ dependencies = [ "serde_core", ] +[[package]] +name = "init-blob" +version = "0.1.0" + [[package]] name = "iocuddle" version = "0.1.1" @@ -699,6 +703,7 @@ dependencies = [ "caps", "crossbeam-channel", "imago", + "init-blob", "krun-arch", "krun-display", "krun-hvf", diff --git a/Cargo.toml b/Cargo.toml index 83db53c57..94960ad2c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] members = [ "src/libkrun", + "src/init-blob", "src/input", "src/display", "src/utils", diff --git a/src/devices/Cargo.toml b/src/devices/Cargo.toml index ab6ecfe2a..89e11cb45 100644 --- a/src/devices/Cargo.toml +++ b/src/devices/Cargo.toml @@ -3,7 +3,7 @@ name = "krun-devices" version = "0.1.0-1.18.0" authors = ["The libkrun Authors"] edition = "2021" -build = "build.rs" + description = 
"Virtual device emulation for libkrun" license = "Apache-2.0" repository = "https://github.com/containers/libkrun" @@ -39,6 +39,7 @@ vm-memory = { version = "0.17", features = ["backend-mmap"] } zerocopy = { version = "0.8.26", optional = true, features = ["derive"] } krun_display = { package = "krun-display", version = "0.1.0", path = "../display", optional = true, features = ["bindgen_clang_runtime"] } krun_input = { package = "krun-input", version = "0.1.0", path = "../input", features = ["bindgen_clang_runtime"], optional = true } +init-blob = { path = "../init-blob" } arch = { package = "krun-arch", version = "=0.1.0-1.18.0", path = "../arch" } utils = { package = "krun-utils", version = "=0.1.0-1.18.0", path = "../utils" } diff --git a/src/devices/src/virtio/fs/linux/passthrough.rs b/src/devices/src/virtio/fs/linux/passthrough.rs index e5ca21a03..a0c1d6020 100644 --- a/src/devices/src/virtio/fs/linux/passthrough.rs +++ b/src/devices/src/virtio/fs/linux/passthrough.rs @@ -33,7 +33,7 @@ const EMPTY_CSTR: &[u8] = b"\0"; const PROC_CSTR: &[u8] = b"/proc/self/fd\0"; const INIT_CSTR: &[u8] = b"init.krun\0"; -static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); +static INIT_BINARY: &[u8] = init_blob::INIT_BINARY; type Inode = u64; type Handle = u64; diff --git a/src/devices/src/virtio/fs/macos/passthrough.rs b/src/devices/src/virtio/fs/macos/passthrough.rs index 53680bd92..419cd645b 100644 --- a/src/devices/src/virtio/fs/macos/passthrough.rs +++ b/src/devices/src/virtio/fs/macos/passthrough.rs @@ -37,7 +37,7 @@ const SECURITY_CAPABILITY: &[u8] = b"security.capability\0"; const UID_MAX: u32 = u32::MAX - 1; -static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); +static INIT_BINARY: &[u8] = init_blob::INIT_BINARY; type Inode = u64; type Handle = u64; diff --git a/src/init-blob/Cargo.toml b/src/init-blob/Cargo.toml new file mode 100644 index 000000000..c984f1ea6 --- /dev/null +++ b/src/init-blob/Cargo.toml @@ -0,0 +1,11 @@ 
+[package] +name = "init-blob" +version = "0.1.0" +edition = "2021" +description = "Default init binary blob for libkrun guests" +license = "Apache-2.0" +repository = "https://github.com/containers/libkrun" +build = "build.rs" + +[lib] +path = "src/lib.rs" diff --git a/src/devices/build.rs b/src/init-blob/build.rs similarity index 75% rename from src/devices/build.rs rename to src/init-blob/build.rs index 49a4346d2..0482edf3d 100644 --- a/src/devices/build.rs +++ b/src/init-blob/build.rs @@ -46,20 +46,31 @@ fn build_default_init() -> PathBuf { .unwrap_or_else(|e| panic!("failed to execute {cc}: {e}")); if !status.success() { - panic!("failed to compile init/init.c: {status}"); + panic!("failed to compile init/init.c with {cc}: {status}"); } + init_bin } fn main() { + let manifest_dir = PathBuf::from(std::env::var_os("CARGO_MANIFEST_DIR").unwrap()); + let repo_init_bin = manifest_dir.join("../..").join("init/init"); + println!("cargo:rerun-if-changed={}", repo_init_bin.display()); + let init_binary_path = std::env::var_os("KRUN_INIT_BINARY_PATH") .map(PathBuf::from) - .unwrap_or_else(|| { - let init_path = build_default_init(); - // SAFETY: The build script is single threaded. - unsafe { std::env::set_var("KRUN_INIT_BINARY_PATH", &init_path) }; - init_path - }); + .or_else(|| { + if repo_init_bin.exists() { + Some(repo_init_bin) + } else { + None + } + }) + .unwrap_or_else(build_default_init); + + // SAFETY: The build script is single threaded. 
+ unsafe { std::env::set_var("KRUN_INIT_BINARY_PATH", &init_binary_path) }; + println!( "cargo:rustc-env=KRUN_INIT_BINARY_PATH={}", init_binary_path.display() diff --git a/src/init-blob/src/lib.rs b/src/init-blob/src/lib.rs new file mode 100644 index 000000000..4397da679 --- /dev/null +++ b/src/init-blob/src/lib.rs @@ -0,0 +1 @@ +pub static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); From cf3145dd2648566905bdf932c04ca41a8bd30f7c Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Mon, 4 May 2026 16:15:52 +0200 Subject: [PATCH 02/13] virtio/fs: introduce InodeAllocator for shared inode numbering Replace the private next_inode AtomicU64 inside PassthroughFs with a shared InodeAllocator that is passed in at construction. This lets multiple layers (e.g. a future virtual-inode overlay) allocate from the same counter without implicit coordination via reserved ranges. PassthroughFs::new() and PassthroughFsRo::new() now take an Arc<InodeAllocator> parameter. FsWorker::new() creates the allocator and passes it through. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- src/devices/src/virtio/fs/inode_alloc.rs | 28 +++++++++++++++++++ .../src/virtio/fs/linux/passthrough.rs | 9 +++--- .../src/virtio/fs/macos/passthrough.rs | 9 +++--- src/devices/src/virtio/fs/mod.rs | 1 + src/devices/src/virtio/fs/read_only.rs | 5 ++-- src/devices/src/virtio/fs/worker.rs | 12 ++++++-- 6 files changed, 52 insertions(+), 12 deletions(-) create mode 100644 src/devices/src/virtio/fs/inode_alloc.rs diff --git a/src/devices/src/virtio/fs/inode_alloc.rs b/src/devices/src/virtio/fs/inode_alloc.rs new file mode 100644 index 000000000..1919b1406 --- /dev/null +++ b/src/devices/src/virtio/fs/inode_alloc.rs @@ -0,0 +1,28 @@ +use std::sync::atomic::{AtomicU64, Ordering}; + +use super::fuse; + +/// Allocates unique FUSE inode numbers. 
+/// +/// FUSE inode numbers are opaque identifiers with two reserved values: +/// - `0` — invalid / negative-entry cache sentinel (never allocated) +/// - `1` (`ROOT_ID`) — the root directory of the filesystem +/// +/// Allocation starts at `ROOT_ID + 2`; `ROOT_ID + 1` is deliberately never returned. +/// The allocator is `Send + Sync` and safe to share across threads. +pub struct InodeAllocator { + next: AtomicU64, +} + +impl InodeAllocator { + pub fn new() -> Self { + Self { + next: AtomicU64::new(fuse::ROOT_ID + 2), + } + } + + /// Allocate the next inode number. Each call returns a unique value. + pub fn next(&self) -> u64 { + self.next.fetch_add(1, Ordering::Relaxed) + } +} diff --git a/src/devices/src/virtio/fs/linux/passthrough.rs b/src/devices/src/virtio/fs/linux/passthrough.rs index a0c1d6020..abda1ce53 100644 --- a/src/devices/src/virtio/fs/linux/passthrough.rs +++ b/src/devices/src/virtio/fs/linux/passthrough.rs @@ -25,6 +25,7 @@ use super::super::filesystem::{ ListxattrReply, OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::super::fuse; +use super::super::inode_alloc::InodeAllocator; use super::super::multikey::MultikeyBTreeMap; const CURRENT_DIR_CSTR: &[u8] = b".\0"; @@ -358,7 +359,7 @@ pub struct PassthroughFs { // documentation of the `O_PATH` flag in `open(2)` for more details on what one can and cannot // do with an fd opened with this flag. inodes: RwLock>>, - next_inode: AtomicU64, + inode_alloc: Arc, init_inode: u64, // File descriptors for open files and directories. 
Unlike the fds in `inodes`, these _can_ be @@ -392,7 +393,7 @@ enum FileOrLink { } impl PassthroughFs { - pub fn new(cfg: Config) -> io::Result { + pub fn new(cfg: Config, inode_alloc: Arc) -> io::Result { let fd = if let Some(fd) = cfg.proc_sfd_rawfd { fd } else { @@ -438,7 +439,7 @@ impl PassthroughFs { Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), - next_inode: AtomicU64::new(fuse::ROOT_ID + 2), + inode_alloc, init_inode: fuse::ROOT_ID + 1, handles: RwLock::new(BTreeMap::new()), @@ -579,7 +580,7 @@ impl PassthroughFs { // There is a possible race here where 2 threads end up adding the same file // into the inode list. However, since each of those will get a unique Inode // value and unique file descriptors this shouldn't be that much of a problem. - let inode = self.next_inode.fetch_add(1, Ordering::Relaxed); + let inode = self.inode_alloc.next(); self.inodes.write().unwrap().insert( inode, InodeAltKey { diff --git a/src/devices/src/virtio/fs/macos/passthrough.rs b/src/devices/src/virtio/fs/macos/passthrough.rs index 419cd645b..3d27aec7f 100644 --- a/src/devices/src/virtio/fs/macos/passthrough.rs +++ b/src/devices/src/virtio/fs/macos/passthrough.rs @@ -29,6 +29,7 @@ use super::super::filesystem::{ ListxattrReply, OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::super::fuse; +use super::super::inode_alloc::InodeAllocator; use super::super::multikey::MultikeyBTreeMap; const INIT_CSTR: &[u8] = b"init.krun\0"; @@ -543,7 +544,7 @@ impl Default for Config { /// combination of mount namespaces and the pivot_root system call. 
pub struct PassthroughFs { inodes: RwLock>>, - next_inode: AtomicU64, + inode_alloc: Arc, init_inode: u64, handles: RwLock>>, @@ -560,7 +561,7 @@ pub struct PassthroughFs { } impl PassthroughFs { - pub fn new(cfg: Config) -> io::Result { + pub fn new(cfg: Config, inode_alloc: Arc) -> io::Result { let root = CString::new(cfg.root_dir.as_str()).expect("CString::new failed"); // Safe because this doesn't modify any memory and we check the return value. @@ -579,7 +580,7 @@ impl PassthroughFs { Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), - next_inode: AtomicU64::new(fuse::ROOT_ID + 2), + inode_alloc, init_inode: fuse::ROOT_ID + 1, handles: RwLock::new(BTreeMap::new()), @@ -723,7 +724,7 @@ impl PassthroughFs { // There is a possible race here where 2 threads end up adding the same file // into the inode list. However, since each of those will get a unique Inode // value and unique file descriptors this shouldn't be that much of a problem. - let inode = self.next_inode.fetch_add(1, Ordering::Relaxed); + let inode = self.inode_alloc.next(); self.inodes.write().unwrap().insert( inode, InodeAltKey { diff --git a/src/devices/src/virtio/fs/mod.rs b/src/devices/src/virtio/fs/mod.rs index 7ce9d48c2..179535131 100644 --- a/src/devices/src/virtio/fs/mod.rs +++ b/src/devices/src/virtio/fs/mod.rs @@ -2,6 +2,7 @@ mod device; #[allow(dead_code)] mod filesystem; pub mod fuse; +mod inode_alloc; #[allow(dead_code)] mod multikey; mod read_only; diff --git a/src/devices/src/virtio/fs/read_only.rs b/src/devices/src/virtio/fs/read_only.rs index e975f2dda..eb8aebef3 100644 --- a/src/devices/src/virtio/fs/read_only.rs +++ b/src/devices/src/virtio/fs/read_only.rs @@ -25,6 +25,7 @@ use super::filesystem::{ OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::fuse; +use super::inode_alloc::InodeAllocator; use super::passthrough::{self, PassthroughFs}; use crate::virtio::bindings; @@ -60,9 +61,9 @@ pub struct PassthroughFsRo { } impl PassthroughFsRo { - pub 
fn new(cfg: passthrough::Config) -> io::Result { + pub fn new(cfg: passthrough::Config, inode_alloc: Arc) -> io::Result { Ok(Self { - inner: PassthroughFs::new(cfg)?, + inner: PassthroughFs::new(cfg, inode_alloc)?, }) } } diff --git a/src/devices/src/virtio/fs/worker.rs b/src/devices/src/virtio/fs/worker.rs index c612b3e9b..e554aa377 100644 --- a/src/devices/src/virtio/fs/worker.rs +++ b/src/devices/src/virtio/fs/worker.rs @@ -16,6 +16,7 @@ use vm_memory::GuestMemoryMmap; use super::super::{FsError, Queue}; use super::defs::{HPQ_INDEX, REQ_INDEX}; use super::descriptor_utils::{Reader, Writer}; +use super::inode_alloc::InodeAllocator; use super::passthrough::{self, PassthroughFs}; use super::read_only::PassthroughFsRo; use super::server::Server; @@ -83,10 +84,17 @@ impl FsWorker { exit_code: Arc, #[cfg(target_os = "macos")] map_sender: Option>, ) -> Result { + let inode_alloc = Arc::new(InodeAllocator::new()); let server = if read_only { - FsServer::ReadOnly(Server::new(PassthroughFsRo::new(passthrough_cfg)?)) + FsServer::ReadOnly(Server::new(PassthroughFsRo::new( + passthrough_cfg, + inode_alloc, + )?)) } else { - FsServer::ReadWrite(Server::new(PassthroughFs::new(passthrough_cfg)?)) + FsServer::ReadWrite(Server::new(PassthroughFs::new( + passthrough_cfg, + inode_alloc, + )?)) }; Ok(Self { queues, From 8c5c68efcca292a3665ec18a7c285ebc94506f39 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Mon, 4 May 2026 16:23:35 +0200 Subject: [PATCH 03/13] virtio/fs: introduce generic AugmentFs overlay for files like init.krun MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce AugmentFs, a generic overlay that wraps any FileSystem implementation and intercepts FUSE operations for virtual inodes — synthetic read-only files backed by static data. One-shot files can only be looked up once. The overlay uses the shared InodeAllocator to assign inode numbers, so virtual and passthrough inodes never collide. 
Remove all init.krun special-case code (init_inode, init_handle, INIT_CSTR, init_payload) from both the Linux and macOS passthrough implementations. The init.krun virtual file is now configured via VirtualEntry in the krun API layer and handled generically by the overlay. FsDeviceConfig carries a Vec<VirtualEntry> and FsWorker wraps AugmentFs<PassthroughFs> / AugmentFs<PassthroughFsRo>. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- Cargo.lock | 2 +- include/libkrun.h | 672 ++++++++++-------- src/devices/Cargo.toml | 1 - src/devices/src/virtio/fs/augment_fs.rs | 614 ++++++++++++++++ src/devices/src/virtio/fs/device.rs | 6 + .../src/virtio/fs/linux/passthrough.rs | 77 +- .../src/virtio/fs/macos/passthrough.rs | 49 +- src/devices/src/virtio/fs/mod.rs | 2 + src/devices/src/virtio/fs/virtual_inode.rs | 37 + src/devices/src/virtio/fs/worker.rs | 27 +- src/libkrun/Cargo.toml | 1 + src/libkrun/src/lib.rs | 26 +- src/vmm/src/builder.rs | 13 +- src/vmm/src/vmm_config/fs.rs | 4 +- 14 files changed, 1072 insertions(+), 459 deletions(-) create mode 100644 src/devices/src/virtio/fs/augment_fs.rs create mode 100644 src/devices/src/virtio/fs/virtual_inode.rs diff --git a/Cargo.lock b/Cargo.lock index f97077330..3b40bfbc7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -703,7 +703,6 @@ dependencies = [ "caps", "crossbeam-channel", "imago", - "init-blob", "krun-arch", "krun-display", "krun-hvf", @@ -890,6 +889,7 @@ version = "1.18.0" dependencies = [ "crossbeam-channel", "env_logger", + "init-blob", "krun-aws-nitro", "krun-devices", "krun-display", diff --git a/include/libkrun.h b/include/libkrun.h index 87d5e1fa1..c0b09e25f 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -6,8 +6,8 @@ extern "C" { #endif #include -#include #include +#include #include /** @@ -27,7 +27,6 @@ extern "C" { */ int32_t krun_set_log_level(uint32_t level); - #define KRUN_LOG_TARGET_DEFAULT -1 #define KRUN_LOG_LEVEL_OFF 0 @@ -47,23 +46,27 @@ int32_t krun_set_log_level(uint32_t level); * Initializes logging for the library. 
* * Arguments: - * "target_fd" - File descriptor to write log to. Note that using a file descriptor pointing to a regular file on - * filesystem might slow down the VM. - * Use KRUN_LOG_TARGET_DEFAULT to use the default target for log output (stderr). + * "target_fd" - File descriptor to write log to. Note that using a file + * descriptor pointing to a regular file on filesystem might slow down the VM. + * Use KRUN_LOG_TARGET_DEFAULT to use the default target for log + * output (stderr). * - * "level" - Level is an integer specifying the level of verbosity, higher number means more verbose log. - * The log levels are described by the constants: KRUN_LOG_LEVEL_{OFF, ERROR, WARN, INFO, DEBUG, TRACE} + * "level" - Level is an integer specifying the level of verbosity, higher + * number means more verbose log. The log levels are described by the constants: + * KRUN_LOG_LEVEL_{OFF, ERROR, WARN, INFO, DEBUG, TRACE} * - * "style" - Enable/disable usage of terminal escape sequences (to display colors) - * One of: KRUN_LOG_STYLE_{AUTO, ALWAYS, NEVER}. + * "style" - Enable/disable usage of terminal escape sequences (to display + * colors) One of: KRUN_LOG_STYLE_{AUTO, ALWAYS, NEVER}. * * "options" - Bitmask of logging options, use 0 for default options. - * KRUN_LOG_OPTION_NO_ENV to disallow environment variables to override these settings. + * KRUN_LOG_OPTION_NO_ENV to disallow environment variables to + * override these settings. * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_init_log(int target_fd, uint32_t level, uint32_t style, uint32_t options); +int32_t krun_init_log(int target_fd, uint32_t level, uint32_t style, + uint32_t options); /** * Creates a configuration context. @@ -95,23 +98,27 @@ int32_t krun_free_ctx(uint32_t ctx_id); * Returns: * Zero on success or a negative error number on failure. 
*/ -int32_t krun_set_vm_config(uint32_t ctx_id, uint8_t num_vcpus, uint32_t ram_mib); +int32_t krun_set_vm_config(uint32_t ctx_id, uint8_t num_vcpus, + uint32_t ram_mib); /** - * The virtiofs tag used for the root filesystem. Can be used with krun_add_virtiofs* - * for more control over root filesystem parameters (e.g. read-only, DAX window size). + * The virtiofs tag used for the root filesystem. Can be used with + * krun_add_virtiofs* for more control over root filesystem parameters (e.g. + * read-only, DAX window size). */ #define KRUN_FS_ROOT_TAG "/dev/root" /** - * Sets the path to be use as root for the microVM. Not available in libkrun-SEV. + * Sets the path to be used as root for the microVM. Not available in + * libkrun-SEV. * * For more control over the root filesystem (e.g. read-only, DAX window size), * use krun_add_virtiofs3() with KRUN_FS_ROOT_TAG instead. * * Arguments: * "ctx_id" - the configuration context ID. - * "root_path" - a null-terminated string representing the path to be used as root. + * "root_path" - a null-terminated string representing the path to be used as + * root. * * Returns: * Zero on success or a negative error number on failure. @@ -121,13 +128,13 @@ int32_t krun_set_root(uint32_t ctx_id, const char *root_path); /** * DEPRECATED. Use krun_add_disk instead. * - * Sets the path to the disk image that contains the file-system to be used as root for the microVM. - * The only supported image format is "raw". + * Sets the path to the disk image that contains the file-system to be used as + * root for the microVM. The only supported image format is "raw". * * Arguments: * "ctx_id" - the configuration context ID. - * "disk_path" - a null-terminated string representing the path leading to the disk image that - * contains the root file-system. + * "disk_path" - a null-terminated string representing the path leading to the + * disk image that contains the root file-system. 
* * Returns: * Zero on success or a negative error number on failure. @@ -142,8 +149,8 @@ int32_t krun_set_root_disk(uint32_t ctx_id, const char *disk_path); * * Arguments: * "ctx_id" - the configuration context ID. - * "disk_path" - a null-terminated string representing the path leading to the disk image that - * contains the root file-system. + * "disk_path" - a null-terminated string representing the path leading to the + * disk image that contains the root file-system. * * Returns: * Zero on success or a negative error number on failure. @@ -151,27 +158,29 @@ int32_t krun_set_root_disk(uint32_t ctx_id, const char *disk_path); int32_t krun_set_data_disk(uint32_t ctx_id, const char *disk_path); /** - * Adds a disk image to be used as a general partition for the microVM. The only supported image - * format is "raw". + * Adds a disk image to be used as a general partition for the microVM. The only + * supported image format is "raw". * * This API is mutually exclusive with the deprecated krun_set_root_disk and * krun_set_data_disk methods and must not be used together. * - * This function deliberately only handles images in the Raw format, because it doesn't allow - * specifying an image format, and probing an image's format is dangerous. For more information, - * see the security note on `krun_add_disk2`, which allows opening non-Raw images. + * This function deliberately only handles images in the Raw format, because it + * doesn't allow specifying an image format, and probing an image's format is + * dangerous. For more information, see the security note on `krun_add_disk2`, + * which allows opening non-Raw images. * * Arguments: * "ctx_id" - the configuration context ID. * "block_id" - a null-terminated string representing the partition. - * "disk_path" - a null-terminated string representing the path leading to the disk image. - * "read_only" - whether the mount should be read-only. 
Required if the caller does not have - * write permissions (for disk images in /usr/share). + * "disk_path" - a null-terminated string representing the path leading to the + * disk image. "read_only" - whether the mount should be read-only. Required if + * the caller does not have write permissions (for disk images in /usr/share). * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_disk(uint32_t ctx_id, const char *block_id, const char *disk_path, bool read_only); +int32_t krun_add_disk(uint32_t ctx_id, const char *block_id, + const char *disk_path, bool read_only); /* Supported disk image formats */ #define KRUN_DISK_FORMAT_RAW 0 @@ -180,73 +189,77 @@ int32_t krun_add_disk(uint32_t ctx_id, const char *block_id, const char *disk_pa #define KRUN_DISK_FORMAT_VMDK 2 /** - * Adds a disk image to be used as a general partition for the microVM. The supported - * image formats are: "raw" and "qcow2". + * Adds a disk image to be used as a general partition for the microVM. The + * supported image formats are: "raw" and "qcow2". * * This API is mutually exclusive with the deprecated krun_set_root_disk and * krun_set_data_disk methods and must not be used together. * * SECURITY NOTE: - * Non-Raw images can reference other files, which libkrun will automatically open, and to which the - * guest will have access. Libkrun should therefore never be asked to open an image in a non-Raw - * format when it doesn't come from a fully trustworthy source. - * - * Consequently, probing an image's format is quite dangerous and to be avoided if at all possible, - * which is why libkrun provides no facilities for doing so. If it's not clear what format an image - * has, it may also not be clear whether it can be trusted to not reference files to which the guest - * shouldn't have access. - * - * If probing absolutely can't be avoided, it must only be done on images that are fully trusted, i.e. 
- * before a potentially untrusted guest had write access to it. Specifically, consider that a guest has - * full access to all of a Raw image, and can therefore turn it into a file in an arbitrary format, for - * example, into a Qcow2 image, referencing and granting a malicious guest access to arbitrary files. - * To hand a Raw image to an untrusted and potentially malicious guest, and then to re-probe it after - * the guest was able to write to it (when it can no longer be trusted), would therefore be a severe - * security vulnerability. - * - * Therefore, after having probed a yet fully trusted image once, the result must be remembered so the - * image will from then on always be opened in the format that was detected originally. When adhering - * to this, a guest can write anything they want to a Raw image, it's always going to be opened as a - * Raw image, preventing the security vulnerability outlined above. - * - * However, if at all possible, the image format should be explicitly selected based on knowledge - * obtained separately from the pure image data, for example by the user. + * Non-Raw images can reference other files, which libkrun will automatically + * open, and to which the guest will have access. Libkrun should therefore never + * be asked to open an image in a non-Raw format when it doesn't come from a + * fully trustworthy source. + * + * Consequently, probing an image's format is quite dangerous and to be avoided + * if at all possible, which is why libkrun provides no facilities for doing so. + * If it's not clear what format an image has, it may also not be clear whether + * it can be trusted to not reference files to which the guest shouldn't have + * access. + * + * If probing absolutely can't be avoided, it must only be done on images that + * are fully trusted, i.e. before a potentially untrusted guest had write access + * to it. 
Specifically, consider that a guest has full access to all of a Raw + * image, and can therefore turn it into a file in an arbitrary format, for + * example, into a Qcow2 image, referencing and granting a malicious guest + * access to arbitrary files. To hand a Raw image to an untrusted and + * potentially malicious guest, and then to re-probe it after the guest was able + * to write to it (when it can no longer be trusted), would therefore be a + * severe security vulnerability. + * + * Therefore, after having probed a yet fully trusted image once, the result + * must be remembered so the image will from then on always be opened in the + * format that was detected originally. When adhering to this, a guest can write + * anything they want to a Raw image, it's always going to be opened as a Raw + * image, preventing the security vulnerability outlined above. + * + * However, if at all possible, the image format should be explicitly selected + * based on knowledge obtained separately from the pure image data, for example + * by the user. * * Arguments: * "ctx_id" - the configuration context ID. * "block_id" - a null-terminated string representing the partition. - * "disk_path" - a null-terminated string representing the path leading to the disk image. - * "disk_format" - the disk image format (i.e. KRUN_DISK_FORMAT_{RAW, QCOW2}) - * "read_only" - whether the mount should be read-only. Required if the caller does not have - * write permissions (for disk images in /usr/share). + * "disk_path" - a null-terminated string representing the path leading to + * the disk image. "disk_format" - the disk image format (i.e. + * KRUN_DISK_FORMAT_{RAW, QCOW2}) "read_only" - whether the mount should be + * read-only. Required if the caller does not have write permissions (for disk + * images in /usr/share). * * Returns: * Zero on success or a negative error number on failure. 
*/ -int32_t krun_add_disk2(uint32_t ctx_id, - const char *block_id, - const char *disk_path, - uint32_t disk_format, +int32_t krun_add_disk2(uint32_t ctx_id, const char *block_id, + const char *disk_path, uint32_t disk_format, bool read_only); - /* Supported sync modes */ /** * Ignore VIRTIO_BLK_F_FLUSH. - * WARNING: may lead to loss of data - */ + * WARNING: may lead to loss of data + */ #define KRUN_SYNC_NONE 0 /** - * Honor VIRTIO_BLK_F_FLUSH requests, but relax strict hardware syncing on macOS. - * This is the recommended mode. + * Honor VIRTIO_BLK_F_FLUSH requests, but relax strict hardware syncing on + * macOS. This is the recommended mode. * * On macOS this flushes the OS buffers, but does not ask the drive to flush - * its buffered data, which significantly improves performance. + * its buffered data, which significantly improves performance. * On Linux this is the same as full sync. */ #define KRUN_SYNC_RELAXED 1 -/** +/** * Honor VIRTIO_BLK_F_FLUSH, strictly flushing buffers to physical disk. */ #define KRUN_SYNC_FULL 2 @@ -263,93 +276,91 @@ int32_t krun_add_disk2(uint32_t ctx_id, * Arguments: * "ctx_id" - the configuration context ID. * "block_id" - a null-terminated string representing the partition. - * "disk_path" - a null-terminated string representing the path leading to the disk image. - * "disk_format" - the disk image format (i.e. KRUN_DISK_FORMAT_{RAW, QCOW2}) - * "read_only" - whether the mount should be read-only. Required if the caller does not have - * write permissions (for disk images in /usr/share). - * "direct_io" - whether to bypass the host caches. - * "sync_mode" - whether to enable VIRTIO_BLK_F_FLUSH. On macOS, an additional relaxed sync - * mode is available, which is enabled by default, and will not ask the drive - * to flush its buffered data. + * "disk_path" - a null-terminated string representing the path leading to + * the disk image. "disk_format" - the disk image format (i.e. 
+ * KRUN_DISK_FORMAT_{RAW, QCOW2}) "read_only" - whether the mount should be + * read-only. Required if the caller does not have write permissions (for disk + * images in /usr/share). "direct_io" - whether to bypass the host caches. + * "sync_mode" - whether to enable VIRTIO_BLK_F_FLUSH. On macOS, an + * additional relaxed sync mode is available, which is enabled by default, and + * will not ask the drive to flush its buffered data. * * Returns: * Zero on success or a negative error number on failure. */ - int32_t krun_add_disk3(uint32_t ctx_id, - const char *block_id, - const char *disk_path, - uint32_t disk_format, - bool read_only, - bool direct_io, - uint32_t sync_mode); +int32_t krun_add_disk3(uint32_t ctx_id, const char *block_id, + const char *disk_path, uint32_t disk_format, + bool read_only, bool direct_io, uint32_t sync_mode); /** * NO LONGER SUPPORTED. DO NOT USE. * - * Configures the mapped volumes for the microVM. Only supported on macOS, on Linux use - * user_namespaces and bind-mounts instead. Not available in libkrun-SEV. + * Configures the mapped volumes for the microVM. Only supported on macOS, on + * Linux use user_namespaces and bind-mounts instead. Not available in + * libkrun-SEV. * * Arguments: * "ctx_id" - the configuration context ID. - * "mapped_volumes" - an array of string pointers with format "host_path:guest_path" representing - * the volumes to be mapped inside the microVM + * "mapped_volumes" - an array of string pointers with format + * "host_path:guest_path" representing the volumes to be mapped inside the + * microVM * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_set_mapped_volumes(uint32_t ctx_id, const char *const mapped_volumes[]); +int32_t krun_set_mapped_volumes(uint32_t ctx_id, + const char *const mapped_volumes[]); /** - * Adds an independent virtio-fs device pointing to a host's directory with a tag. 
+ * Adds an independent virtio-fs device pointing to a host's directory with a + * tag. * * Arguments: * "ctx_id" - the configuration context ID. * "c_tag" - tag to identify the filesystem in the guest. - * "c_path" - full path to the directory in the host to be exposed to the guest. + * "c_path" - full path to the directory in the host to be exposed to + * the guest. * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_virtiofs(uint32_t ctx_id, - const char *c_tag, +int32_t krun_add_virtiofs(uint32_t ctx_id, const char *c_tag, const char *c_path); /** - * Adds an independent virtio-fs device pointing to a host's directory with a tag. This - * variant allows specifying the size of the DAX window. + * Adds an independent virtio-fs device pointing to a host's directory with a + * tag. This variant allows specifying the size of the DAX window. * * Arguments: * "ctx_id" - the configuration context ID. * "c_tag" - tag to identify the filesystem in the guest. - * "c_path" - full path to the directory in the host to be exposed to the guest. - * "shm_size" - size of the DAX SHM window in bytes. + * "c_path" - full path to the directory in the host to be exposed to + * the guest. "shm_size" - size of the DAX SHM window in bytes. * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_virtiofs2(uint32_t ctx_id, - const char *c_tag, - const char *c_path, - uint64_t shm_size); +int32_t krun_add_virtiofs2(uint32_t ctx_id, const char *c_tag, + const char *c_path, uint64_t shm_size); /** - * Adds an independent virtio-fs device pointing to a host's directory with a tag. This - * variant allows specifying the size of the DAX window and a read-only flag. + * Adds an independent virtio-fs device pointing to a host's directory with a + * tag. This variant allows specifying the size of the DAX window and a + * read-only flag. * * Arguments: * "ctx_id" - the configuration context ID. 
* "c_tag" - tag to identify the filesystem in the guest. - * "c_path" - full path to the directory in the host to be exposed to the guest. - * "shm_size" - size of the DAX SHM window in bytes. - * "read_only" - if true, the filesystem will be exposed as read-only to the guest. + * "c_path" - full path to the directory in the host to be exposed to + * the guest. "shm_size" - size of the DAX SHM window in bytes. + * "read_only" - if true, the filesystem will be exposed as read-only to + * the guest. * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_virtiofs3(uint32_t ctx_id, - const char *c_tag, - const char *c_path, - uint64_t shm_size, +int32_t krun_add_virtiofs3(uint32_t ctx_id, const char *c_tag, + const char *c_path, uint64_t shm_size, bool read_only); /* Send the VFKIT magic after establishing the connection, @@ -358,8 +369,8 @@ int32_t krun_add_virtiofs3(uint32_t ctx_id, #define NET_FLAG_DHCP_CLIENT (1 << 1) /* TSI (Transparent Socket Impersonation) feature flags for vsock */ -#define KRUN_TSI_HIJACK_INET (1 << 0) -#define KRUN_TSI_HIJACK_UNIX (1 << 1) +#define KRUN_TSI_HIJACK_INET (1 << 0) +#define KRUN_TSI_HIJACK_UNIX (1 << 1) /* Taken from uapi/linux/virtio_net.h */ #define NET_FEATURE_CSUM 1 << 0 @@ -371,10 +382,11 @@ int32_t krun_add_virtiofs3(uint32_t ctx_id, #define NET_FEATURE_HOST_TSO6 1 << 12 #define NET_FEATURE_HOST_UFO 1 << 14 -/* These are the features enabled by krun_set_passt_fd and krun_set_gvproxy_path. */ -#define COMPAT_NET_FEATURES NET_FEATURE_CSUM | NET_FEATURE_GUEST_CSUM | \ - NET_FEATURE_GUEST_TSO4 | NET_FEATURE_GUEST_UFO | \ - NET_FEATURE_HOST_TSO4 | NET_FEATURE_HOST_UFO +/* These are the features enabled by krun_set_passt_fd and + * krun_set_gvproxy_path. 
*/ +#define COMPAT_NET_FEATURES \ + NET_FEATURE_CSUM | NET_FEATURE_GUEST_CSUM | NET_FEATURE_GUEST_TSO4 | \ + NET_FEATURE_GUEST_UFO | NET_FEATURE_HOST_TSO4 | NET_FEATURE_HOST_UFO /** * Adds an independent virtio-net device connected to a * unixstream-based userspace network proxy, such as passt or @@ -411,11 +423,8 @@ int32_t krun_add_virtiofs3(uint32_t ctx_id, * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_net_unixstream(uint32_t ctx_id, - const char *c_path, - int fd, - uint8_t *const c_mac, - uint32_t features, +int32_t krun_add_net_unixstream(uint32_t ctx_id, const char *c_path, int fd, + uint8_t *const c_mac, uint32_t features, uint32_t flags); /** @@ -455,11 +464,8 @@ int32_t krun_add_net_unixstream(uint32_t ctx_id, * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_net_unixgram(uint32_t ctx_id, - const char *c_path, - int fd, - uint8_t *const c_mac, - uint32_t features, +int32_t krun_add_net_unixgram(uint32_t ctx_id, const char *c_path, int fd, + uint8_t *const c_mac, uint32_t features, uint32_t flags); /** @@ -486,10 +492,8 @@ int32_t krun_add_net_unixgram(uint32_t ctx_id, * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_net_tap(uint32_t ctx_id, - char *c_tap_name, - uint8_t *const c_mac, - uint32_t features, +int32_t krun_add_net_tap(uint32_t ctx_id, char *c_tap_name, + uint8_t *const c_mac, uint32_t features, uint32_t flags); /** @@ -556,17 +560,18 @@ int32_t krun_set_net_mac(uint32_t ctx_id, uint8_t *const c_mac); * -ENOTSUP when passt networking is used * * Notes: - * Passing NULL (or not calling this function) as "port_map" has a different meaning than - * passing an empty array. The first one will instruct libkrun to attempt to expose all - * listening ports in the guest to the host, while the second means that no port from - * the guest will be exposed to host. 
+ * Passing NULL (or not calling this function) as "port_map" has a different + * meaning than passing an empty array. The first one will instruct libkrun to + * attempt to expose all listening ports in the guest to the host, while the + * second means that no port from the guest will be exposed to host. * - * Exposed ports will only become accessible by their "host_port" in the guest too. This - * means that for a map such as "8080:80", applications running inside the guest will also - * need to access the service through the "8080" port. + * Exposed ports will only become accessible by their "host_port" in the guest + * too. This means that for a map such as "8080:80", applications running inside + * the guest will also need to access the service through the "8080" port. * - * If past networking mode is used (krun_set_passt_fd was called), port mapping is not supported - * as an API of libkrun (but you can still do port mapping using command line arguments of passt) + * If past networking mode is used (krun_set_passt_fd was called), port mapping + * is not supported as an API of libkrun (but you can still do port mapping + * using command line arguments of passt) */ int32_t krun_set_port_map(uint32_t ctx_id, const char *const port_map[]); @@ -606,17 +611,18 @@ int32_t krun_set_gpu_options(uint32_t ctx_id, uint32_t virgl_flags); * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_set_gpu_options2(uint32_t ctx_id, - uint32_t virgl_flags, +int32_t krun_set_gpu_options2(uint32_t ctx_id, uint32_t virgl_flags, uint64_t shm_size); -/* Maximum number of displays. Same as VIRTIO_GPU_MAX_SCANOUTS defined in the virtio-gpu spec */ +/* Maximum number of displays. Same as VIRTIO_GPU_MAX_SCANOUTS defined in the + * virtio-gpu spec */ #define KRUN_MAX_DISPLAYS 16 /** * Configure a display output for the VM. * - * Note that to have display output a display backend must also be set (see krun_set_display_backend). 
+ * Note that to have display output a display backend must also be set (see + * krun_set_display_backend). * * Arguments: * "ctx_id" - the configuration context ID. @@ -624,17 +630,19 @@ int32_t krun_set_gpu_options2(uint32_t ctx_id, * "height" - the height of the window/display * * Returns: - * The id of the display (0 to KRUN_MAX_DISPLAYS - 1) on success or a negative error number on failure. + * The id of the display (0 to KRUN_MAX_DISPLAYS - 1) on success or a negative + * error number on failure. */ int32_t krun_add_display(uint32_t ctx_id, uint32_t width, uint32_t height); /** * Configure a custom EDID blob for a display * - * This replaces the generated EDID with a custom one. Configuring an EDID blob makes all display parameters except - * width and height ignored. + * This replaces the generated EDID with a custom one. Configuring an EDID blob + * makes all display parameters except width and height ignored. * - * Note that libkrun doesn't do any checks if the EDID matches the width/height specified in krun_add_display(). + * Note that libkrun doesn't do any checks if the EDID matches the width/height + * specified in krun_add_display(). * * Arguments: * "ctx_id" - the configuration context ID. @@ -645,7 +653,8 @@ int32_t krun_add_display(uint32_t ctx_id, uint32_t width, uint32_t height); * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_display_set_edid(uint32_t ctx_id, uint32_t display_id, const uint8_t* edid_blob, size_t blob_size); +int32_t krun_display_set_edid(uint32_t ctx_id, uint32_t display_id, + const uint8_t *edid_blob, size_t blob_size); /** * Configure DPI of the display reported to the guest @@ -660,12 +669,14 @@ int32_t krun_display_set_edid(uint32_t ctx_id, uint32_t display_id, const uint8_ * Returns: * Zero on success or a negative error number on failure. 
*/ -int32_t krun_display_set_dpi(uint32_t ctx_id, uint32_t display_id, uint32_t dpi); +int32_t krun_display_set_dpi(uint32_t ctx_id, uint32_t display_id, + uint32_t dpi); /** * Configure physical size of the display reported to the guest * - * This overrides the physical size of the display set by krun_set_display_physical_size() + * This overrides the physical size of the display set by + * krun_set_display_physical_size() * * Arguments: * "ctx_id" - the configuration context ID. @@ -676,7 +687,8 @@ int32_t krun_display_set_dpi(uint32_t ctx_id, uint32_t display_id, uint32_t dpi) * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_display_set_physical_size(uint32_t ctx_id, uint32_t display_id, uint16_t width_mm, uint16_t height_mm); +int32_t krun_display_set_physical_size(uint32_t ctx_id, uint32_t display_id, + uint16_t width_mm, uint16_t height_mm); /** * Configure refresh rate for a display @@ -690,10 +702,12 @@ int32_t krun_display_set_physical_size(uint32_t ctx_id, uint32_t display_id, uin * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_display_set_refresh_rate(uint32_t ctx_id, uint32_t display_id, uint32_t refresh_rate); +int32_t krun_display_set_refresh_rate(uint32_t ctx_id, uint32_t display_id, + uint32_t refresh_rate); /** - * Configures a krun_display_backend struct to be used for display output. (see libkrun_display.h) + * Configures a krun_display_backend struct to be used for display output. (see + * libkrun_display.h) * * Arguments: * "ctx_id" - the configuration context ID @@ -703,8 +717,8 @@ int32_t krun_display_set_refresh_rate(uint32_t ctx_id, uint32_t display_id, uint * Returns: * Zero on success or a negative error number (errno) on failure. 
*/ -int32_t krun_set_display_backend(uint32_t ctx_id, const void *display_backend, size_t backend_size); - +int32_t krun_set_display_backend(uint32_t ctx_id, const void *display_backend, + size_t backend_size); /** * Adds an input device with separate config and events objects. @@ -719,13 +733,16 @@ int32_t krun_set_display_backend(uint32_t ctx_id, const void *display_backend, s * Returns: * Zero on success or a negative error code otherwise. */ -int krun_add_input_device(uint32_t ctx_id, const void *config_backend, size_t config_backend_size, - const void *events_backend, size_t events_backend_size); +int krun_add_input_device(uint32_t ctx_id, const void *config_backend, + size_t config_backend_size, + const void *events_backend, + size_t events_backend_size); /** * Creates a passthrough input device from a host /dev/input/* file descriptor. - * The device configuration will be automatically queried from the host device using ioctls. - * + * The device configuration will be automatically queried from the host device + * using ioctls. + * * Arguments: * "ctx_id" - The krun context * "input_fd" - File descriptor to a /dev/input/* device on the host @@ -740,7 +757,8 @@ int krun_add_input_device_fd(uint32_t ctx_id, int input_fd); * * Arguments: * "ctx_id" - the configuration context ID. - * "enable" - boolean indicating whether virtio-snd should be enabled or disabled. + * "enable" - boolean indicating whether virtio-snd should be enabled or + * disabled. * * Returns: * Zero on success or a negative error number on failure. @@ -762,7 +780,8 @@ int32_t krun_set_snd_device(uint32_t ctx_id, bool enable); /** * Vhost-user console device default queue configuration. * Console device uses 4 queues for multiport support: - * receiveq (idx 0), transmitq (idx 1), control receiveq (idx 2), control transmitq (idx 3). + * receiveq (idx 0), transmitq (idx 1), control receiveq (idx 2), control + * transmitq (idx 3). 
*/ #define KRUN_VHOST_USER_CONSOLE_NUM_QUEUES 4 #define KRUN_VHOST_USER_CONSOLE_QUEUE_SIZES ((uint16_t[]){128, 128, 64, 64}) @@ -790,7 +809,8 @@ int32_t krun_set_snd_device(uint32_t ctx_id, bool enable); /** * Vhost-user sound device default queue configuration. - * Sound device uses 4 queues: control (idx 0), event (idx 1), TX/playback (idx 2), RX/capture (idx 3). + * Sound device uses 4 queues: control (idx 0), event (idx 1), TX/playback (idx + * 2), RX/capture (idx 3). */ #define KRUN_VHOST_USER_SND_NUM_QUEUES 4 #define KRUN_VHOST_USER_SND_QUEUE_SIZES ((uint16_t[]){64, 64, 64, 64}) @@ -820,36 +840,38 @@ int32_t krun_set_snd_device(uint32_t ctx_id, bool enable); * * Arguments: * "ctx_id" - the configuration context ID. - * "device_type" - type of vhost-user device (e.g., KRUN_VHOST_USER_DEVICE_RNG). - * "socket_path" - path to the vhost-user Unix domain socket (e.g., "/tmp/vhost-rng.sock"). - * "name" - device name for logging/debugging (e.g., "vhost-rng", "vhost-snd"). - * NULL = auto-generate from device_type ("vhost-user-4", "vhost-user-25", etc.) - * "num_queues" - number of virtqueues. - * 0 = auto-detect from backend (requires backend MQ support). - * >0 = explicit queue count. - * Or use device-specific constants like KRUN_VHOST_USER_RNG_NUM_QUEUES. - * "queue_sizes" - array of queue sizes for each queue. - * NULL = use default size (256) for all queues. - * When num_queues=0 (auto-detect): array must be 0-terminated (sentinel). - * When num_queues>0 (explicit): array must have exactly num_queues elements. - * Use device-specific constants like KRUN_VHOST_USER_RNG_QUEUE_SIZES for defaults. + * "device_type" - type of vhost-user device (e.g., + * KRUN_VHOST_USER_DEVICE_RNG). "socket_path" - path to the vhost-user Unix + * domain socket (e.g., "/tmp/vhost-rng.sock"). "name" - device name for + * logging/debugging (e.g., "vhost-rng", "vhost-snd"). NULL = auto-generate from + * device_type ("vhost-user-4", "vhost-user-25", etc.) 
"num_queues" - number + * of virtqueues. 0 = auto-detect from backend (requires backend MQ support). >0 + * = explicit queue count. Or use device-specific constants like + * KRUN_VHOST_USER_RNG_NUM_QUEUES. "queue_sizes" - array of queue sizes for + * each queue. NULL = use default size (256) for all queues. When num_queues=0 + * (auto-detect): array must be 0-terminated (sentinel). When num_queues>0 + * (explicit): array must have exactly num_queues elements. Use device-specific + * constants like KRUN_VHOST_USER_RNG_QUEUE_SIZES for defaults. * * Examples: * // Auto-detect queue count, use default size (256) - * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_RNG, "/tmp/rng.sock", NULL, 0, NULL); + * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_RNG, "/tmp/rng.sock", + * NULL, 0, NULL); * * // Auto-detect queue count, use custom size (512) for all queues * uint16_t custom_size[] = {512, 0}; // 0 = sentinel terminator - * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_RNG, "/tmp/rng.sock", NULL, 0, custom_size); + * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_RNG, "/tmp/rng.sock", + * NULL, 0, custom_size); * * // Explicit defaults using #define constants - * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_RNG, "/tmp/rng.sock", "vhost-rng", - * KRUN_VHOST_USER_RNG_NUM_QUEUES, + * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_RNG, "/tmp/rng.sock", + * "vhost-rng", KRUN_VHOST_USER_RNG_NUM_QUEUES, * KRUN_VHOST_USER_RNG_QUEUE_SIZES); * * // Explicit queue count with custom sizes * uint16_t sizes[] = {256, 512}; - * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_SND, "/tmp/snd.sock", "vhost-snd", 2, sizes); + * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_SND, "/tmp/snd.sock", + * "vhost-snd", 2, sizes); * * Returns: * Zero on success or a negative error number on failure. 
@@ -857,19 +879,19 @@ int32_t krun_set_snd_device(uint32_t ctx_id, bool enable); * -ENOENT - Context doesn't exist * -ENOTSUP - vhost-user support not compiled in */ -int32_t krun_add_vhost_user_device(uint32_t ctx_id, - uint32_t device_type, - const char *socket_path, - const char *name, +int32_t krun_add_vhost_user_device(uint32_t ctx_id, uint32_t device_type, + const char *socket_path, const char *name, uint16_t num_queues, const uint16_t *queue_sizes); /** - * Configures a map of rlimits to be set in the guest before starting the isolated binary. + * Configures a map of rlimits to be set in the guest before starting the + * isolated binary. * * Arguments: * "ctx_id" - the configuration context ID. - * "rlimits" - an array of string pointers with format "RESOURCE=RLIM_CUR:RLIM_MAX". + * "rlimits" - an array of string pointers with format + * "RESOURCE=RLIM_CUR:RLIM_MAX". * * Returns: * Zero on success or a negative error number on failure. @@ -881,53 +903,55 @@ int32_t krun_set_rlimits(uint32_t ctx_id, const char *const rlimits[]); * * Arguments: * "ctx_id" - the configuration context ID. - * "oem_strings" - an array of string pointers. Must be terminated with an additional NULL pointer. + * "oem_strings" - an array of string pointers. Must be terminated with an + * additional NULL pointer. * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_set_smbios_oem_strings(uint32_t ctx_id, const char *const oem_strings[]); +int32_t krun_set_smbios_oem_strings(uint32_t ctx_id, + const char *const oem_strings[]); /** * Sets the working directory for the executable to be run inside the microVM. * * Arguments: * "ctx_id" - the configuration context ID. - * "workdir_path" - the path to the working directory, relative to the root configured with - * "krun_set_root". + * "workdir_path" - the path to the working directory, relative to the root + * configured with "krun_set_root". 
* * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_set_workdir(uint32_t ctx_id, - const char *workdir_path); +int32_t krun_set_workdir(uint32_t ctx_id, const char *workdir_path); /** - * Sets the path to the executable to be run inside the microVM, the arguments to be passed to the - * executable, and the environment variables to be configured in the context of the executable. + * Sets the path to the executable to be run inside the microVM, the arguments + * to be passed to the executable, and the environment variables to be + * configured in the context of the executable. * * Arguments: * "ctx_id" - the configuration context ID. - * "exec_path" - the path to the executable, relative to the root configured with "krun_set_root". - * "argv" - an array of string pointers to be passed as arguments. - * "envp" - an array of string pointers to be injected as environment variables into the - * context of the executable. If NULL, it will auto-generate an array collecting the - * the variables currently present in the environment. + * "exec_path" - the path to the executable, relative to the root configured + * with "krun_set_root". "argv" - an array of string pointers to be passed + * as arguments. "envp" - an array of string pointers to be injected as + * environment variables into the context of the executable. If NULL, it will + * auto-generate an array collecting the the variables currently present in the + * environment. * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_set_exec(uint32_t ctx_id, - const char *exec_path, - const char *const argv[], - const char *const envp[]); +int32_t krun_set_exec(uint32_t ctx_id, const char *exec_path, + const char *const argv[], const char *const envp[]); /** * Sets the path to the firmware to be loaded into the microVM. * * Arguments: * "ctx_id" - the configuration context ID. - * "firmware_path" - the path to the firmware, relative to the host's filesystem. 
+ * "firmware_path" - the path to the firmware, relative to the host's + * filesystem. * * * Returns: @@ -948,16 +972,14 @@ int32_t krun_set_firmware(uint32_t ctx_id, const char *firmware_path); * "ctx_id" - the configuration context ID. * "kernel_path" - the path to the kernel, relative to the host's filesystem. * "kernel_format" - the kernel format. - * "initramfs" - the path to the initramfs, relative to the host's filesystem. - * "cmdline" - the kernel command line. + * "initramfs" - the path to the initramfs, relative to the host's + * filesystem. "cmdline" - the kernel command line. * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_set_kernel(uint32_t ctx_id, - const char *kernel_path, - uint32_t kernel_format, - const char *initramfs, +int32_t krun_set_kernel(uint32_t ctx_id, const char *kernel_path, + uint32_t kernel_format, const char *initramfs, const char *cmdline); /** @@ -965,9 +987,9 @@ int32_t krun_set_kernel(uint32_t ctx_id, * * Arguments: * "ctx_id" - the configuration context ID. - * "envp" - an array of string pointers to be injected as environment variables into the - * context of the executable. If NULL, it will auto-generate an array collecting the - * the variables currently present in the environment. + * "envp" - an array of string pointers to be injected as environment + * variables into the context of the executable. If NULL, it will auto-generate + * an array collecting the the variables currently present in the environment. * * Returns: * Zero on success or a negative error number on failure. @@ -975,11 +997,13 @@ int32_t krun_set_kernel(uint32_t ctx_id, int32_t krun_set_env(uint32_t ctx_id, const char *const envp[]); /** - * Sets the file path to the TEE configuration file. Only available in libkrun-sev. + * Sets the file path to the TEE configuration file. Only available in + * libkrun-sev. * * Arguments: * "ctx_id" - the configuration context ID. 
- * "filepath" - a null-terminated string representing file path to the TEE config file. + * "filepath" - a null-terminated string representing file path to the TEE + * config file. * * Returns: * Zero on success or a negative error number on failure. @@ -995,8 +1019,7 @@ int32_t krun_set_tee_config_file(uint32_t ctx_id, const char *filepath); * "filepath" - a null-terminated string representing the path of the UNIX * socket in the host. */ -int32_t krun_add_vsock_port(uint32_t ctx_id, - uint32_t port, +int32_t krun_add_vsock_port(uint32_t ctx_id, uint32_t port, const char *c_filepath); /** @@ -1007,12 +1030,11 @@ int32_t krun_add_vsock_port(uint32_t ctx_id, * "port" - a vsock port that the guest will connect to for IPC. * "filepath" - a null-terminated string representing the path of the UNIX * socket in the host. - * "listen" - true if guest expects connections to be initiated from host side + * "listen" - true if guest expects connections to be initiated from host + * side */ -int32_t krun_add_vsock_port2(uint32_t ctx_id, - uint32_t port, - const char *c_filepath, - bool listen); +int32_t krun_add_vsock_port2(uint32_t ctx_id, uint32_t port, + const char *c_filepath, bool listen); /** * Add a vsock device with specified TSI features. @@ -1026,8 +1048,8 @@ int32_t krun_add_vsock_port2(uint32_t ctx_id, * * Arguments: * "ctx_id" - the configuration context ID. - * "tsi_features" - bitmask of TSI features (KRUN_TSI_HIJACK_INET, KRUN_TSI_HIJACK_UNIX) - * Use 0 to add vsock without any TSI hijacking. + * "tsi_features" - bitmask of TSI features (KRUN_TSI_HIJACK_INET, + * KRUN_TSI_HIJACK_UNIX) Use 0 to add vsock without any TSI hijacking. * * Returns: * Zero on success or a negative error number on failure. @@ -1035,8 +1057,8 @@ int32_t krun_add_vsock_port2(uint32_t ctx_id, int32_t krun_add_vsock(uint32_t ctx_id, uint32_t tsi_features); /** - * Returns the eventfd file descriptor to signal the guest to shut down orderly. 
This must be - * called before starting the microVM with "krun_start_event". + * Returns the eventfd file descriptor to signal the guest to shut down orderly. + * This must be called before starting the microVM with "krun_start_event". * * Arguments: * "ctx_id" - the configuration context ID. @@ -1047,18 +1069,19 @@ int32_t krun_add_vsock(uint32_t ctx_id, uint32_t tsi_features); int32_t krun_get_shutdown_eventfd(uint32_t ctx_id); /** - * Configures the console device to ignore stdin and write the output to "c_filepath". + * Configures the console device to ignore stdin and write the output to + * "c_filepath". * * Arguments: * "ctx_id" - the configuration context ID. - * "filepath" - a null-terminated string representing the path of the file to write the - * console output. + * "filepath" - a null-terminated string representing the path of the file to + * write the console output. * * Notes: - * This API only applies to the implicitly created console. If the implicit console is - * disabled via `krun_disable_implicit_console` the operation is a NOOP. Additionally, - * this API does not have any effect on consoles created via the `krun_add_*_console_default` - * APIs. + * This API only applies to the implicitly created console. If the implicit + * console is disabled via `krun_disable_implicit_console` the operation is a + * NOOP. Additionally, this API does not have any effect on consoles created via + * the `krun_add_*_console_default` APIs. */ int32_t krun_set_console_output(uint32_t ctx_id, const char *c_filepath); @@ -1102,9 +1125,9 @@ int32_t krun_setgid(uint32_t ctx_id, gid_t gid); * "enabled" - true to enable Nested Virtualization in the microVM. * * Returns: - * Zero on success or a negative error number on failure. Success doesn't imply that - * Nested Virtualization is supported on the system, only that it's going to be requested - * when the microVM is created after calling "krun_start_enter". + * Zero on success or a negative error number on failure. 
Success doesn't imply + * that Nested Virtualization is supported on the system, only that it's going + * to be requested when the microVM is created after calling "krun_start_enter". */ int32_t krun_set_nested_virt(uint32_t ctx_id, bool enabled); @@ -1150,12 +1173,14 @@ int32_t krun_has_feature(uint64_t feature); * Get the maximum number of vCPUs supported by the hypervisor. * * Returns: - * The maximum number of vCPUs that can be created, or a negative error number on failure. + * The maximum number of vCPUs that can be created, or a negative error number + * on failure. */ int32_t krun_get_max_vcpus(void); /** - * Specify whether to split IRQCHIP responsibilities between the host and the guest. + * Specify whether to split IRQCHIP responsibilities between the host and the + * guest. * * Arguments: * "ctx_id" - the configuration context ID. @@ -1163,7 +1188,7 @@ int32_t krun_get_max_vcpus(void); * * Returns: * Zero on success or a negative error number on failure. -*/ + */ int32_t krun_split_irqchip(uint32_t ctx_id, bool enable); /* @@ -1209,18 +1234,21 @@ int32_t krun_set_kernel_console(uint32_t ctx_id, const char *console_id); /* * Adds a virtio-console device to the guest. * - * The function can be called multiple times for adding multiple virtio-console devices. - * In the guest, the consoles will appear in the same order as they are added (that is, - * the first added console will be "hvc0", the second "hvc1", ...). However, if the - * implicit console is not disabled via `krun_disable_implicit_console`, the first - * console created with the function will occupy the "hvc1" ID. - * - * This function attaches a multi port virtio-console to the guest. If the input, output and error - * file descriptors are TTYs, the device will be created with just a single console port (`err_fd` - * is ignored in this case, because error output just goes to the TTY). 
For each of the non-TTY file - * descriptors an additional non-console port is created ("krun-stdin"/"krun-stdout"/"krun-stderr"). - * The libkrun init process in the guest detects the existence of the additional ports and redirects - * the stdin/stdout/stderr of the application in the guest appropriately. + * The function can be called multiple times for adding multiple virtio-console + * devices. In the guest, the consoles will appear in the same order as they are + * added (that is, the first added console will be "hvc0", the second "hvc1", + * ...). However, if the implicit console is not disabled via + * `krun_disable_implicit_console`, the first console created with the function + * will occupy the "hvc1" ID. + * + * This function attaches a multi port virtio-console to the guest. If the + * input, output and error file descriptors are TTYs, the device will be created + * with just a single console port (`err_fd` is ignored in this case, because + * error output just goes to the TTY). For each of the non-TTY file descriptors + * an additional non-console port is created + * ("krun-stdin"/"krun-stdout"/"krun-stderr"). The libkrun init process in the + * guest detects the existence of the additional ports and redirects the + * stdin/stdout/stderr of the application in the guest appropriately. * * Arguments: * "ctx_id" - the configuration context ID. @@ -1231,19 +1259,18 @@ int32_t krun_set_kernel_console(uint32_t ctx_id, const char *console_id); * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_virtio_console_default(uint32_t ctx_id, - int input_fd, - int output_fd, - int err_fd); +int32_t krun_add_virtio_console_default(uint32_t ctx_id, int input_fd, + int output_fd, int err_fd); /* * Adds a legacy serial device to the guest. * * The function can be called multiple times for adding multiple serial devices. 
- * In the guest, the consoles will appear in the same order as they are added (that is, - * the first added console will be "ttyS0", the second "ttyS1", ...). However, if the - * implicit console is not disabled via `krun_disable_implicit_console` on aarch64 or macOS, - * the first console created with the function will occupy the "ttyS1" ID. + * In the guest, the consoles will appear in the same order as they are added + * (that is, the first added console will be "ttyS0", the second "ttyS1", ...). + * However, if the implicit console is not disabled via + * `krun_disable_implicit_console` on aarch64 or macOS, the first console + * created with the function will occupy the "ttyS1" ID. * * Arguments: * "ctx_id" - the configuration context ID. @@ -1253,22 +1280,25 @@ int32_t krun_add_virtio_console_default(uint32_t ctx_id, * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_serial_console_default(uint32_t ctx_id, - int input_fd, - int output_fd); +int32_t krun_add_serial_console_default(uint32_t ctx_id, int input_fd, + int output_fd); /* - * Adds a multi-port virtio-console device to the guest with explicitly configured ports. - * - * This function creates a new virtio-console device that can have multiple ports added to it - * via krun_add_console_port_tty() and krun_add_console_port_inout(). Unlike krun_add_virtio_console_default(), - * this does not do any automatic detections to configure ports based on the file descriptors. - * - * The function can be called multiple times for adding multiple virtio-console devices. - * Each device appears in the guest with port 0 accessible as /dev/hvcN (hvc0, hvc1, etc.) in the order - * devices are added. If the implicit console is not disabled via `krun_disable_implicit_console`, - * the first explicitly added device will occupy the "hvc1" ID. Additional ports within each device - * (port 1, 2, ...) appear as /dev/vportNpM character devices. 
+ * Adds a multi-port virtio-console device to the guest with explicitly + * configured ports. + * + * This function creates a new virtio-console device that can have multiple + * ports added to it via krun_add_console_port_tty() and + * krun_add_console_port_inout(). Unlike krun_add_virtio_console_default(), this + * does not do any automatic detections to configure ports based on the file + * descriptors. + * + * The function can be called multiple times for adding multiple virtio-console + * devices. Each device appears in the guest with port 0 accessible as /dev/hvcN + * (hvc0, hvc1, etc.) in the order devices are added. If the implicit console is + * not disabled via `krun_disable_implicit_console`, the first explicitly added + * device will occupy the "hvc1" ID. Additional ports within each device (port + * 1, 2, ...) appear as /dev/vportNpM character devices. * * Arguments: * "ctx_id" - the configuration context ID. @@ -1281,45 +1311,47 @@ int32_t krun_add_virtio_console_multiport(uint32_t ctx_id); /* * Adds a TTY port to a multi-port virtio-console device. * - * The TTY file descriptor is used for both input and output. This port will be marked with the - * VIRTIO_CONSOLE_CONSOLE_PORT flag, enabling console-specific features notably window resize. + * The TTY file descriptor is used for both input and output. This port will be + * marked with the VIRTIO_CONSOLE_CONSOLE_PORT flag, enabling console-specific + * features notably window resize. 
* * Arguments: * "ctx_id" - the configuration context ID - * "console_id" - the console ID returned by krun_add_virtio_console_multiport() - * "name" - the name of the port for identifying the port in the guest, can be empty ("") - * "tty_fd" - file descriptor for the TTY to use for both input, output, and determining terminal size + * "console_id" - the console ID returned by + * krun_add_virtio_console_multiport() "name" - the name of the port for + * identifying the port in the guest, can be empty ("") "tty_fd" - file + * descriptor for the TTY to use for both input, output, and determining + * terminal size * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_console_port_tty(uint32_t ctx_id, - uint32_t console_id, - const char *name, - int tty_fd); +int32_t krun_add_console_port_tty(uint32_t ctx_id, uint32_t console_id, + const char *name, int tty_fd); /* - * Adds a generic I/O port to a multi-port virtio-console device, suitable for arbitrary bidirectional - * data streams that don't require terminal functionality. + * Adds a generic I/O port to a multi-port virtio-console device, suitable for + * arbitrary bidirectional data streams that don't require terminal + * functionality. * - * This port will NOT be marked with the VIRTIO_CONSOLE_CONSOLE_PORT flag, meaning it won't support - * console-specific features like window resize signals. + * This port will NOT be marked with the VIRTIO_CONSOLE_CONSOLE_PORT flag, + * meaning it won't support console-specific features like window resize + * signals. 
* * Arguments: * "ctx_id" - the configuration context ID - * "console_id" - the console ID returned by krun_add_virtio_console_multiport() - * "name" - the name of the port for identifying the port in the guest, can be empty ("") - * "input_fd" - file descriptor to use for input (host writes, guest reads) - * "output_fd" - file descriptor to use for output (guest writes, host reads) + * "console_id" - the console ID returned by + * krun_add_virtio_console_multiport() "name" - the name of the port for + * identifying the port in the guest, can be empty ("") "input_fd" - file + * descriptor to use for input (host writes, guest reads) "output_fd" - file + * descriptor to use for output (guest writes, host reads) * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_console_port_inout(uint32_t ctx_id, - uint32_t console_id, - const char *name, - int input_fd, - int output_fd); +int32_t krun_add_console_port_inout(uint32_t ctx_id, uint32_t console_id, + const char *name, int input_fd, + int output_fd); /** * Configure block device to be used as root filesystem. @@ -1327,23 +1359,29 @@ int32_t krun_add_console_port_inout(uint32_t ctx_id, * Arguments: * "ctx_id" - the configuration context ID. * "device" - a null-terminated string specifying the root device - * (e.g. "/dev/vda1", must refer to a previously configured block device) - * "fstype" - a null-terminated string specifying the filesystem type (e.g. "ext4", can be set to "auto" or NULL) - * "options" - a null-terminated string with a comma-separated list of mount options (can be NULL) + * (e.g. "/dev/vda1", must refer to a previously configured block + * device) "fstype" - a null-terminated string specifying the filesystem type + * (e.g. 
"ext4", can be set to "auto" or NULL) "options" - a null-terminated + * string with a comma-separated list of mount options (can be NULL) * * Notes: - * This function can be used if you want a root filesystem backed by a block device instead of a virtiofs path. - * Because libkrun uses its own built-in init process (implemented as a virtual file in the virtiofs driver), - * you'd normally have to copy the executable into every filesystem image (or partition) you intend to boot from. - * This is obviously difficult to maintain, so instead we can create a dummy virtiofs root behind the scenes, - * execute init from it as usual and then switch to the actual root configured by this function. + * This function can be used if you want a root filesystem backed by a block + * device instead of a virtiofs path. Because libkrun uses its own built-in init + * process (implemented as a virtual file in the virtiofs driver), you'd + * normally have to copy the executable into every filesystem image (or + * partition) you intend to boot from. This is obviously difficult to maintain, + * so instead we can create a dummy virtiofs root behind the scenes, execute + * init from it as usual and then switch to the actual root configured by this + * function. */ -int32_t krun_set_root_disk_remount(uint32_t ctx_id, const char *device, const char *fstype, const char *options); +int32_t krun_set_root_disk_remount(uint32_t ctx_id, const char *device, + const char *fstype, const char *options); /** - * Starts and enters the microVM with the configured parameters. The VMM will attempt to take over - * stdin/stdout to manage them on behalf of the process running inside the isolated environment, - * simulating that the latter has direct control of the terminal. + * Starts and enters the microVM with the configured parameters. 
The VMM will + * attempt to take over stdin/stdout to manage them on behalf of the process + * running inside the isolated environment, simulating that the latter has + * direct control of the terminal. * * This function consumes the configuration pointed by the context ID. * @@ -1351,15 +1389,17 @@ int32_t krun_set_root_disk_remount(uint32_t ctx_id, const char *device, const ch * "ctx_id" - the configuration context ID. * * Notes: - * This function only returns if an error happens before starting the microVM. Otherwise, the - * VMM assumes it has full control of the process, and will call to exit() with the workload's exit - * code once the microVM shuts down. If an error occurred before running the workload the process - * will exit() with an error exit code. + * This function only returns if an error happens before starting the microVM. + * Otherwise, the VMM assumes it has full control of the process, and will call + * to exit() with the workload's exit code once the microVM shuts down. If an + * error occurred before running the workload the process will exit() with an + * error exit code. * * Error exit codes: * 125 - "init" cannot set up the environment inside the microVM. - * 126 - "init" can find the executable to be run inside the microVM but cannot execute it. - * 127 - "init" cannot find the executable to be run inside the microVM. + * 126 - "init" can find the executable to be run inside the microVM but + * cannot execute it. 127 - "init" cannot find the executable to be run + * inside the microVM. * * Returns: * -EINVAL - The VMM has detected an error in the microVM configuration. 
diff --git a/src/devices/Cargo.toml b/src/devices/Cargo.toml index 89e11cb45..197c01bd3 100644 --- a/src/devices/Cargo.toml +++ b/src/devices/Cargo.toml @@ -39,7 +39,6 @@ vm-memory = { version = "0.17", features = ["backend-mmap"] } zerocopy = { version = "0.8.26", optional = true, features = ["derive"] } krun_display = { package = "krun-display", version = "0.1.0", path = "../display", optional = true, features = ["bindgen_clang_runtime"] } krun_input = { package = "krun-input", version = "0.1.0", path = "../input", features = ["bindgen_clang_runtime"], optional = true } -init-blob = { path = "../init-blob" } arch = { package = "krun-arch", version = "=0.1.0-1.18.0", path = "../arch" } utils = { package = "krun-utils", version = "=0.1.0-1.18.0", path = "../utils" } diff --git a/src/devices/src/virtio/fs/augment_fs.rs b/src/devices/src/virtio/fs/augment_fs.rs new file mode 100644 index 000000000..9c507cc93 --- /dev/null +++ b/src/devices/src/virtio/fs/augment_fs.rs @@ -0,0 +1,614 @@ +// Virtual inode overlay for virtiofs. +// +// `AugmentFs` wraps an inner `FileSystem` implementation and intercepts +// FUSE operations for virtual inodes — synthetic read-only files that exist +// only in memory. All other operations are delegated to the inner filesystem. +// +// Virtual inodes are injected into the root directory (parent = ROOT_ID) and +// are currently only accessible via lookup (they do not appear in readdir). +// +// One-shot files can only be looked up once — the name is removed from the +// directory on first lookup so subsequent lookups return ENOENT. 
+ +#[cfg(target_os = "macos")] +use crossbeam_channel::Sender; +use std::collections::HashMap; +use std::ffi::CStr; +use std::ffi::CString; +use std::io; +use std::sync::atomic::AtomicI32; +use std::sync::Arc; +use std::sync::RwLock; +use std::time::Duration; + +#[cfg(target_os = "macos")] +use utils::worker_message::WorkerMessage; + +use super::filesystem::{ + Context, DirEntry, Entry, Extensions, FileSystem, FsOptions, GetxattrReply, ListxattrReply, + OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, +}; +use super::fuse; +use super::inode_alloc::InodeAllocator; +use super::virtual_inode::{VirtualEntry, VirtualFile}; +use crate::virtio::bindings; + +type Inode = u64; +type Handle = u64; + +/// Sentinel handle returned for virtual file opens. The inner filesystem's +/// handle allocator starts at 1 so this never collides. +const VIRTUAL_HANDLE: Handle = 0; + +/// Virtual entries never change; use a large cache timeout. +const VIRTUAL_TIMEOUT: Duration = Duration::from_secs(86400); + +// Use Linux errno values, not host values. The guest always runs Linux +// and the FUSE server passes error codes through without translation. +const LINUX_ENOENT: i32 = 2; +const LINUX_EACCES: i32 = 13; +const LINUX_EEXIST: i32 = 17; +const LINUX_EXDEV: i32 = 18; +const LINUX_EINVAL: i32 = 22; +const LINUX_EPERM: i32 = 1; +const LINUX_ENOSYS: i32 = 38; +const LINUX_ENODATA: i32 = 61; +const LINUX_ENXIO: i32 = 6; + +fn eperm() -> io::Error { + io::Error::from_raw_os_error(LINUX_EPERM) +} + +/// Overlay that injects virtual inodes into an inner `FileSystem`. +pub struct AugmentFs { + inner: T, + /// Maps (name in root dir) → virtual inode number. One-shot entries + /// are removed on first lookup so the file can only be opened once. + name_to_inode: RwLock>, + /// Maps virtual inode number → file data. One-shot entries are removed + /// from this map on release. + inodes: RwLock>, +} + +impl> AugmentFs { + /// Create a new overlay. 
+ /// + /// `entries` are registered as virtual inodes in the root directory. + /// Inode numbers are obtained from `inode_alloc`, the same allocator + /// used by the inner filesystem. + pub fn new(inner: T, inode_alloc: &InodeAllocator, entries: Vec) -> Self { + let mut name_to_inode = HashMap::with_capacity(entries.len()); + let mut inodes = HashMap::with_capacity(entries.len()); + + for entry in entries { + let inode = inode_alloc.next(); + name_to_inode.insert(entry.name, inode); + inodes.insert(inode, entry.file); + } + + Self { + inner, + name_to_inode: RwLock::new(name_to_inode), + inodes: RwLock::new(inodes), + } + } + + fn is_virtual(&self, inode: Inode) -> bool { + self.inodes.read().unwrap().contains_key(&inode) + } +} + +impl> FileSystem for AugmentFs { + type Inode = Inode; + type Handle = Handle; + + fn init(&self, capable: FsOptions) -> io::Result { + self.inner.init(capable) + } + + fn destroy(&self) { + self.inner.destroy() + } + + fn lookup(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result { + if parent == fuse::ROOT_ID { + let inode = self.name_to_inode.read().unwrap().get(name).copied(); + if let Some(inode) = inode { + let inodes = self.inodes.read().unwrap(); + if let Some(file) = inodes.get(&inode) { + let one_shot = file.one_shot; + let st = file.stat(inode); + let entry_timeout = if one_shot { + Duration::ZERO + } else { + VIRTUAL_TIMEOUT + }; + + // One-shot: remove name so subsequent lookups fall + // through to the inner filesystem (or return ENOENT). 
+ if one_shot { + // Drop the read lock first, before locking for write + drop(inodes); + self.name_to_inode.write().unwrap().remove(name); + } + + return Ok(Entry { + inode, + generation: 0, + attr: st, + attr_flags: 0, + attr_timeout: VIRTUAL_TIMEOUT, + entry_timeout, + }); + } + } + } + self.inner.lookup(ctx, parent, name) + } + + fn forget(&self, ctx: Context, inode: Inode, count: u64) { + if !self.is_virtual(inode) { + self.inner.forget(ctx, inode, count) + } + } + + fn batch_forget(&self, ctx: Context, requests: Vec<(Inode, u64)>) { + let real: Vec<_> = requests + .into_iter() + .filter(|(ino, _)| !self.is_virtual(*ino)) + .collect(); + if !real.is_empty() { + self.inner.batch_forget(ctx, real); + } + } + + fn getattr( + &self, + ctx: Context, + inode: Inode, + handle: Option, + ) -> io::Result<(bindings::stat64, Duration)> { + { + let inodes = self.inodes.read().unwrap(); + if let Some(file) = inodes.get(&inode) { + let st = file.stat(inode); + return Ok((st, VIRTUAL_TIMEOUT)); + } + } + self.inner.getattr(ctx, inode, handle) + } + + fn setattr( + &self, + ctx: Context, + inode: Inode, + attr: bindings::stat64, + handle: Option, + valid: SetattrValid, + ) -> io::Result<(bindings::stat64, Duration)> { + if self.is_virtual(inode) { + return Err(eperm()); + } + self.inner.setattr(ctx, inode, attr, handle, valid) + } + + fn readlink(&self, ctx: Context, inode: Inode) -> io::Result> { + if self.is_virtual(inode) { + return Err(io::Error::from_raw_os_error(LINUX_EINVAL)); + } + self.inner.readlink(ctx, inode) + } + + fn symlink( + &self, + ctx: Context, + linkname: &CStr, + parent: Inode, + name: &CStr, + extensions: Extensions, + ) -> io::Result { + self.inner.symlink(ctx, linkname, parent, name, extensions) + } + + fn mknod( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + mode: u32, + rdev: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result { + self.inner + .mknod(ctx, inode, name, mode, rdev, umask, extensions) + } + + fn mkdir( + &self, + 
ctx: Context, + parent: Inode, + name: &CStr, + mode: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result { + .map_err(|_| io::Error::from_raw_os_error(LINUX_EINVAL))?; + if off >= data.len() { return Ok(0); + } + let remaining = file.data.len() - off; + let len = remaining.min(size as usize); + return w.write(&file.data[off..(off + len)]); + } + } + self.inner + .read(ctx, inode, handle, w, size, offset, lock_owner, flags) + } + + fn write( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + r: R, + size: u32, + offset: u64, + lock_owner: Option, + delayed_write: bool, + kill_priv: bool, + flags: u32, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(eperm()); + } + self.inner.write( + ctx, + inode, + handle, + r, + size, + offset, + lock_owner, + delayed_write, + kill_priv, + flags, + ) + } + + fn flush(&self, ctx: Context, inode: Inode, handle: Handle, lock_owner: u64) -> io::Result<()> { + if self.is_virtual(inode) { + return Ok(()); + } + self.inner.flush(ctx, inode, handle, lock_owner) + } + + fn fsync(&self, ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> { + if self.is_virtual(inode) { + return Ok(()); + } + self.inner.fsync(ctx, inode, datasync, handle) + } + + fn fallocate( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + mode: u32, + offset: u64, + length: u64, + ) -> io::Result<()> { + if self.is_virtual(inode) { + return Err(eperm()); + } + self.inner + .fallocate(ctx, inode, handle, mode, offset, length) + } + + fn release( + &self, + ctx: Context, + inode: Inode, + flags: u32, + handle: Handle, + flush: bool, + flock_release: bool, + lock_owner: Option, + ) -> io::Result<()> { + { + let mut inodes = self.inodes.write().unwrap(); + if let Some(file) = inodes.get(&inode) { + if file.one_shot { + inodes.remove(&inode); + } + return Ok(()); + } + } + self.inner + .release(ctx, inode, flags, handle, flush, flock_release, lock_owner) + } + + fn statfs(&self, ctx: Context, inode: 
Inode) -> io::Result { + self.inner.statfs(ctx, inode) + } + + fn getxattr( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + size: u32, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(io::Error::from_raw_os_error(LINUX_ENODATA)); + } + self.inner.getxattr(ctx, inode, name, size) + } + + fn listxattr(&self, ctx: Context, inode: Inode, size: u32) -> io::Result { + if self.is_virtual(inode) { + if size == 0 { + return Ok(ListxattrReply::Count(0)); + } + return Ok(ListxattrReply::Names(Vec::new())); + } + self.inner.listxattr(ctx, inode, size) + } + + fn setxattr( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + value: &[u8], + flags: u32, + ) -> io::Result<()> { + if self.is_virtual(inode) { + return Err(eperm()); + } + self.inner.setxattr(ctx, inode, name, value, flags) + } + + fn removexattr(&self, ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> { + if self.is_virtual(inode) { + return Err(eperm()); + } + self.inner.removexattr(ctx, inode, name) + } + + fn opendir( + &self, + ctx: Context, + inode: Inode, + flags: u32, + ) -> io::Result<(Option, OpenOptions)> { + self.inner.opendir(ctx, inode, flags) + } + + fn readdir( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + size: u32, + offset: u64, + add_entry: F, + ) -> io::Result<()> + where + F: FnMut(DirEntry) -> io::Result, + { + self.inner + .readdir(ctx, inode, handle, size, offset, add_entry) + } + + fn readdirplus( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + size: u32, + offset: u64, + add_entry: F, + ) -> io::Result<()> + where + F: FnMut(DirEntry, Entry) -> io::Result, + { + self.inner + .readdirplus(ctx, inode, handle, size, offset, add_entry) + } + + fn fsyncdir( + &self, + ctx: Context, + inode: Inode, + datasync: bool, + handle: Handle, + ) -> io::Result<()> { + self.inner.fsyncdir(ctx, inode, datasync, handle) + } + + fn releasedir(&self, ctx: Context, inode: Inode, flags: u32, handle: Handle) -> io::Result<()> { + 
self.inner.releasedir(ctx, inode, flags, handle) + } + + fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> { + if self.is_virtual(inode) { + if mask & (libc::W_OK as u32) != 0 { + return Err(io::Error::from_raw_os_error(LINUX_EACCES)); + } + return Ok(()); + } + self.inner.access(ctx, inode, mask) + } + + fn lseek( + &self, + ctx: Context, + inode: Inode, + _handle: Handle, + offset: u64, + whence: u32, + ) -> io::Result { + { + let inodes = self.inodes.read().unwrap(); + if let Some(file) = inodes.get(&inode) { + let size = file.data.len() as u64; + // FUSE lseek is only called for SEEK_DATA/SEEK_HOLE. + return match whence as i32 { + libc::SEEK_DATA => { + if offset < size { + Ok(offset) + } else { + Err(io::Error::from_raw_os_error(LINUX_ENXIO)) + } + } + libc::SEEK_HOLE => { + if offset < size { + Ok(size) + } else { + Err(io::Error::from_raw_os_error(LINUX_ENXIO)) + } + } + _ => Err(io::Error::from_raw_os_error(LINUX_EINVAL)), + }; + } + } + self.inner.lseek(ctx, inode, _handle, offset, whence) + } + + fn copyfilerange( + &self, + ctx: Context, + inode_in: Inode, + handle_in: Handle, + offset_in: u64, + inode_out: Inode, + handle_out: Handle, + offset_out: u64, + len: u64, + flags: u64, + ) -> io::Result { + // Virtual inodes don't have real file descriptors, so copy_file_range + // cannot work. Return EXDEV to tell the kernel to fall back to + // read+write. 
+ if self.is_virtual(inode_in) || self.is_virtual(inode_out) { + return Err(io::Error::from_raw_os_error(LINUX_EXDEV)); + } + self.inner.copyfilerange( + ctx, inode_in, handle_in, offset_in, inode_out, handle_out, offset_out, len, flags, + ) + } + + fn setupmapping( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + foffset: u64, + len: u64, + flags: u64, + moffset: u64, + host_shm_base: u64, + shm_size: u64, + #[cfg(target_os = "macos")] map_sender: &Option>, + ) -> io::Result<()> { + { + let inodes = self.inodes.read().unwrap(); + if let Some(file) = inodes.get(&inode) { + #[cfg(target_os = "linux")] + { + if (moffset + len) > shm_size { + return Err(io::Error::from_raw_os_error(LINUX_EINVAL)); + } + + let addr = host_shm_base + moffset; + let ret = unsafe { + libc::mmap( + addr as *mut libc::c_void, + len as usize, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_FIXED, + -1, + 0, + ) + }; + if std::ptr::eq(ret, libc::MAP_FAILED) { + return Err(io::Error::last_os_error()); + } + + let foff = foffset as usize; + if foff < file.data.len() { + let available = file.data.len() - foff; + let to_copy = (len as usize).min(available); + unsafe { + libc::memcpy( + addr as *mut libc::c_void, + file.data.as_ptr().add(foff) as *const _, + to_copy, + ) + }; + } + + return Ok(()); + } + + // TODO: implement DAX for virtual files on macOS using + // the ShmRegionManager once it exists (see dax-window-layering task). 
+ #[cfg(target_os = "macos")] + { + let _ = data; + return Err(io::Error::from_raw_os_error(LINUX_ENOSYS)); } + } + } + self.inner.setupmapping( + ctx, + inode, + handle, + foffset, + len, + flags, + moffset, + host_shm_base, + shm_size, + #[cfg(target_os = "macos")] + map_sender, + ) + } + + fn removemapping( + &self, + ctx: Context, + requests: Vec, + host_shm_base: u64, + shm_size: u64, + #[cfg(target_os = "macos")] map_sender: &Option>, + ) -> io::Result<()> { + self.inner.removemapping( + ctx, + requests, + host_shm_base, + shm_size, + #[cfg(target_os = "macos")] + map_sender, + ) + } + + fn ioctl( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + flags: u32, + cmd: u32, + arg: u64, + in_size: u32, + out_size: u32, + exit_code: &Arc, + ) -> io::Result> { + // Always delegate: the exit-code and root-dir-removal ioctls are + // dispatched by command number, not by inode. + self.inner.ioctl( + ctx, inode, handle, flags, cmd, arg, in_size, out_size, exit_code, + ) + } +} diff --git a/src/devices/src/virtio/fs/device.rs b/src/devices/src/virtio/fs/device.rs index bc877bc24..f870c2f08 100644 --- a/src/devices/src/virtio/fs/device.rs +++ b/src/devices/src/virtio/fs/device.rs @@ -17,6 +17,7 @@ use super::super::{ VirtioShmRegion, }; use super::passthrough; +use super::virtual_inode::VirtualEntry; use super::worker::FsWorker; use super::ExportTable; use super::{defs, defs::uapi}; @@ -48,6 +49,7 @@ pub struct Fs { shm_region: Option, passthrough_cfg: passthrough::Config, read_only: bool, + virtual_entries: Vec, worker_thread: Option>, worker_stopfd: EventFd, exit_code: Arc, @@ -62,6 +64,7 @@ impl Fs { exit_code: Arc, allow_root_dir_delete: bool, read_only: bool, + virtual_entries: Vec, ) -> super::Result { let avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX); @@ -84,6 +87,7 @@ impl Fs { shm_region: None, passthrough_cfg: fs_cfg, read_only, + virtual_entries, worker_thread: None, worker_stopfd: 
EventFd::new(EFD_NONBLOCK).map_err(FsError::EventFd)?, exit_code, @@ -180,6 +184,7 @@ impl VirtioDevice for Fs { queue_evts.push(dq.event); } + let virtual_entries = std::mem::take(&mut self.virtual_entries); let worker = FsWorker::new( worker_queues, queue_evts, @@ -188,6 +193,7 @@ impl VirtioDevice for Fs { self.shm_region.clone(), self.passthrough_cfg.clone(), self.read_only, + virtual_entries, self.worker_stopfd.try_clone().unwrap(), self.exit_code.clone(), #[cfg(target_os = "macos")] diff --git a/src/devices/src/virtio/fs/linux/passthrough.rs b/src/devices/src/virtio/fs/linux/passthrough.rs index abda1ce53..08da133f0 100644 --- a/src/devices/src/virtio/fs/linux/passthrough.rs +++ b/src/devices/src/virtio/fs/linux/passthrough.rs @@ -32,9 +32,6 @@ const CURRENT_DIR_CSTR: &[u8] = b".\0"; const PARENT_DIR_CSTR: &[u8] = b"..\0"; const EMPTY_CSTR: &[u8] = b"\0"; const PROC_CSTR: &[u8] = b"/proc/self/fd\0"; -const INIT_CSTR: &[u8] = b"init.krun\0"; - -static INIT_BINARY: &[u8] = init_blob::INIT_BINARY; type Inode = u64; type Handle = u64; @@ -360,13 +357,11 @@ pub struct PassthroughFs { // do with an fd opened with this flag. inodes: RwLock>>, inode_alloc: Arc, - init_inode: u64, // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be // used for reading and writing data. handles: RwLock>>, next_handle: AtomicU64, - init_handle: u64, // File descriptor pointing to the `/proc/self/fd` directory. This is used to convert an fd from // `inodes` into one that can go into `handles`. 
This is accomplished by reading the @@ -440,11 +435,9 @@ impl PassthroughFs { Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), inode_alloc, - init_inode: fuse::ROOT_ID + 1, handles: RwLock::new(BTreeMap::new()), next_handle: AtomicU64::new(1), - init_handle: 0, proc_self_fd, @@ -993,25 +986,7 @@ impl FileSystem for PassthroughFs { fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result { debug!("do_lookup: {name:?}"); - let init_name = unsafe { CStr::from_bytes_with_nul_unchecked(INIT_CSTR) }; - - if self.init_inode != 0 && name == init_name { - let mut st: libc::stat64 = unsafe { mem::zeroed() }; - st.st_size = INIT_BINARY.len() as i64; - st.st_ino = self.init_inode; - st.st_mode = 0o100_755; - - Ok(Entry { - inode: self.init_inode, - generation: 0, - attr: st, - attr_flags: 0, - attr_timeout: self.cfg.attr_timeout, - entry_timeout: self.cfg.entry_timeout, - }) - } else { - self.do_lookup(parent, name) - } + self.do_lookup(parent, name) } fn forget(&self, _ctx: Context, inode: Inode, count: u64) { @@ -1130,11 +1105,7 @@ impl FileSystem for PassthroughFs { kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { - if inode == self.init_inode { - Ok((Some(self.init_handle), OpenOptions::empty())) - } else { - self.do_open(inode, kill_priv, flags) - } + self.do_open(inode, kill_priv, flags) } fn release( @@ -1235,16 +1206,6 @@ impl FileSystem for PassthroughFs { _flags: u32, ) -> io::Result { debug!("read: {inode:?}"); - if inode == self.init_inode { - let off: usize = offset.try_into().map_err(|_| einval())?; - let len = if off + (size as usize) < INIT_BINARY.len() { - size as usize - } else { - INIT_BINARY.len() - off - }; - return w.write(&INIT_BINARY[off..(off + len)]); - } - let data = self .handles .read() @@ -1825,10 +1786,6 @@ impl FileSystem for PassthroughFs { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } - if inode == self.init_inode { - return Err(io::Error::from_raw_os_error(libc::ENODATA)); - } 
- let mut buf = vec![0; size as usize]; // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we @@ -2088,36 +2045,6 @@ impl FileSystem for PassthroughFs { debug!("setupmapping: ino {inode:?} addr={addr:x} len={len}"); - if inode == self.init_inode { - let ret = unsafe { - libc::mmap( - addr as *mut libc::c_void, - len as usize, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_FIXED, - -1, - 0, - ) - }; - if std::ptr::eq(ret, libc::MAP_FAILED) { - return Err(io::Error::last_os_error()); - } - - let to_copy = if len as usize > INIT_BINARY.len() { - INIT_BINARY.len() - } else { - len as usize - }; - unsafe { - libc::memcpy( - addr as *mut libc::c_void, - INIT_BINARY.as_ptr() as *const _, - to_copy, - ) - }; - return Ok(()); - } - let file = self.open_inode(inode, open_flags)?; let fd = file.as_raw_fd(); diff --git a/src/devices/src/virtio/fs/macos/passthrough.rs b/src/devices/src/virtio/fs/macos/passthrough.rs index 3d27aec7f..d1a862d0c 100644 --- a/src/devices/src/virtio/fs/macos/passthrough.rs +++ b/src/devices/src/virtio/fs/macos/passthrough.rs @@ -32,14 +32,11 @@ use super::super::fuse; use super::super::inode_alloc::InodeAllocator; use super::super::multikey::MultikeyBTreeMap; -const INIT_CSTR: &[u8] = b"init.krun\0"; const XATTR_KEY: &[u8] = b"user.containers.override_stat\0"; const SECURITY_CAPABILITY: &[u8] = b"security.capability\0"; const UID_MAX: u32 = u32::MAX - 1; -static INIT_BINARY: &[u8] = init_blob::INIT_BINARY; - type Inode = u64; type Handle = u64; @@ -545,11 +542,9 @@ impl Default for Config { pub struct PassthroughFs { inodes: RwLock>>, inode_alloc: Arc, - init_inode: u64, handles: RwLock>>, next_handle: AtomicU64, - init_handle: u64, map_windows: Mutex>, @@ -581,11 +576,9 @@ impl PassthroughFs { Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), inode_alloc, - init_inode: fuse::ROOT_ID + 1, handles: RwLock::new(BTreeMap::new()), next_handle: 
AtomicU64::new(1), - init_handle: 0, map_windows: Mutex::new(HashMap::new()), @@ -1202,25 +1195,7 @@ impl FileSystem for PassthroughFs { fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result { debug!("lookup: {name:?}"); - let _init_name = unsafe { CStr::from_bytes_with_nul_unchecked(INIT_CSTR) }; - - if self.init_inode != 0 && name == _init_name { - let mut st: bindings::stat64 = unsafe { mem::zeroed() }; - st.st_size = INIT_BINARY.len() as i64; - st.st_ino = self.init_inode; - st.st_mode = 0o100_755; - - Ok(Entry { - inode: self.init_inode, - generation: 0, - attr: st, - attr_flags: 0, - attr_timeout: self.cfg.attr_timeout, - entry_timeout: self.cfg.entry_timeout, - }) - } else { - self.do_lookup(parent, name) - } + self.do_lookup(parent, name) } fn forget(&self, _ctx: Context, inode: Inode, count: u64) { @@ -1340,11 +1315,7 @@ impl FileSystem for PassthroughFs { kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { - if inode == self.init_inode { - Ok((Some(self.init_handle), OpenOptions::empty())) - } else { - self.do_open(inode, kill_priv, flags) - } + self.do_open(inode, kill_priv, flags) } fn release( @@ -1457,18 +1428,6 @@ impl FileSystem for PassthroughFs { _flags: u32, ) -> io::Result { debug!("read: {inode:?}"); - if inode == self.init_inode { - let off: usize = offset - .try_into() - .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?; - let len = if off + (size as usize) < INIT_BINARY.len() { - size as usize - } else { - INIT_BINARY.len() - off - }; - return w.write(&INIT_BINARY[off..(off + len)]); - } - let data = self .handles .read() @@ -2054,10 +2013,6 @@ impl FileSystem for PassthroughFs { return Err(linux_error(io::Error::from_raw_os_error(libc::ENOSYS))); } - if inode == self.init_inode { - return Err(linux_error(io::Error::from_raw_os_error(libc::ENODATA))); - } - if name.to_bytes() == XATTR_KEY { return Err(linux_error(io::Error::from_raw_os_error(libc::EACCES))); } diff --git 
a/src/devices/src/virtio/fs/mod.rs b/src/devices/src/virtio/fs/mod.rs index 179535131..3f36d5f05 100644 --- a/src/devices/src/virtio/fs/mod.rs +++ b/src/devices/src/virtio/fs/mod.rs @@ -7,6 +7,8 @@ mod inode_alloc; mod multikey; mod read_only; mod server; +mod augment_fs; +pub mod virtual_inode; mod worker; #[cfg(target_os = "linux")] diff --git a/src/devices/src/virtio/fs/virtual_inode.rs b/src/devices/src/virtio/fs/virtual_inode.rs new file mode 100644 index 000000000..8a4f6b195 --- /dev/null +++ b/src/devices/src/virtio/fs/virtual_inode.rs @@ -0,0 +1,37 @@ +// Virtual inode types for the virtiofs overlay. +// +// A `VirtualFile` represents a read-only file backed by static data that is +// injected into the guest filesystem without any corresponding host file. + +use std::ffi::CString; +use std::mem; + +use crate::virtio::bindings; + +/// A read-only virtual file backed by a static byte slice. +pub struct VirtualFile { + pub data: &'static [u8], + pub mode: u32, + /// If true, the file can only be looked up once. + pub one_shot: bool, +} + +impl VirtualFile { + /// Synthesize a stat result for this virtual file. + pub fn stat(&self, inode: u64) -> bindings::stat64 { + let mut st: bindings::stat64 = unsafe { mem::zeroed() }; + st.st_ino = inode; + st.st_size = self.data.len() as i64; + st.st_mode = self.mode as _; + st.st_nlink = 1; + st.st_blksize = 4096; + st.st_blocks = ((self.data.len() as i64) + 511) / 512; + st + } +} + +/// An entry to register as a virtual inode in the root directory. 
+pub struct VirtualEntry { + pub name: CString, + pub file: VirtualFile, +} diff --git a/src/devices/src/virtio/fs/worker.rs b/src/devices/src/virtio/fs/worker.rs index e554aa377..3d19c1dba 100644 --- a/src/devices/src/virtio/fs/worker.rs +++ b/src/devices/src/virtio/fs/worker.rs @@ -20,11 +20,13 @@ use super::inode_alloc::InodeAllocator; use super::passthrough::{self, PassthroughFs}; use super::read_only::PassthroughFsRo; use super::server::Server; +use super::augment_fs::AugmentFs; +use super::virtual_inode::VirtualEntry; use crate::virtio::{InterruptTransport, VirtioShmRegion}; enum FsServer { - ReadWrite(Server), - ReadOnly(Server), + ReadWrite(Server>), + ReadOnly(Server>), } impl FsServer { @@ -80,21 +82,26 @@ impl FsWorker { shm_region: Option, passthrough_cfg: passthrough::Config, read_only: bool, + virtual_entries: Vec, stop_fd: EventFd, exit_code: Arc, #[cfg(target_os = "macos")] map_sender: Option>, ) -> Result { let inode_alloc = Arc::new(InodeAllocator::new()); let server = if read_only { - FsServer::ReadOnly(Server::new(PassthroughFsRo::new( - passthrough_cfg, - inode_alloc, - )?)) + let inner = PassthroughFsRo::new(passthrough_cfg, inode_alloc.clone())?; + FsServer::ReadOnly(Server::new(AugmentFs::new( + inner, + &inode_alloc, + virtual_entries, + ))) } else { - FsServer::ReadWrite(Server::new(PassthroughFs::new( - passthrough_cfg, - inode_alloc, - )?)) + let inner = PassthroughFs::new(passthrough_cfg, inode_alloc.clone())?; + FsServer::ReadWrite(Server::new(AugmentFs::new( + inner, + &inode_alloc, + virtual_entries, + ))) }; Ok(Self { queues, diff --git a/src/libkrun/Cargo.toml b/src/libkrun/Cargo.toml index 24db7a9ff..3aa4402de 100644 --- a/src/libkrun/Cargo.toml +++ b/src/libkrun/Cargo.toml @@ -32,6 +32,7 @@ krun_display = { package = "krun-display", version = "0.1.0", path = "../display krun_input = { package = "krun-input", version = "0.1.0", path = "../input", optional = true, features = ["bindgen_clang_runtime"] } devices = { package = 
"krun-devices", version = "=0.1.0-1.18.0", path = "../devices" } +init-blob = { path = "../init-blob" } polly = { package = "krun-polly", version = "=0.1.0-1.18.0", path = "../polly" } utils = { package = "krun-utils", version = "=0.1.0-1.18.0", path = "../utils" } vmm = { package = "krun-vmm", version = "=0.1.0-1.18.0", path = "../vmm" } diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 1d8b3fcb1..bc59c60f8 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -14,6 +14,8 @@ use env_logger::{Env, Target}; #[cfg(feature = "gpu")] use krun_display::DisplayBackend; +#[cfg(not(feature = "tee"))] +use devices::virtio::fs::virtual_inode::{VirtualEntry, VirtualFile}; use libc::{c_char, c_int, size_t}; use once_cell::sync::Lazy; use polly::event_manager::EventManager; @@ -23,7 +25,6 @@ use std::collections::hash_map::Entry; use std::collections::HashMap; use std::convert::TryInto; use std::env; -#[cfg(target_os = "linux")] use std::ffi::CString; use std::ffi::{c_void, CStr}; use std::fs::File; @@ -90,6 +91,20 @@ static KRUN_NITRO_DEBUG: Mutex = Mutex::new(false); // Path to the init binary to be executed inside the VM. 
const INIT_PATH: &str = "/init.krun"; +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +const DEFAULT_INIT_PAYLOAD: &[u8] = init_blob::INIT_BINARY; + +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +fn init_virtual_entry() -> VirtualEntry { + VirtualEntry { + name: std::ffi::CString::new("init.krun").unwrap(), + file: VirtualFile { + data: DEFAULT_INIT_PAYLOAD, + mode: 0o100_755, + one_shot: true, + }, + } +} static KRUNFW: LazyLock> = LazyLock::new(|| unsafe { libloading::Library::new(KRUNFW_NAME).ok() }); @@ -599,6 +614,7 @@ pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) shm_size: Some(1 << 29), allow_root_dir_delete: false, read_only: false, + virtual_entries: vec![init_virtual_entry()], }); } Entry::Vacant(_) => return -libc::ENOENT, @@ -665,12 +681,17 @@ pub unsafe extern "C" fn krun_add_virtiofs3( match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { let cfg = ctx_cfg.get_mut(); + let mut virtual_entries = Vec::new(); + if tag == "/dev/root" { + virtual_entries.push(init_virtual_entry()); + } cfg.vmr.add_fs_device(FsDeviceConfig { fs_id: tag.to_string(), shared_dir: path.to_string(), shm_size: shm, allow_root_dir_delete: false, read_only, + virtual_entries, }); } Entry::Vacant(_) => return -libc::ENOENT, @@ -2415,6 +2436,7 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( shm_size: Some(1 << 29), allow_root_dir_delete: true, read_only: false, + virtual_entries: vec![init_virtual_entry()], }); ctx_cfg.set_block_root(device, fstype, options); @@ -2817,7 +2839,7 @@ pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { let (sender, _receiver) = unbounded(); let _vmm = match vmm::builder::build_microvm( - &ctx_cfg.vmr, + &mut ctx_cfg.vmr, &mut event_manager, ctx_cfg.shutdown_efd, sender, diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 1aa9c5c48..49a39c009 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -571,7 +571,7 @@ fn 
choose_payload(vm_resources: &VmResources) -> Result, _sender: Sender, @@ -1064,7 +1064,7 @@ pub fn build_microvm( #[cfg(not(any(feature = "tee", feature = "aws-nitro")))] attach_fs_devices( &mut vmm, - &vm_resources.fs, + std::mem::take(&mut vm_resources.fs), &mut _shm_manager, #[cfg(not(feature = "tee"))] export_table, @@ -2040,7 +2040,7 @@ fn attach_mmio_device( #[cfg(not(any(feature = "tee", feature = "aws-nitro")))] fn attach_fs_devices( vmm: &mut Vmm, - fs_devs: &[FsDeviceConfig], + fs_devs: Vec, shm_manager: &mut ShmManager, #[cfg(not(feature = "tee"))] export_table: Option, intc: IrqChip, @@ -2049,14 +2049,15 @@ fn attach_fs_devices( ) -> std::result::Result<(), StartMicrovmError> { use self::StartMicrovmError::*; - for (i, config) in fs_devs.iter().enumerate() { + for (i, config) in fs_devs.into_iter().enumerate() { let fs = Arc::new(Mutex::new( devices::virtio::Fs::new( - config.fs_id.clone(), - config.shared_dir.clone(), + config.fs_id, + config.shared_dir, exit_code.clone(), config.allow_root_dir_delete, config.read_only, + config.virtual_entries, ) .unwrap(), )); diff --git a/src/vmm/src/vmm_config/fs.rs b/src/vmm/src/vmm_config/fs.rs index ccf86f5cd..b4d6682c7 100644 --- a/src/vmm/src/vmm_config/fs.rs +++ b/src/vmm/src/vmm_config/fs.rs @@ -1,8 +1,10 @@ -#[derive(Clone, Debug)] +use devices::virtio::fs::virtual_inode::VirtualEntry; + pub struct FsDeviceConfig { pub fs_id: String, pub shared_dir: String, pub shm_size: Option, pub allow_root_dir_delete: bool, pub read_only: bool, + pub virtual_entries: Vec, } From d214cc8c356ddb3411d6912681b73a58ee63b5c6 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Mon, 4 May 2026 16:24:47 +0200 Subject: [PATCH 04/13] lib: add krun_disable_implicit_init() Add API to prevent the default init binary (/init.krun) from being injected into the root filesystem. Follows the existing krun_disable_implicit_{console,vsock} pattern. Must be called before krun_set_root(). 
Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- include/libkrun.h | 12 ++++++ src/libkrun/src/lib.rs | 84 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 93 insertions(+), 3 deletions(-) diff --git a/include/libkrun.h b/include/libkrun.h index c0b09e25f..18c8c0d60 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -1205,6 +1205,18 @@ int32_t krun_split_irqchip(uint32_t ctx_id, bool enable); */ int32_t krun_disable_implicit_console(uint32_t ctx_id); +/** + * Do not inject the default init binary (/init.krun) into the root + * filesystem. Must be called before krun_set_root(). + * + * Arguments: + * "ctx_id" - the configuration context ID. + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_disable_implicit_init(uint32_t ctx_id); + /** * Disable the implicit vsock device. * diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index bc59c60f8..172a97f20 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -182,6 +182,8 @@ struct ContextConfig { console_output: Option, vmm_uid: Option, vmm_gid: Option, + #[cfg(not(feature = "tee"))] + disable_implicit_init: bool, } impl ContextConfig { @@ -607,6 +609,10 @@ pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { let cfg = ctx_cfg.get_mut(); + let mut virtual_entries = Vec::new(); + if !cfg.disable_implicit_init { + virtual_entries.push(init_virtual_entry()); + } cfg.vmr.add_fs_device(FsDeviceConfig { fs_id, shared_dir, @@ -614,7 +620,7 @@ pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) shm_size: Some(1 << 29), allow_root_dir_delete: false, read_only: false, - virtual_entries: vec![init_virtual_entry()], + virtual_entries, }); } Entry::Vacant(_) => return -libc::ENOENT, @@ -682,7 +688,7 @@ pub unsafe extern "C" fn krun_add_virtiofs3( Entry::Occupied(mut ctx_cfg) => { let cfg = 
ctx_cfg.get_mut(); let mut virtual_entries = Vec::new(); - if tag == "/dev/root" { + if tag == "/dev/root" && !cfg.disable_implicit_init { virtual_entries.push(init_virtual_entry()); } cfg.vmr.add_fs_device(FsDeviceConfig { @@ -2429,6 +2435,10 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( return -libc::EINVAL; } + let mut virtual_entries = Vec::new(); + if !ctx_cfg.disable_implicit_init { + virtual_entries.push(init_virtual_entry()); + } ctx_cfg.vmr.add_fs_device(FsDeviceConfig { fs_id: "/dev/root".into(), shared_dir: empty_root.to_string_lossy().into(), @@ -2436,7 +2446,7 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( shm_size: Some(1 << 29), allow_root_dir_delete: true, read_only: false, - virtual_entries: vec![init_virtual_entry()], + virtual_entries, }); ctx_cfg.set_block_root(device, fstype, options); @@ -2447,6 +2457,19 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( KRUN_SUCCESS } +#[no_mangle] +#[cfg(not(feature = "tee"))] +pub extern "C" fn krun_disable_implicit_init(ctx_id: u32) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + ctx_cfg.get_mut().disable_implicit_init = true; + } + Entry::Vacant(_) => return -libc::ENOENT, + } + + KRUN_SUCCESS +} + #[no_mangle] pub extern "C" fn krun_disable_implicit_console(ctx_id: u32) -> i32 { match CTX_MAP.lock().unwrap().entry(ctx_id) { @@ -2896,3 +2919,58 @@ fn krun_start_enter_nitro(ctx_id: u32) -> i32 { } } } + +#[cfg(all(test, not(feature = "tee")))] +mod tests { + use super::*; + + use std::ffi::CString; + use std::ptr::null; + + static TEST_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); + + #[test] + fn root_virtiofs_injects_default_init_by_default() { + let _guard = TEST_LOCK.lock().unwrap(); + + let tag = CString::new("/dev/root").unwrap(); + let ctx = krun_create_ctx() as u32; + + unsafe { + assert_eq!(krun_add_virtiofs3(ctx, tag.as_ptr(), null(), 0, false), KRUN_SUCCESS); + } + + let ctx_map = CTX_MAP.lock().unwrap(); + let cfg = 
ctx_map.get(&ctx).unwrap(); + assert_eq!(cfg.vmr.fs.len(), 1); + assert_eq!(cfg.vmr.fs[0].virtual_entries.len(), 1); + assert_eq!(cfg.vmr.fs[0].virtual_entries[0].name.to_bytes(), b"init.krun"); + drop(ctx_map); + + assert_eq!(krun_free_ctx(ctx), KRUN_SUCCESS); + } + + #[test] + fn root_virtiofs_respects_disable_implicit_init() { + let _guard = TEST_LOCK.lock().unwrap(); + + let tag = CString::new("/dev/root").unwrap(); + let ctx = krun_create_ctx() as u32; + + assert_eq!(krun_disable_implicit_init(ctx), KRUN_SUCCESS); + unsafe { + assert_eq!(krun_add_virtiofs3(ctx, tag.as_ptr(), null(), 0, false), KRUN_SUCCESS); + } + + let ctx_map = CTX_MAP.lock().unwrap(); + let cfg = ctx_map.get(&ctx).unwrap(); + assert_eq!(cfg.vmr.fs.len(), 1); + assert!( + cfg.vmr.fs[0].virtual_entries.is_empty(), + "root virtiofs should not inject init.krun after krun_disable_implicit_init()" + ); + drop(ctx_map); + + assert_eq!(krun_free_ctx(ctx), KRUN_SUCCESS); + } +} From 6eed1e1970b1abf869d241d56f26f84dc7602c04 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Mon, 4 May 2026 16:25:48 +0200 Subject: [PATCH 05/13] lib: add krun_fs_add_overlay_file() Add C API to inject arbitrary virtual files into a virtiofs device. The file appears in the root directory of the specified mount and is backed entirely by host memory. Supports one-shot semantics (the file can only be looked up once). The data pointer follows the same lifetime contract as other krun APIs: the caller must keep the memory valid until krun_start_enter() returns. 
Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- include/libkrun.h | 24 +++++++++++++++++++ src/libkrun/src/lib.rs | 53 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/include/libkrun.h b/include/libkrun.h index 18c8c0d60..a445a1b6f 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -1217,6 +1217,30 @@ int32_t krun_disable_implicit_console(uint32_t ctx_id); */ int32_t krun_disable_implicit_init(uint32_t ctx_id); +/** + * Add a virtual overlay file to a virtiofs device. + * + * The file will appear in the root directory of the specified virtiofs + * mount and is backed entirely by host memory (no host file). The data + * pointer is NOT copied — the caller must keep the memory valid for the + * full VM lifetime. + * + * Arguments: + * "ctx_id" - the configuration context ID. + * "fs_tag" - tag of the virtiofs device (e.g. "/dev/root"). + * "filename" - name of the file in the root directory. + * "data" - pointer to the file content. + * "data_len" - length of the file content in bytes. + * "mode" - file mode bits (e.g. 0100644 for a regular file). + * "one_shot" - if true, the file can only be looked up once. + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_fs_add_overlay_file(uint32_t ctx_id, const char *fs_tag, + const char *filename, const uint8_t *data, + size_t data_len, uint32_t mode, bool one_shot); + /** * Disable the implicit vsock device. 
* diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 172a97f20..f4244a3ec 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -2470,6 +2470,59 @@ pub extern "C" fn krun_disable_implicit_init(ctx_id: u32) -> i32 { KRUN_SUCCESS } +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(not(feature = "tee"))] +pub unsafe extern "C" fn krun_fs_add_overlay_file( + ctx_id: u32, + c_fs_tag: *const c_char, + c_filename: *const c_char, + data: *const u8, + data_len: size_t, + mode: u32, + one_shot: bool, +) -> i32 { + if c_fs_tag.is_null() || c_filename.is_null() || data.is_null() || data_len == 0 { + return -libc::EINVAL; + } + + let fs_tag = match CStr::from_ptr(c_fs_tag).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + let filename = match CString::new(CStr::from_ptr(c_filename).to_bytes()) { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + // SAFETY: The caller guarantees the memory remains valid for the VM + // lifetime (see the C header contract). + let payload: &'static [u8] = slice::from_raw_parts(data, data_len); + + let entry = VirtualEntry { + name: filename, + file: VirtualFile { + data: payload, + mode, + one_shot, + }, + }; + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + match cfg.vmr.fs.iter_mut().find(|fs| fs.fs_id == fs_tag) { + Some(fs_cfg) => fs_cfg.virtual_entries.push(entry), + None => return -libc::ENOENT, + } + } + Entry::Vacant(_) => return -libc::ENOENT, + } + + KRUN_SUCCESS +} + #[no_mangle] pub extern "C" fn krun_disable_implicit_console(ctx_id: u32) -> i32 { match CTX_MAP.lock().unwrap().entry(ctx_id) { From 942d396cf706e4dc0fe02e9f524dbd56d6853c5b Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Wed, 6 May 2026 18:01:17 +0200 Subject: [PATCH 06/13] lib: add krun_get_default_init() Add API to retrieve the built-in default init binary. 
Callers that use krun_disable_implicit_init() can use this to obtain the init binary and inject it themselves via krun_fs_add_overlay_file(). Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- include/libkrun.h | 19 +++++++++++++++++++ src/libkrun/src/lib.rs | 15 +++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/include/libkrun.h b/include/libkrun.h index a445a1b6f..b2a5977d1 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -1217,6 +1217,25 @@ int32_t krun_disable_implicit_console(uint32_t ctx_id); */ int32_t krun_disable_implicit_init(uint32_t ctx_id); +/** + * Get a pointer to the built-in default init binary. + * + * This is the same binary that libkrun injects as /init.krun by default. + * Callers that use krun_disable_implicit_init() can use this to inject the + * init binary themselves (e.g. via krun_fs_add_overlay_file with custom + * settings). + * + * The returned pointer is valid for the lifetime of the process (static data). + * + * Arguments: + * "data_out" - receives a pointer to the init binary bytes. + * "len_out" - receives the length in bytes. + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_get_default_init(const uint8_t **data_out, size_t *len_out); + /** * Add a virtual overlay file to a virtiofs device. 
* diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index f4244a3ec..dfbc6edd8 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -2470,6 +2470,21 @@ pub extern "C" fn krun_disable_implicit_init(ctx_id: u32) -> i32 { KRUN_SUCCESS } +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +pub unsafe extern "C" fn krun_get_default_init( + data_out: *mut *const u8, + len_out: *mut size_t, +) -> i32 { + if data_out.is_null() || len_out.is_null() { + return -libc::EINVAL; + } + *data_out = DEFAULT_INIT_PAYLOAD.as_ptr(); + *len_out = DEFAULT_INIT_PAYLOAD.len(); + KRUN_SUCCESS +} + #[allow(clippy::missing_safety_doc)] #[no_mangle] #[cfg(not(feature = "tee"))] From 51716c1bca8f9088fa2f4c8ad2d6505e53682519 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Mon, 4 May 2026 16:55:41 +0200 Subject: [PATCH 07/13] libkrun.h: document that implicit resource creation will become opt-in Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- include/libkrun.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/libkrun.h b/include/libkrun.h index b2a5977d1..4b434bc5a 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -1191,6 +1191,13 @@ int32_t krun_get_max_vcpus(void); */ int32_t krun_split_irqchip(uint32_t ctx_id, bool enable); +/* + * NOTE: Implicit resource creation is a legacy convenience. The 2.0 API + * (see https://github.com/containers/libkrun/issues/634) will not create + * any implicit resources. Callers should start using the + * krun_disable_implicit_* functions now to ease migration. + */ + /* * Do not create an implicit console device in the guest. By using this API, * libkrun will create zero console devices on behalf of the user. 
Any From 24caab513ff8d6ae50c0efee1da546e3b33f6b2e Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 5 May 2026 18:30:38 +0200 Subject: [PATCH 08/13] virtio/fs: add NullFs, a minimal empty-root FileSystem NullFs implements the FileSystem trait with just an empty root directory. It can be wrapped with AugmentFs to serve virtual files without any host directory involvement. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- src/devices/src/virtio/fs/mod.rs | 1 + src/devices/src/virtio/fs/null_fs.rs | 44 ++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 src/devices/src/virtio/fs/null_fs.rs diff --git a/src/devices/src/virtio/fs/mod.rs b/src/devices/src/virtio/fs/mod.rs index 3f36d5f05..6868a2495 100644 --- a/src/devices/src/virtio/fs/mod.rs +++ b/src/devices/src/virtio/fs/mod.rs @@ -5,6 +5,7 @@ pub mod fuse; mod inode_alloc; #[allow(dead_code)] mod multikey; +mod null_fs; mod read_only; mod server; mod augment_fs; diff --git a/src/devices/src/virtio/fs/null_fs.rs b/src/devices/src/virtio/fs/null_fs.rs new file mode 100644 index 000000000..f2dd93a68 --- /dev/null +++ b/src/devices/src/virtio/fs/null_fs.rs @@ -0,0 +1,44 @@ +// A minimal filesystem that serves an empty root directory. +// +// Used with AugmentFs to provide a virtual-only filesystem (e.g. for +// booting from a block device where the virtiofs root only needs init.krun). + +use std::io; +use std::mem; +use std::time::Duration; + +use super::filesystem::{Context, FileSystem, FsOptions}; +use super::fuse; +use crate::virtio::bindings; + +/// An empty filesystem with just a root directory and nothing in it. 
+pub struct NullFs; + +type Inode = u64; +type Handle = u64; + +impl FileSystem for NullFs { + type Inode = Inode; + type Handle = Handle; + + fn init(&self, _capable: FsOptions) -> io::Result { + Ok(FsOptions::empty()) + } + + fn getattr( + &self, + _ctx: Context, + inode: Inode, + _handle: Option, + ) -> io::Result<(bindings::stat64, Duration)> { + if inode == fuse::ROOT_ID { + let mut st: bindings::stat64 = unsafe { mem::zeroed() }; + st.st_ino = fuse::ROOT_ID; + st.st_mode = libc::S_IFDIR | 0o755; + st.st_nlink = 2; + st.st_blksize = 4096; + return Ok((st, Duration::from_secs(86400))); + } + Err(io::Error::from_raw_os_error(libc::ENOENT)) + } +} From 1b942149646b5237fa2200d1090513f0e21e0a8a Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 5 May 2026 18:30:48 +0200 Subject: [PATCH 09/13] lib: rewrite krun_set_root_disk_remount to use newly introduced NullFs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit krun_set_root_disk_remount no longer creates a temporary empty host directory. Instead it configures a NullFs-backed virtiofs device (shared_dir: None) with init.krun overlaid via AugmentFs. Fs::new() now accepts Option<String> for shared_dir — None selects NullFs. FsDeviceConfig::shared_dir becomes optional accordingly, and FsServer gains a matching Null variant.
Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- src/devices/src/virtio/fs/device.rs | 22 ++++++++------- src/devices/src/virtio/fs/worker.rs | 44 ++++++++++++++++++++--------- src/libkrun/src/lib.rs | 37 +++++++++++------------- src/vmm/src/builder.rs | 1 - src/vmm/src/vmm_config/fs.rs | 5 ++-- 5 files changed, 63 insertions(+), 46 deletions(-) diff --git a/src/devices/src/virtio/fs/device.rs b/src/devices/src/virtio/fs/device.rs index f870c2f08..b01a296e7 100644 --- a/src/devices/src/virtio/fs/device.rs +++ b/src/devices/src/virtio/fs/device.rs @@ -47,7 +47,7 @@ pub struct Fs { device_state: DeviceState, config: VirtioFsConfig, shm_region: Option, - passthrough_cfg: passthrough::Config, + passthrough_cfg: Option, read_only: bool, virtual_entries: Vec, worker_thread: Option>, @@ -60,9 +60,8 @@ pub struct Fs { impl Fs { pub fn new( fs_id: String, - shared_dir: String, + shared_dir: Option, exit_code: Arc, - allow_root_dir_delete: bool, read_only: bool, virtual_entries: Vec, ) -> super::Result { @@ -73,11 +72,10 @@ impl Fs { config.tag[..tag.len()].copy_from_slice(tag.as_slice()); config.num_request_queues = 1; - let fs_cfg = passthrough::Config { - root_dir: shared_dir, - allow_root_dir_delete, + let fs_cfg = shared_dir.map(|root_dir| passthrough::Config { + root_dir, ..Default::default() - }; + }); Ok(Fs { avail_features, @@ -107,10 +105,14 @@ impl Fs { pub fn set_export_table(&mut self, export_table: ExportTable) -> u64 { static FS_UNIQUE_ID: AtomicU64 = AtomicU64::new(0); - self.passthrough_cfg.export_fsid = FS_UNIQUE_ID.fetch_add(1, Ordering::Relaxed); - self.passthrough_cfg.export_table = Some(export_table); + let cfg = self + .passthrough_cfg + .as_mut() + .expect("export_table requires a passthrough filesystem"); + cfg.export_fsid = FS_UNIQUE_ID.fetch_add(1, Ordering::Relaxed); + cfg.export_table = Some(export_table); - self.passthrough_cfg.export_fsid + cfg.export_fsid } #[cfg(target_os = "macos")] diff --git 
a/src/devices/src/virtio/fs/worker.rs b/src/devices/src/virtio/fs/worker.rs index 3d19c1dba..2809cb89c 100644 --- a/src/devices/src/virtio/fs/worker.rs +++ b/src/devices/src/virtio/fs/worker.rs @@ -17,6 +17,7 @@ use super::super::{FsError, Queue}; use super::defs::{HPQ_INDEX, REQ_INDEX}; use super::descriptor_utils::{Reader, Writer}; use super::inode_alloc::InodeAllocator; +use super::null_fs::NullFs; use super::passthrough::{self, PassthroughFs}; use super::read_only::PassthroughFsRo; use super::server::Server; @@ -27,6 +28,7 @@ use crate::virtio::{InterruptTransport, VirtioShmRegion}; enum FsServer { ReadWrite(Server>), ReadOnly(Server>), + Null(Server>), } impl FsServer { @@ -55,6 +57,14 @@ impl FsServer { #[cfg(target_os = "macos")] map_sender, ), + FsServer::Null(s) => s.handle_message( + r, + w, + shm_region, + exit_code, + #[cfg(target_os = "macos")] + map_sender, + ), } } } @@ -80,7 +90,7 @@ impl FsWorker { interrupt: InterruptTransport, mem: GuestMemoryMmap, shm_region: Option, - passthrough_cfg: passthrough::Config, + passthrough_cfg: Option, read_only: bool, virtual_entries: Vec, stop_fd: EventFd, @@ -88,20 +98,28 @@ impl FsWorker { #[cfg(target_os = "macos")] map_sender: Option>, ) -> Result { let inode_alloc = Arc::new(InodeAllocator::new()); - let server = if read_only { - let inner = PassthroughFsRo::new(passthrough_cfg, inode_alloc.clone())?; - FsServer::ReadOnly(Server::new(AugmentFs::new( - inner, - &inode_alloc, - virtual_entries, - ))) - } else { - let inner = PassthroughFs::new(passthrough_cfg, inode_alloc.clone())?; - FsServer::ReadWrite(Server::new(AugmentFs::new( - inner, + let server = match passthrough_cfg { + Some(cfg) if read_only => { + let inner = PassthroughFsRo::new(cfg, inode_alloc.clone())?; + FsServer::ReadOnly(Server::new(AugmentFs::new( + inner, + &inode_alloc, + virtual_entries, + ))) + } + Some(cfg) => { + let inner = PassthroughFs::new(cfg, inode_alloc.clone())?; + FsServer::ReadWrite(Server::new(AugmentFs::new( + inner, + 
&inode_alloc, + virtual_entries, + ))) + } + None => FsServer::Null(Server::new(AugmentFs::new( + NullFs, &inode_alloc, virtual_entries, - ))) + ))), }; Ok(Self { queues, diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index dfbc6edd8..99d69c235 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -19,8 +19,6 @@ use devices::virtio::fs::virtual_inode::{VirtualEntry, VirtualFile}; use libc::{c_char, c_int, size_t}; use once_cell::sync::Lazy; use polly::event_manager::EventManager; -#[cfg(all(feature = "blk", not(feature = "tee")))] -use rand::distr::{Alphanumeric, SampleString}; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::convert::TryInto; @@ -615,10 +613,9 @@ pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) } cfg.vmr.add_fs_device(FsDeviceConfig { fs_id, - shared_dir, + shared_dir: Some(shared_dir), // Default to a conservative 512 MB window. shm_size: Some(1 << 29), - allow_root_dir_delete: false, read_only: false, virtual_entries, }); @@ -693,9 +690,8 @@ pub unsafe extern "C" fn krun_add_virtiofs3( } cfg.vmr.add_fs_device(FsDeviceConfig { fs_id: tag.to_string(), - shared_dir: path.to_string(), + shared_dir: Some(path.to_string()), shm_size: shm, - allow_root_dir_delete: false, read_only, virtual_entries, }); @@ -2423,28 +2419,29 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( return -libc::EINVAL; } - // To boot from a filesystem other than virtiofs, - // we need to setup a temporary root from which init.krun can be executed. - // Otherwise, it would have to be copied to the target filesystem beforehand. - // Instead, init.krun will run from virtiofs and then switch to the real root. 
- let root_dir_suffix = Alphanumeric.sample_string(&mut rand::rng(), 6); - let empty_root = env::temp_dir().join(format!("krun-empty-root-{root_dir_suffix}")); - - if let Err(e) = std::fs::create_dir_all(&empty_root) { - error!("Failed to create empty root directory: {e:?}"); - return -libc::EINVAL; - } - + // Boot from a block device: the virtiofs root only needs to + // serve init.krun and provide mount points for /dev, /proc, /sys. + // Use a NullFs (no host directory) with the inode overlay. let mut virtual_entries = Vec::new(); if !ctx_cfg.disable_implicit_init { virtual_entries.push(init_virtual_entry()); } + // init.c needs these directories as mount points before + // pivoting to the block device root. + for name in ["dev", "proc", "sys", "newroot"] { + virtual_entries.push(VirtualEntry { + name: CString::new(name).unwrap(), + mode: libc::S_IFDIR as u32 | 0o755, + inode: VirtualInode::Dir { + children: Vec::new(), + }, + }); + } ctx_cfg.vmr.add_fs_device(FsDeviceConfig { fs_id: "/dev/root".into(), - shared_dir: empty_root.to_string_lossy().into(), + shared_dir: None, // Default to a conservative 512 MB window. shm_size: Some(1 << 29), - allow_root_dir_delete: true, read_only: false, virtual_entries, }); diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 49a39c009..aa4b8cec4 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -2055,7 +2055,6 @@ fn attach_fs_devices( config.fs_id, config.shared_dir, exit_code.clone(), - config.allow_root_dir_delete, config.read_only, config.virtual_entries, ) diff --git a/src/vmm/src/vmm_config/fs.rs b/src/vmm/src/vmm_config/fs.rs index b4d6682c7..b95982bf7 100644 --- a/src/vmm/src/vmm_config/fs.rs +++ b/src/vmm/src/vmm_config/fs.rs @@ -2,9 +2,10 @@ use devices::virtio::fs::virtual_inode::VirtualEntry; pub struct FsDeviceConfig { pub fs_id: String, - pub shared_dir: String, + /// Host directory to pass through. 
None means a virtual-only filesystem + /// (NullFs + AugmentFs, no host directory). + pub shared_dir: Option, pub shm_size: Option, - pub allow_root_dir_delete: bool, pub read_only: bool, pub virtual_entries: Vec, } From d413cdec5dbe61feed0ed95ce0471aa491fa3a40 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 5 May 2026 18:30:55 +0200 Subject: [PATCH 10/13] virtio/fs: remove REMOVE_ROOT_DIR ioctl and allow_root_dir_delete The temporary root directory hack is gone (replaced by NullFs), so the ioctl that cleaned it up and the config flag that gated it are no longer needed. Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- src/devices/src/virtio/fs/linux/passthrough.rs | 10 ---------- src/devices/src/virtio/fs/macos/passthrough.rs | 7 ------- src/devices/src/virtio/fs/read_only.rs | 8 -------- 3 files changed, 25 deletions(-) diff --git a/src/devices/src/virtio/fs/linux/passthrough.rs b/src/devices/src/virtio/fs/linux/passthrough.rs index 08da133f0..2bfa46349 100644 --- a/src/devices/src/virtio/fs/linux/passthrough.rs +++ b/src/devices/src/virtio/fs/linux/passthrough.rs @@ -325,7 +325,6 @@ pub struct Config { pub export_fsid: u64, /// Table of exported FDs to share with other subsystems.
pub export_table: Option, - pub allow_root_dir_delete: bool, } impl Default for Config { @@ -340,7 +339,6 @@ impl Default for Config { proc_sfd_rawfd: None, export_fsid: 0, export_table: None, - allow_root_dir_delete: false, } } } @@ -2122,10 +2120,6 @@ impl FileSystem for PassthroughFs { const VIRTIO_IOC_EXIT_CODE_REQ: u32 = request_code_none!(VIRTIO_IOC_MAGIC, VIRTIO_IOC_TYPE_EXIT_CODE) as u32; - const VIRTIO_IOC_REMOVE_ROOT_DIR_CODE: u8 = 3; - const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = - request_code_none!(VIRTIO_IOC_MAGIC, VIRTIO_IOC_REMOVE_ROOT_DIR_CODE) as u32; - match cmd { VIRTIO_IOC_EXPORT_FD_REQ => { if out_size as usize != VIRTIO_IOC_EXPORT_FD_SIZE { @@ -2160,10 +2154,6 @@ impl FileSystem for PassthroughFs { exit_code.store(arg as i32, Ordering::SeqCst); Ok(Vec::new()) } - VIRTIO_IOC_REMOVE_ROOT_DIR_REQ if self.cfg.allow_root_dir_delete => { - std::fs::remove_dir_all(&self.cfg.root_dir)?; - Ok(Vec::new()) - } _ => Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)), } } diff --git a/src/devices/src/virtio/fs/macos/passthrough.rs b/src/devices/src/virtio/fs/macos/passthrough.rs index d1a862d0c..3a0500735 100644 --- a/src/devices/src/virtio/fs/macos/passthrough.rs +++ b/src/devices/src/virtio/fs/macos/passthrough.rs @@ -514,7 +514,6 @@ pub struct Config { pub export_fsid: u64, /// Table of exported FDs to share with other subsystems. Not supported for macos. pub export_table: Option, - pub allow_root_dir_delete: bool, } impl Default for Config { @@ -529,7 +528,6 @@ impl Default for Config { proc_sfd_rawfd: None, export_fsid: 0, export_table: None, - allow_root_dir_delete: false, } } } @@ -2441,17 +2439,12 @@ impl FileSystem for PassthroughFs { // We can't use nix::request_code_none here since it's system-dependent // and we need the value from Linux. 
const VIRTIO_IOC_EXIT_CODE_REQ: u32 = 0x7602; - const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = 0x7603; match cmd { VIRTIO_IOC_EXIT_CODE_REQ => { exit_code.store(arg as i32, Ordering::SeqCst); Ok(Vec::new()) } - VIRTIO_IOC_REMOVE_ROOT_DIR_REQ if self.cfg.allow_root_dir_delete => { - std::fs::remove_dir_all(&self.cfg.root_dir)?; - Ok(Vec::new()) - } _ => Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)), } } diff --git a/src/devices/src/virtio/fs/read_only.rs b/src/devices/src/virtio/fs/read_only.rs index eb8aebef3..5495db1ed 100644 --- a/src/devices/src/virtio/fs/read_only.rs +++ b/src/devices/src/virtio/fs/read_only.rs @@ -36,10 +36,6 @@ fn erofs() -> io::Error { io::Error::from_raw_os_error(libc::EROFS) } -// Keep the Linux ioctl number so read-only virtio-fs can still handle -// non-mutating control ioctls while rejecting host-side root deletion. -const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = 0x7603; - fn read_only_open_flags(flags: u32) -> io::Result { let f = flags as i32; if f & libc::O_ACCMODE != libc::O_RDONLY { @@ -319,10 +315,6 @@ impl FileSystem for PassthroughFsRo { out_size: u32, exit_code: &Arc, ) -> io::Result> { - if cmd == VIRTIO_IOC_REMOVE_ROOT_DIR_REQ { - return Err(erofs()); - } - self.inner.ioctl( ctx, inode, handle, flags, cmd, arg, in_size, out_size, exit_code, ) From b3615e3df0672e72715856f743deb9b34bda00a4 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Tue, 5 May 2026 18:35:42 +0200 Subject: [PATCH 11/13] virtio/fs: move EXIT_CODE ioctl to AugmentFs The exit-code ioctl is a krun mechanism, not a filesystem operation. Move it to the AugmentFs where it is handled before any delegation to the inner filesystem. The Linux passthrough retains only EXPORT_FD (which needs access to passthrough-internal handle and export tables). The macOS passthrough no longer implements ioctl at all (the trait default returns ENOSYS for any cmd that reaches it). 
Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- include/libkrun.h | 18 ++ src/devices/src/virtio/fs/augment_fs.rs | 258 +++++++++++++----- .../src/virtio/fs/linux/passthrough.rs | 14 +- .../src/virtio/fs/macos/passthrough.rs | 25 -- src/devices/src/virtio/fs/virtual_inode.rs | 61 +++-- src/libkrun/src/lib.rs | 71 ++++- 6 files changed, 318 insertions(+), 129 deletions(-) diff --git a/include/libkrun.h b/include/libkrun.h index 4b434bc5a..c5a64351f 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -1267,6 +1267,24 @@ int32_t krun_fs_add_overlay_file(uint32_t ctx_id, const char *fs_tag, const char *filename, const uint8_t *data, size_t data_len, uint32_t mode, bool one_shot); +/** + * Add a virtual overlay directory to a virtiofs device. + * + * The directory will appear in the root directory of the specified virtiofs + * mount. It is empty and read-only, useful as a mount point. + * + * Arguments: + * "ctx_id" - the configuration context ID. + * "fs_tag" - tag of the virtiofs device (e.g. "/dev/root"). + * "dirname" - name of the directory in the root directory. + * "mode" - directory mode bits, including the S_IFDIR type (e.g. 040755). + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_fs_add_overlay_dir(uint32_t ctx_id, const char *fs_tag, + const char *dirname, uint32_t mode); + /** * Disable the implicit vsock device.
* diff --git a/src/devices/src/virtio/fs/augment_fs.rs b/src/devices/src/virtio/fs/augment_fs.rs index 9c507cc93..351f7bf98 100644 --- a/src/devices/src/virtio/fs/augment_fs.rs +++ b/src/devices/src/virtio/fs/augment_fs.rs @@ -16,7 +16,7 @@ use std::collections::HashMap; use std::ffi::CStr; use std::ffi::CString; use std::io; -use std::sync::atomic::AtomicI32; +use std::sync::atomic::{AtomicI32, Ordering}; use std::sync::Arc; use std::sync::RwLock; use std::time::Duration; @@ -30,7 +30,7 @@ use super::filesystem::{ }; use super::fuse; use super::inode_alloc::InodeAllocator; -use super::virtual_inode::{VirtualEntry, VirtualFile}; +use super::virtual_inode::{VirtualEntry, VirtualInode}; use crate::virtio::bindings; type Inode = u64; @@ -62,12 +62,12 @@ fn eperm() -> io::Error { /// Overlay that injects virtual inodes into an inner `FileSystem`. pub struct AugmentFs { inner: T, - /// Maps (name in root dir) → virtual inode number. One-shot entries + /// Maps (parent_inode, name) → child inode number. One-shot entries /// are removed on first lookup so the file can only be opened once. - name_to_inode: RwLock>, - /// Maps virtual inode number → file data. One-shot entries are removed - /// from this map on release. - inodes: RwLock>, + name_to_inode: RwLock>, + /// Maps virtual inode number → (mode, inode data). One-shot entries are + /// removed from this map on release. + inodes: RwLock>, } impl> AugmentFs { @@ -77,14 +77,16 @@ impl> AugmentFs { /// Inode numbers are obtained from `inode_alloc`, the same allocator /// used by the inner filesystem. 
pub fn new(inner: T, inode_alloc: &InodeAllocator, entries: Vec) -> Self { - let mut name_to_inode = HashMap::with_capacity(entries.len()); - let mut inodes = HashMap::with_capacity(entries.len()); + let mut name_to_inode = HashMap::new(); + let mut inodes = HashMap::new(); - for entry in entries { - let inode = inode_alloc.next(); - name_to_inode.insert(entry.name, inode); - inodes.insert(inode, entry.file); - } + Self::register_entries( + fuse::ROOT_ID, + entries, + inode_alloc, + &mut name_to_inode, + &mut inodes, + ); Self { inner, @@ -93,6 +95,35 @@ impl> AugmentFs { } } + fn register_entries( + parent: Inode, + entries: Vec, + inode_alloc: &InodeAllocator, + name_to_inode: &mut HashMap<(Inode, CString), Inode>, + inodes: &mut HashMap, + ) { + for entry in entries { + let ino = inode_alloc.next(); + name_to_inode.insert((parent, entry.name), ino); + + // Recurse into directory children before moving the inode. + if let VirtualInode::Dir { children } = entry.inode { + Self::register_entries(ino, children, inode_alloc, name_to_inode, inodes); + inodes.insert( + ino, + ( + entry.mode, + VirtualInode::Dir { + children: Vec::new(), + }, + ), + ); + } else { + inodes.insert(ino, (entry.mode, entry.inode)); + } + } + } + fn is_virtual(&self, inode: Inode) -> bool { self.inodes.read().unwrap().contains_key(&inode) } @@ -111,36 +142,32 @@ impl> FileSystem for AugmentFs } fn lookup(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result { - if parent == fuse::ROOT_ID { - let inode = self.name_to_inode.read().unwrap().get(name).copied(); - if let Some(inode) = inode { - let inodes = self.inodes.read().unwrap(); - if let Some(file) = inodes.get(&inode) { - let one_shot = file.one_shot; - let st = file.stat(inode); - let entry_timeout = if one_shot { - Duration::ZERO - } else { - VIRTUAL_TIMEOUT - }; - - // One-shot: remove name so subsequent lookups fall - // through to the inner filesystem (or return ENOENT). 
- if one_shot { - // Drop the read lock first, before locking for write - drop(inodes); - self.name_to_inode.write().unwrap().remove(name); - } + let key = (parent, CString::from(name)); + let inode = self.name_to_inode.read().unwrap().get(&key).copied(); + if let Some(inode) = inode { + let inodes = self.inodes.read().unwrap(); + if let Some((mode, vnode)) = inodes.get(&inode) { + let one_shot = vnode.is_one_shot(); + let st = vnode.stat(inode, *mode); + let entry_timeout = if one_shot { + Duration::ZERO + } else { + VIRTUAL_TIMEOUT + }; - return Ok(Entry { - inode, - generation: 0, - attr: st, - attr_flags: 0, - attr_timeout: VIRTUAL_TIMEOUT, - entry_timeout, - }); + if one_shot { + drop(inodes); + self.name_to_inode.write().unwrap().remove(&key); } + + return Ok(Entry { + inode, + generation: 0, + attr: st, + attr_flags: 0, + attr_timeout: VIRTUAL_TIMEOUT, + entry_timeout, + }); } } self.inner.lookup(ctx, parent, name) @@ -170,8 +197,8 @@ impl> FileSystem for AugmentFs ) -> io::Result<(bindings::stat64, Duration)> { { let inodes = self.inodes.read().unwrap(); - if let Some(file) = inodes.get(&inode) { - let st = file.stat(inode); + if let Some((mode, vnode)) = inodes.get(&inode) { + let st = vnode.stat(inode, *mode); return Ok((st, VIRTUAL_TIMEOUT)); } } @@ -233,12 +260,101 @@ impl> FileSystem for AugmentFs umask: u32, extensions: Extensions, ) -> io::Result { - .map_err(|_| io::Error::from_raw_os_error(LINUX_EINVAL))?; - if off >= data.len() { return Ok(0); - } - let remaining = file.data.len() - off; + let key = (parent, CString::from(name)); + if self.name_to_inode.read().unwrap().contains_key(&key) { + return Err(io::Error::from_raw_os_error(libc::EEXIST)); + } + self.inner.mkdir(ctx, parent, name, mode, umask, extensions) + } + + fn unlink(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { + self.inner.unlink(ctx, parent, name) + } + + fn rmdir(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { + self.inner.rmdir(ctx, 
parent, name) + } + + fn rename( + &self, + ctx: Context, + olddir: Inode, + oldname: &CStr, + newdir: Inode, + newname: &CStr, + flags: u32, + ) -> io::Result<()> { + self.inner + .rename(ctx, olddir, oldname, newdir, newname, flags) + } + + fn link( + &self, + ctx: Context, + inode: Inode, + newparent: Inode, + newname: &CStr, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(eperm()); + } + self.inner.link(ctx, inode, newparent, newname) + } + + fn open( + &self, + ctx: Context, + inode: Inode, + kill_priv: bool, + flags: u32, + ) -> io::Result<(Option, OpenOptions)> { + if self.is_virtual(inode) { + if (flags as i32 & libc::O_ACCMODE) != libc::O_RDONLY { + return Err(io::Error::from_raw_os_error(libc::EACCES)); + } + return Ok((Some(VIRTUAL_HANDLE), OpenOptions::empty())); + } + self.inner.open(ctx, inode, kill_priv, flags) + } + + fn create( + &self, + ctx: Context, + parent: Inode, + name: &CStr, + mode: u32, + kill_priv: bool, + flags: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result<(Entry, Option, OpenOptions)> { + self.inner + .create(ctx, parent, name, mode, kill_priv, flags, umask, extensions) + } + + fn read( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + mut w: W, + size: u32, + offset: u64, + lock_owner: Option, + flags: u32, + ) -> io::Result { + { + let inodes = self.inodes.read().unwrap(); + if let Some((_, vnode)) = inodes.get(&inode) { + let data = vnode.data(); + let off: usize = offset + .try_into() + .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?; + if off >= data.len() { + return Ok(0); } + let remaining = data.len() - off; let len = remaining.min(size as usize); - return w.write(&file.data[off..(off + len)]); + return w.write(&data[off..(off + len)]); } } self.inner @@ -317,8 +433,8 @@ impl> FileSystem for AugmentFs ) -> io::Result<()> { { let mut inodes = self.inodes.write().unwrap(); - if let Some(file) = inodes.get(&inode) { - if file.one_shot { + if let Some((_, vnode)) = 
inodes.get(&inode) { + if vnode.is_one_shot() { inodes.remove(&inode); } return Ok(()); @@ -451,8 +567,8 @@ impl> FileSystem for AugmentFs ) -> io::Result { { let inodes = self.inodes.read().unwrap(); - if let Some(file) = inodes.get(&inode) { - let size = file.data.len() as u64; + if let Some((_, vnode)) = inodes.get(&inode) { + let size = vnode.data().len() as u64; // FUSE lseek is only called for SEEK_DATA/SEEK_HOLE. return match whence as i32 { libc::SEEK_DATA => { @@ -514,7 +630,8 @@ impl> FileSystem for AugmentFs ) -> io::Result<()> { { let inodes = self.inodes.read().unwrap(); - if let Some(file) = inodes.get(&inode) { + if let Some((_, vnode)) = inodes.get(&inode) { + let data = vnode.data(); #[cfg(target_os = "linux")] { if (moffset + len) > shm_size { @@ -537,13 +654,13 @@ impl> FileSystem for AugmentFs } let foff = foffset as usize; - if foff < file.data.len() { - let available = file.data.len() - foff; + if foff < data.len() { + let available = data.len() - foff; let to_copy = (len as usize).min(available); unsafe { libc::memcpy( addr as *mut libc::c_void, - file.data.as_ptr().add(foff) as *const _, + data.as_ptr().add(foff) as *const _, to_copy, ) }; @@ -552,13 +669,14 @@ impl> FileSystem for AugmentFs return Ok(()); } - // TODO: implement DAX for virtual files on macOS using - // the ShmRegionManager once it exists (see dax-window-layering task). + // TODO: implement DAX for virtual files on macOS. + // Needs a shared memory region manager (see setupmapping + // in macos/passthrough.rs for the real-file DAX path). #[cfg(target_os = "macos")] { let _ = data; - return Err(io::Error::from_raw_os_error(LINUX_ENOSYS)); } - } + return Err(io::Error::from_raw_os_error(libc::ENOSYS)); + } } } self.inner.setupmapping( ctx, @@ -605,10 +723,18 @@ impl> FileSystem for AugmentFs out_size: u32, exit_code: &Arc, ) -> io::Result> { - // Always delegate: the exit-code and root-dir-removal ioctls are - // dispatched by command number, not by inode. 
- self.inner.ioctl( - ctx, inode, handle, flags, cmd, arg, in_size, out_size, exit_code, - ) + // The ioctl cmd values use Linux encoding regardless of host OS + // because the guest always runs Linux. + const VIRTIO_IOC_EXIT_CODE_REQ: u32 = 0x7602; + + match cmd { + VIRTIO_IOC_EXIT_CODE_REQ => { + exit_code.store(arg as i32, Ordering::SeqCst); + Ok(Vec::new()) + } + _ => self.inner.ioctl( + ctx, inode, handle, flags, cmd, arg, in_size, out_size, exit_code, + ), + } } } diff --git a/src/devices/src/virtio/fs/linux/passthrough.rs b/src/devices/src/virtio/fs/linux/passthrough.rs index 2bfa46349..8272a7e01 100644 --- a/src/devices/src/virtio/fs/linux/passthrough.rs +++ b/src/devices/src/virtio/fs/linux/passthrough.rs @@ -16,7 +16,7 @@ use std::sync::{Arc, RwLock}; use std::time::Duration; use caps::{has_cap, CapSet, Capability}; -use nix::{request_code_none, request_code_read}; +use nix::request_code_read; use vm_memory::ByteValued; @@ -2101,10 +2101,10 @@ impl FileSystem for PassthroughFs { handle: Self::Handle, _flags: u32, cmd: u32, - arg: u64, + _arg: u64, _in_size: u32, out_size: u32, - exit_code: &Arc, + _exit_code: &Arc, ) -> io::Result> { const VIRTIO_IOC_MAGIC: u8 = b'v'; @@ -2116,10 +2116,6 @@ impl FileSystem for PassthroughFs { VIRTIO_IOC_EXPORT_FD_SIZE ) as u32; - const VIRTIO_IOC_TYPE_EXIT_CODE: u8 = 2; - const VIRTIO_IOC_EXIT_CODE_REQ: u32 = - request_code_none!(VIRTIO_IOC_MAGIC, VIRTIO_IOC_TYPE_EXIT_CODE) as u32; - match cmd { VIRTIO_IOC_EXPORT_FD_REQ => { if out_size as usize != VIRTIO_IOC_EXPORT_FD_SIZE { @@ -2150,10 +2146,6 @@ impl FileSystem for PassthroughFs { ret.extend_from_slice(&handle.to_ne_bytes()); Ok(ret) } - VIRTIO_IOC_EXIT_CODE_REQ => { - exit_code.store(arg as i32, Ordering::SeqCst); - Ok(Vec::new()) - } _ => Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)), } } diff --git a/src/devices/src/virtio/fs/macos/passthrough.rs b/src/devices/src/virtio/fs/macos/passthrough.rs index 3a0500735..c09ecdd67 100644 --- 
a/src/devices/src/virtio/fs/macos/passthrough.rs +++ b/src/devices/src/virtio/fs/macos/passthrough.rs @@ -2423,29 +2423,4 @@ impl FileSystem for PassthroughFs { Ok(()) } - - fn ioctl( - &self, - _ctx: Context, - _inode: Self::Inode, - _handle: Self::Handle, - _flags: u32, - cmd: u32, - arg: u64, - _in_size: u32, - _out_size: u32, - exit_code: &Arc, - ) -> io::Result> { - // We can't use nix::request_code_none here since it's system-dependent - // and we need the value from Linux. - const VIRTIO_IOC_EXIT_CODE_REQ: u32 = 0x7602; - - match cmd { - VIRTIO_IOC_EXIT_CODE_REQ => { - exit_code.store(arg as i32, Ordering::SeqCst); - Ok(Vec::new()) - } - _ => Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)), - } - } } diff --git a/src/devices/src/virtio/fs/virtual_inode.rs b/src/devices/src/virtio/fs/virtual_inode.rs index 8a4f6b195..a5c795157 100644 --- a/src/devices/src/virtio/fs/virtual_inode.rs +++ b/src/devices/src/virtio/fs/virtual_inode.rs @@ -1,37 +1,64 @@ // Virtual inode types for the virtiofs overlay. // -// A `VirtualFile` represents a read-only file backed by static data that is -// injected into the guest filesystem without any corresponding host file. +// A `VirtualInode` represents a synthetic inode injected into the guest +// filesystem without any corresponding host file or directory. use std::ffi::CString; use std::mem; use crate::virtio::bindings; -/// A read-only virtual file backed by a static byte slice. -pub struct VirtualFile { - pub data: &'static [u8], - pub mode: u32, - /// If true, the file can only be looked up once. - pub one_shot: bool, +/// A synthetic inode that exists only in memory. +pub enum VirtualInode { + /// A read-only file backed by a static byte slice. + File { + data: &'static [u8], + /// If true, the file can only be looked up once. + one_shot: bool, + }, + /// A directory containing other virtual entries. + Dir { children: Vec }, } -impl VirtualFile { - /// Synthesize a stat result for this virtual file. 
- pub fn stat(&self, inode: u64) -> bindings::stat64 { +impl VirtualInode { + pub fn is_dir(&self) -> bool { + matches!(self, Self::Dir { .. }) + } + + pub fn is_one_shot(&self) -> bool { + matches!(self, Self::File { one_shot: true, .. }) + } + + pub fn data(&self) -> &'static [u8] { + match self { + Self::File { data, .. } => data, + Self::Dir { .. } => &[], + } + } + + /// Synthesize a stat result for this virtual inode. + pub fn stat(&self, inode: u64, mode: u32) -> bindings::stat64 { let mut st: bindings::stat64 = unsafe { mem::zeroed() }; st.st_ino = inode; - st.st_size = self.data.len() as i64; - st.st_mode = self.mode as _; - st.st_nlink = 1; + st.st_mode = mode as _; st.st_blksize = 4096; - st.st_blocks = ((self.data.len() as i64) + 511) / 512; + match self { + Self::File { data, .. } => { + st.st_size = data.len() as i64; + st.st_nlink = 1; + st.st_blocks = ((data.len() as i64) + 511) / 512; + } + Self::Dir { .. } => { + st.st_nlink = 2; + } + } st } } -/// An entry to register as a virtual inode in the root directory. +/// An entry to register as a virtual inode. 
pub struct VirtualEntry { pub name: CString, - pub file: VirtualFile, + pub mode: u32, + pub inode: VirtualInode, } diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 99d69c235..1a9adafa3 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -15,7 +15,7 @@ use env_logger::{Env, Target}; use krun_display::DisplayBackend; #[cfg(not(feature = "tee"))] -use devices::virtio::fs::virtual_inode::{VirtualEntry, VirtualFile}; +use devices::virtio::fs::virtual_inode::{VirtualEntry, VirtualInode}; use libc::{c_char, c_int, size_t}; use once_cell::sync::Lazy; use polly::event_manager::EventManager; @@ -96,9 +96,9 @@ const DEFAULT_INIT_PAYLOAD: &[u8] = init_blob::INIT_BINARY; fn init_virtual_entry() -> VirtualEntry { VirtualEntry { name: std::ffi::CString::new("init.krun").unwrap(), - file: VirtualFile { + mode: 0o100_755, + inode: VirtualInode::File { data: DEFAULT_INIT_PAYLOAD, - mode: 0o100_755, one_shot: true, }, } @@ -659,7 +659,7 @@ pub unsafe extern "C" fn krun_add_virtiofs3( shm_size: u64, read_only: bool, ) -> i32 { - if c_tag.is_null() || c_path.is_null() { + if c_tag.is_null() { return -libc::EINVAL; } @@ -667,9 +667,15 @@ pub unsafe extern "C" fn krun_add_virtiofs3( Ok(tag) => tag, Err(_) => return -libc::EINVAL, }; - let path = match CStr::from_ptr(c_path).to_str() { - Ok(path) => path, - Err(_) => return -libc::EINVAL, + + // NULL path means NullFs (virtual-only filesystem, no host directory). 
+ let path = if c_path.is_null() { + None + } else { + match CStr::from_ptr(c_path).to_str() { + Ok(path) => Some(path), + Err(_) => return -libc::EINVAL, + } }; let shm = if shm_size > 0 { @@ -690,7 +696,7 @@ pub unsafe extern "C" fn krun_add_virtiofs3( } cfg.vmr.add_fs_device(FsDeviceConfig { fs_id: tag.to_string(), - shared_dir: Some(path.to_string()), + shared_dir: path.map(|p| p.to_string()), shm_size: shm, read_only, virtual_entries, @@ -2514,9 +2520,9 @@ pub unsafe extern "C" fn krun_fs_add_overlay_file( let entry = VirtualEntry { name: filename, - file: VirtualFile { + mode, + inode: VirtualInode::File { data: payload, - mode, one_shot, }, }; @@ -2535,6 +2541,51 @@ pub unsafe extern "C" fn krun_fs_add_overlay_file( KRUN_SUCCESS } +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(not(feature = "tee"))] +pub unsafe extern "C" fn krun_fs_add_overlay_dir( + ctx_id: u32, + c_fs_tag: *const c_char, + c_dirname: *const c_char, + mode: u32, +) -> i32 { + if c_fs_tag.is_null() || c_dirname.is_null() { + return -libc::EINVAL; + } + + let fs_tag = match CStr::from_ptr(c_fs_tag).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + let dirname = match CString::new(CStr::from_ptr(c_dirname).to_bytes()) { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + let entry = VirtualEntry { + name: dirname, + mode, + inode: VirtualInode::Dir { + children: Vec::new(), + }, + }; + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + match cfg.vmr.fs.iter_mut().find(|fs| fs.fs_id == fs_tag) { + Some(fs_cfg) => fs_cfg.virtual_entries.push(entry), + None => return -libc::ENOENT, + } + } + Entry::Vacant(_) => return -libc::ENOENT, + } + + KRUN_SUCCESS +} + #[no_mangle] pub extern "C" fn krun_disable_implicit_console(ctx_id: u32) -> i32 { match CTX_MAP.lock().unwrap().entry(ctx_id) { From 1a5b09e4e4350bda42bf9e9d8b66582f819c79f0 Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Wed, 6 May 2026 
18:31:53 +0200 Subject: [PATCH 12/13] tests: add augmentfs integration test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Boot a VM with a pure NullFs root — no host directory at all. Every file in the root (init.krun, guest-agent, .krun_config.json, test data) is injected as a virtual overlay, and /dev, /proc, /sys are virtual empty directories used as mount points. The guest verifies: - One-shot files (init.krun, guest-agent, .krun_config.json) are gone after being consumed - Persistent files (marker.txt, testdata.bin) survive and are re-readable - Write access to virtual files is denied (EACCES) - stat reports correct sizes - Range reads at various offsets return correct data - Read past EOF returns zero bytes Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- tests/test_cases/src/lib.rs | 3 + tests/test_cases/src/test_augmentfs.rs | 278 +++++++++++++++++++++++++ 2 files changed, 281 insertions(+) create mode 100644 tests/test_cases/src/test_augmentfs.rs diff --git a/tests/test_cases/src/lib.rs b/tests/test_cases/src/lib.rs index 83f3b6b14..5dd740040 100644 --- a/tests/test_cases/src/lib.rs +++ b/tests/test_cases/src/lib.rs @@ -27,6 +27,8 @@ use test_pjdfstest::TestPjdfstest; mod test_virtiofs_misc; use test_virtiofs_misc::TestVirtioFsMisc; +mod test_augmentfs; +use test_augmentfs::TestAugmentFs; pub enum TestOutcome { Pass, @@ -86,6 +88,7 @@ pub fn test_cases() -> Vec { TestCase::new("virtiofs-root-ro", Box::new(TestVirtiofsRootRo)), TestCase::new("virtiofs-misc", Box::new(TestVirtioFsMisc)), TestCase::new("pjdfstest", Box::new(TestPjdfstest)), + TestCase::new("augmentfs", Box::new(TestAugmentFs)), TestCase::new("perf-net-passt-tx", Box::new(TestNetPerf::new_passt_tx())), TestCase::new("perf-net-passt-rx", Box::new(TestNetPerf::new_passt_rx())), TestCase::new("perf-net-tap-tx", Box::new(TestNetPerf::new_tap_tx())), diff --git a/tests/test_cases/src/test_augmentfs.rs 
b/tests/test_cases/src/test_augmentfs.rs new file mode 100644 index 000000000..b25ec81c3 --- /dev/null +++ b/tests/test_cases/src/test_augmentfs.rs @@ -0,0 +1,278 @@ +// Test the AugmentFs overlay over a NullFs. +// +// Boots a VM with NO host filesystem — the root virtiofs is backed entirely +// by virtual inodes: init.krun (one-shot), the guest-agent binary (one-shot), +// a .krun_config.json (one-shot), persistent test files, and virtual +// directories as mount points for /dev, /proc, /sys. + +use macros::{guest, host}; + +pub struct TestAugmentFs; + +fn make_test_payload() -> Vec { + (0..8192u32).map(|i| (i % 251) as u8).collect() +} + +#[host] +mod host { + use super::*; + + use crate::{krun_call, krun_call_u32}; + use crate::{Test, TestSetup}; + use krun_sys::*; + use std::ffi::CString; + use std::ptr::null_mut; + + impl Test for TestAugmentFs { + fn start_vm(self: Box, test_setup: TestSetup) -> anyhow::Result<()> { + let test_case = CString::new(test_setup.test_case)?; + + // Read the guest-agent binary into memory. Leaked because + // krun_start_enter never returns. + let guest_agent_path = std::env::var("KRUN_TEST_GUEST_AGENT_PATH") + .expect("KRUN_TEST_GUEST_AGENT_PATH not set"); + let guest_agent_bytes: &'static [u8] = + Vec::leak(std::fs::read(&guest_agent_path).expect("Failed to read guest-agent")); + + // Build JSON config: exec the guest-agent with our test name. + let json = format!( + r#"{{"args": ["/guest-agent", "{}"], "cwd": "/"}}"#, + test_case.to_str().unwrap() + ); + let json_bytes: &'static [u8] = Vec::leak(json.into_bytes()); + + // Deterministic test payload for range-read tests. + let payload: &'static [u8] = Vec::leak(make_test_payload()); + + // A small marker file to test persistent reads. 
+ let marker: &'static [u8] = b"virtual-file-marker-content-12345"; + + unsafe { + krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?; + let ctx = krun_call_u32!(krun_create_ctx())?; + krun_call!(krun_set_vm_config(ctx, 1, 512))?; + + // Disable the implicit init — we'll inject it ourselves. + krun_call!(krun_disable_implicit_init(ctx))?; + + // Get the default init binary. + let mut init_data: *const u8 = null_mut(); + let mut init_len: usize = 0; + krun_call!(krun_get_default_init(&mut init_data, &mut init_len))?; + + // Set up root with NO host directory (NullFs). + krun_call!(krun_add_virtiofs3( + ctx, + c"/dev/root".as_ptr(), + std::ptr::null(), // NULL path → NullFs + 0, // no SHM window + false, // not read-only + ))?; + + // Virtual directories needed by init as mount points. + for dir in [c"dev", c"proc", c"sys"] { + krun_call!(krun_fs_add_overlay_dir( + ctx, + c"/dev/root".as_ptr(), + dir.as_ptr(), + 0o040_755, + ))?; + } + + // Overlay init.krun (one-shot, executable). + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"init.krun".as_ptr(), + init_data, + init_len, + 0o100_755, + true, + ))?; + + // Overlay guest-agent (one-shot, executable). After init + // execs it, the file should no longer be visible. + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"guest-agent".as_ptr(), + guest_agent_bytes.as_ptr(), + guest_agent_bytes.len(), + 0o100_755, + true, + ))?; + + // Overlay .krun_config.json (one-shot). + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c".krun_config.json".as_ptr(), + json_bytes.as_ptr(), + json_bytes.len(), + 0o100_644, + true, + ))?; + + // Overlay a persistent marker file. + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"marker.txt".as_ptr(), + marker.as_ptr(), + marker.len(), + 0o100_644, + false, + ))?; + + // Overlay a deterministic 8 KiB payload for range-read tests. 
+ krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"testdata.bin".as_ptr(), + payload.as_ptr(), + payload.len(), + 0o100_444, + false, + ))?; + + krun_call!(krun_set_workdir(ctx, c"/".as_ptr()))?; + krun_call!(krun_start_enter(ctx))?; + } + Ok(()) + } + } +} + +#[guest] +mod guest { + use super::*; + use crate::Test; + use std::fs; + use std::io::{ErrorKind, Read, Seek, SeekFrom}; + use std::path::Path; + + impl Test for TestAugmentFs { + fn in_guest(self: Box) { + // --- One-shot files should be gone --- + assert!( + !Path::new("/.krun_config.json").exists(), + ".krun_config.json should be gone (one-shot)" + ); + assert!( + !Path::new("/init.krun").exists(), + "init.krun should be gone (one-shot)" + ); + + // --- One-shot guest-agent can't see itself --- + assert!( + !Path::new("/guest-agent").exists(), + "guest-agent should be gone (one-shot)" + ); + + // --- Virtual directories should be accessible --- + // init already mounted over these, but let's verify they + // exist as directories (the mount points came from our + // virtual dir overlay). + for dir in ["/dev", "/proc", "/sys"] { + let meta = fs::metadata(dir) + .unwrap_or_else(|e| panic!("{dir} should exist: {e}")); + assert!(meta.is_dir(), "{dir} should be a directory"); + } + + // Verify the mounts actually worked by checking known entries. + assert!( + Path::new("/dev/null").exists(), + "/dev/null should exist (devtmpfs)" + ); + assert!( + Path::new("/proc/self").exists(), + "/proc/self should exist (procfs)" + ); + assert!( + Path::new("/sys/kernel").exists(), + "/sys/kernel should exist (sysfs)" + ); + + // Verify directory listing works on each mounted fs. 
+ let dev_entries: Vec<_> = fs::read_dir("/dev") + .expect("read_dir /dev") + .collect(); + assert!(!dev_entries.is_empty(), "/dev listing should not be empty"); + + let proc_entries: Vec<_> = fs::read_dir("/proc") + .expect("read_dir /proc") + .collect(); + assert!(!proc_entries.is_empty(), "/proc listing should not be empty"); + + let sys_entries: Vec<_> = fs::read_dir("/sys") + .expect("read_dir /sys") + .collect(); + assert!(!sys_entries.is_empty(), "/sys listing should not be empty"); + + // --- Persistent files should still exist --- + assert!(Path::new("/marker.txt").exists(), "marker.txt should exist"); + assert!( + Path::new("/testdata.bin").exists(), + "testdata.bin should exist" + ); + + // --- Read + verify marker content --- + let content = fs::read_to_string("/marker.txt").expect("read marker.txt"); + assert_eq!(content, "virtual-file-marker-content-12345"); + + // --- Repeated reads return the same data --- + let content2 = fs::read_to_string("/marker.txt").expect("re-read marker.txt"); + assert_eq!(content, content2, "repeated reads differ"); + + // --- Write should fail --- + let err = fs::OpenOptions::new() + .write(true) + .open("/marker.txt") + .expect_err("write-open should fail"); + assert_eq!(err.kind(), ErrorKind::PermissionDenied); + + // --- stat reports correct size --- + let meta = fs::metadata("/testdata.bin").expect("stat testdata.bin"); + assert_eq!(meta.len(), 8192, "testdata.bin size mismatch"); + + // --- Range reads on the 8 KiB payload --- + let expected = make_test_payload(); + let mut f = fs::File::open("/testdata.bin").expect("open testdata.bin"); + + // Full read. + let got = fs::read("/testdata.bin").expect("full read"); + assert_eq!(got, expected, "full read mismatch"); + + // Read first 256 bytes. + let mut buf = vec![0u8; 256]; + f.read_exact(&mut buf).expect("read first 256"); + assert_eq!(buf, &expected[..256], "first 256 bytes mismatch"); + + // Seek to offset 4000, read 512 bytes. 
+ f.seek(SeekFrom::Start(4000)).expect("seek to 4000"); + let mut buf = vec![0u8; 512]; + f.read_exact(&mut buf).expect("read at offset 4000"); + assert_eq!(buf, &expected[4000..4512], "range [4000..4512] mismatch"); + + // Seek to last 10 bytes. + f.seek(SeekFrom::End(-10)).expect("seek to end-10"); + let mut buf = vec![0u8; 10]; + f.read_exact(&mut buf).expect("read last 10"); + assert_eq!(buf, &expected[8182..8192], "last 10 bytes mismatch"); + + // Read past EOF should return 0 bytes. + f.seek(SeekFrom::Start(8192)).expect("seek to EOF"); + let mut buf = vec![0u8; 100]; + let n = f.read(&mut buf).expect("read past EOF"); + assert_eq!(n, 0, "read past EOF should return 0"); + + // Seek back to start, re-read, verify consistency. + f.seek(SeekFrom::Start(0)).expect("seek to start"); + let mut full = Vec::new(); + f.read_to_end(&mut full).expect("read_to_end"); + assert_eq!(full, expected, "read_to_end mismatch"); + + println!("OK"); + } + } +} From 89a7101c5e81f59784897f8316497d22192201ed Mon Sep 17 00:00:00 2001 From: Matej Hrica Date: Mon, 11 May 2026 14:28:06 +0200 Subject: [PATCH 13/13] tests: add root-disk-remount integration test Boot from an ext4 block device via krun_set_root_disk_remount. The virtiofs root uses NullFs with init.krun and virtual mount-point directories overlaid. The guest verifies it pivoted to the block device root successfully. 
Assisted-by: OpenCode:claude-opus-4.6 Signed-off-by: Matej Hrica --- Makefile | 8 +- tests/guest-agent/Cargo.toml | 3 + tests/run.sh | 13 +- tests/runner/Cargo.toml | 3 + tests/test_cases/Cargo.toml | 1 + tests/test_cases/src/lib.rs | 7 + .../test_cases/src/test_root_disk_remount.rs | 124 ++++++++++++++++++ 7 files changed, 156 insertions(+), 3 deletions(-) create mode 100644 tests/test_cases/src/test_root_disk_remount.rs diff --git a/Makefile b/Makefile index 17ec2f2d3..299622484 100644 --- a/Makefile +++ b/Makefile @@ -272,7 +272,13 @@ TEST_FLAGS ?= EXTRA_LIBPATH_Linux = EXTRA_LIBPATH_Darwin = /opt/homebrew/opt/libkrunfw/lib:/opt/homebrew/opt/llvm/lib +# Extra cargo features for the test runner (passed via KRUN_TEST_FEATURES). +TEST_FEATURE_FLAGS := +ifeq ($(BLK),1) + TEST_FEATURE_FLAGS += blk +endif + # On macOS, SIP strips DYLD_LIBRARY_PATH when executing scripts via a shebang, # so we pass the path via LIBKRUN_LIB_PATH and let run.sh set the real variable. test: test-prefix - cd tests; RUST_LOG=trace LIBKRUN_LIB_PATH="$$(realpath ../test-prefix/$(LIBDIR_$(OS))/):$(EXTRA_LIBPATH_$(OS))" PKG_CONFIG_PATH="$$(realpath ../test-prefix/$(LIBDIR_$(OS))/pkgconfig/)" ./run.sh test --test-case "$(TEST)" $(TEST_FLAGS) + cd tests; RUST_LOG=trace KRUN_TEST_FEATURES="$(TEST_FEATURE_FLAGS)" LIBKRUN_LIB_PATH="$$(realpath ../test-prefix/$(LIBDIR_$(OS))/):$(EXTRA_LIBPATH_$(OS))" PKG_CONFIG_PATH="$$(realpath ../test-prefix/$(LIBDIR_$(OS))/pkgconfig/)" ./run.sh test --test-case "$(TEST)" $(TEST_FLAGS) diff --git a/tests/guest-agent/Cargo.toml b/tests/guest-agent/Cargo.toml index 47617a0e4..9ab3fcbfc 100644 --- a/tests/guest-agent/Cargo.toml +++ b/tests/guest-agent/Cargo.toml @@ -2,6 +2,9 @@ name = "guest-agent" edition = "2021" +[features] +blk = ["test_cases/blk"] + [dependencies] test_cases = { path = "../test_cases", features = ["guest"] } anyhow = "1.0.95" diff --git a/tests/run.sh b/tests/run.sh index 3d7b1e6ef..13e57b0eb 100755 --- a/tests/run.sh +++ b/tests/run.sh @@ -44,8 
+44,17 @@ if [ "$OS" = "Darwin" ]; then echo "Cross-compiling guest-agent for $GUEST_TARGET" fi -cargo build --target=$GUEST_TARGET -p guest-agent -cargo build -p runner +# KRUN_TEST_FEATURES can be set to pass extra features to test_cases/runner +# (e.g. "blk" when libkrun was built with BLK=1). +TEST_FEATURES="${KRUN_TEST_FEATURES:-}" + +if [ -n "$TEST_FEATURES" ]; then + cargo build --target=$GUEST_TARGET -p guest-agent --features "$TEST_FEATURES" + cargo build -p runner --features "$TEST_FEATURES" +else + cargo build --target=$GUEST_TARGET -p guest-agent + cargo build -p runner +fi # On macOS, the runner needs entitlements to use Hypervisor.framework if [ "$OS" = "Darwin" ]; then diff --git a/tests/runner/Cargo.toml b/tests/runner/Cargo.toml index 8133341b8..141d07a5f 100644 --- a/tests/runner/Cargo.toml +++ b/tests/runner/Cargo.toml @@ -2,6 +2,9 @@ name = "runner" edition = "2021" +[features] +blk = ["test_cases/blk"] + [dependencies] test_cases = { path = "../test_cases", features = ["host"] } anyhow = "1.0.95" diff --git a/tests/test_cases/Cargo.toml b/tests/test_cases/Cargo.toml index 8c9bcc924..4ecc6801b 100644 --- a/tests/test_cases/Cargo.toml +++ b/tests/test_cases/Cargo.toml @@ -5,6 +5,7 @@ edition = "2021" [features] host = ["krun-sys", "serde", "serde_json"] guest = [] +blk = [] [lib] name = "test_cases" diff --git a/tests/test_cases/src/lib.rs b/tests/test_cases/src/lib.rs index 5dd740040..ea9ec98f5 100644 --- a/tests/test_cases/src/lib.rs +++ b/tests/test_cases/src/lib.rs @@ -30,6 +30,11 @@ use test_virtiofs_misc::TestVirtioFsMisc; mod test_augmentfs; use test_augmentfs::TestAugmentFs; +#[cfg(feature = "blk")] +mod test_root_disk_remount; +#[cfg(feature = "blk")] +use test_root_disk_remount::TestRootDiskRemount; + pub enum TestOutcome { Pass, Fail(String), @@ -89,6 +94,8 @@ pub fn test_cases() -> Vec { TestCase::new("virtiofs-misc", Box::new(TestVirtioFsMisc)), TestCase::new("pjdfstest", Box::new(TestPjdfstest)), TestCase::new("augmentfs", 
Box::new(TestAugmentFs)),
+        #[cfg(feature = "blk")]
+        TestCase::new("root-disk-remount", Box::new(TestRootDiskRemount)),
         TestCase::new("perf-net-passt-tx", Box::new(TestNetPerf::new_passt_tx())),
         TestCase::new("perf-net-passt-rx", Box::new(TestNetPerf::new_passt_rx())),
         TestCase::new("perf-net-tap-tx", Box::new(TestNetPerf::new_tap_tx())),
diff --git a/tests/test_cases/src/test_root_disk_remount.rs b/tests/test_cases/src/test_root_disk_remount.rs
new file mode 100644
index 000000000..d0dfaf567
--- /dev/null
+++ b/tests/test_cases/src/test_root_disk_remount.rs
@@ -0,0 +1,124 @@
+// Test that krun_set_root_disk_remount works with NullFs.
+//
+// Creates a tiny ext4 disk image containing only the guest-agent binary,
+// boots from it via krun_set_root_disk_remount (which uses NullFs for the
+// initial virtiofs root with init.krun overlaid), and verifies the guest
+// successfully pivoted to the block device root.
+
+use macros::{guest, host};
+
+pub struct TestRootDiskRemount;
+
+#[host]
+mod host {
+    use super::*;
+
+    use crate::{krun_call, krun_call_u32};
+    use crate::{Test, TestSetup};
+    use krun_sys::*;
+    use std::ffi::CString;
+    use std::process::Command;
+    use std::ptr::null;
+
+    fn create_disk_image(guest_agent_path: &str, output_path: &str) {
+        // Populate from a staging directory using mke2fs -d (no root needed).
+        let staging = format!("{output_path}.staging");
+        std::fs::create_dir_all(&staging).expect("mkdir staging");
+
+        std::fs::copy(guest_agent_path, format!("{staging}/guest-agent"))
+            .expect("copy guest-agent");
+
+        // Marker file to verify the guest booted from the block device.
+        std::fs::write(
+            format!("{staging}/block-marker"),
+            "booted-from-block-device",
+        )
+        .expect("write marker");
+
+        let status = Command::new("mke2fs")
+            .args(["-q", "-t", "ext4", "-d", &staging, output_path, "32M"])
+            .status()
+            .expect("mke2fs failed");
+        assert!(status.success(), "mke2fs failed");
+
+        std::fs::remove_dir_all(&staging).expect("cleanup staging");
+    }
+
+    impl Test for TestRootDiskRemount {
+        fn start_vm(self: Box<Self>, test_setup: TestSetup) -> anyhow::Result<()> {
+            let guest_agent_path = std::env::var("KRUN_TEST_GUEST_AGENT_PATH")
+                .expect("KRUN_TEST_GUEST_AGENT_PATH not set");
+
+            let disk_path = format!("{}/rootfs.ext4", test_setup.tmp_dir.display());
+            create_disk_image(&guest_agent_path, &disk_path);
+
+            let c_disk_path = CString::new(disk_path)?;
+            let test_case = CString::new(test_setup.test_case)?;
+
+            unsafe {
+                krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?;
+                let ctx = krun_call_u32!(krun_create_ctx())?;
+                krun_call!(krun_set_vm_config(ctx, 1, 512))?;
+
+                // Set up the exec path for the guest-agent on the block
+                // device root.
+                let argv = [test_case.as_ptr(), null()];
+                let envp = [null()];
+                krun_call!(krun_set_exec(
+                    ctx,
+                    c"/guest-agent".as_ptr(),
+                    argv.as_ptr(),
+                    envp.as_ptr(),
+                ))?;
+
+                krun_call!(krun_set_workdir(ctx, c"/".as_ptr()))?;
+
+                // Add a block device with the ext4 image.
+                krun_call!(krun_add_disk(
+                    ctx,
+                    c"vda".as_ptr(),
+                    c_disk_path.as_ptr(),
+                    false, // not read-only
+                ))?;
+
+                // Configure block device as root, pivot from NullFs.
+                krun_call!(krun_set_root_disk_remount(
+                    ctx,
+                    c"/dev/vda".as_ptr(),
+                    c"ext4".as_ptr(),
+                    std::ptr::null(), // no mount options
+                ))?;
+
+                krun_call!(krun_start_enter(ctx))?;
+            }
+            Ok(())
+        }
+    }
+}
+
+#[guest]
+mod guest {
+    use super::*;
+    use crate::Test;
+    use std::fs;
+    use std::path::Path;
+
+    impl Test for TestRootDiskRemount {
+        fn in_guest(self: Box<Self>) {
+            // Verify we're running from the block device root.
+            let marker = fs::read_to_string("/block-marker")
+                .expect("Failed to read /block-marker — not on block device root?");
+            assert_eq!(marker, "booted-from-block-device");
+
+            // The init.krun virtual file should be gone (one-shot, and we
+            // pivoted away from the NullFs root anyway).
+            assert!(!Path::new("/init.krun").exists());
+
+            // /proc and /dev should be mounted (init re-mounts after pivot).
+            assert!(Path::new("/proc/self").exists(), "/proc/self missing");
+            assert!(Path::new("/dev/null").exists(), "/dev/null missing");
+
+            println!("OK");
+        }
+    }
+}