diff --git a/Cargo.lock b/Cargo.lock index c066dd4eb..3b40bfbc7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -563,6 +563,10 @@ dependencies = [ "serde_core", ] +[[package]] +name = "init-blob" +version = "0.1.0" + [[package]] name = "iocuddle" version = "0.1.1" @@ -885,6 +889,7 @@ version = "1.18.0" dependencies = [ "crossbeam-channel", "env_logger", + "init-blob", "krun-aws-nitro", "krun-devices", "krun-display", diff --git a/Cargo.toml b/Cargo.toml index 83db53c57..94960ad2c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] members = [ "src/libkrun", + "src/init-blob", "src/input", "src/display", "src/utils", diff --git a/include/libkrun.h b/include/libkrun.h index 87d5e1fa1..4c0167027 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -1166,6 +1166,14 @@ int32_t krun_get_max_vcpus(void); */ int32_t krun_split_irqchip(uint32_t ctx_id, bool enable); +/* + * NOTE: Implicit resource creation is a legacy convenience. The 2.0 API + * (see https://github.com/containers/libkrun/issues/634) will not create + * any implicit resources. Callers should start using the + * krun_disable_implicit_* functions now to ease migration. + */ + + /* * Do not create an implicit console device in the guest. By using this API, * libkrun will create zero console devices on behalf of the user. Any @@ -1180,6 +1188,79 @@ int32_t krun_split_irqchip(uint32_t ctx_id, bool enable); */ int32_t krun_disable_implicit_console(uint32_t ctx_id); +/** + * Do not inject the default init binary (/init.krun) into the root + * filesystem. Must be called before krun_set_root(). + * + * Arguments: + * "ctx_id" - the configuration context ID. + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_disable_implicit_init(uint32_t ctx_id); + +/** + * Get a pointer to the built-in default init binary. + * + * This is the same binary that libkrun injects as /init.krun by default. 
+ * Callers that use krun_disable_implicit_init() can use this to inject the + * init binary themselves (e.g. via krun_fs_add_overlay_file with custom + * settings). + * + * The returned pointer is valid for the lifetime of the process (static data). + * + * Arguments: + * "data_out" - receives a pointer to the init binary bytes. + * "len_out" - receives the length in bytes. + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_get_default_init(const uint8_t **data_out, size_t *len_out); + +/** + * Add a virtual overlay file to a virtiofs device. + * + * The file will appear in the root directory of the specified virtiofs + * mount and is backed entirely by host memory (no host file). The data + * pointer is NOT copied — the caller must keep the memory valid for the + * full VM lifetime. + * + * Arguments: + * "ctx_id" - the configuration context ID. + * "fs_tag" - tag of the virtiofs device (e.g. "/dev/root"). + * "filename" - name of the file in the root directory. + * "data" - pointer to the file content. + * "data_len" - length of the file content in bytes. + * "mode" - file mode bits (e.g. 0100644 for a regular file). + * "one_shot" - if true, the file can only be looked up once. + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_fs_add_overlay_file(uint32_t ctx_id, const char *fs_tag, + const char *filename, const uint8_t *data, + size_t data_len, uint32_t mode, bool one_shot); + +/** + * Add a virtual overlay directory to a virtiofs device. + * + * The directory will appear in the root directory of the specified virtiofs + * mount. It is empty and read-only, useful as a mount point. + * + * Arguments: + * "ctx_id" - the configuration context ID. + * "fs_tag" - tag of the virtiofs device (e.g. "/dev/root"). + * "dirname" - name of the directory in the root directory. + * "mode" - directory mode bits (e.g. 040755). 
+ * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_fs_add_overlay_dir(uint32_t ctx_id, const char *fs_tag, + const char *dirname, uint32_t mode); + /** * Disable the implicit vsock device. * diff --git a/src/devices/Cargo.toml b/src/devices/Cargo.toml index ab6ecfe2a..197c01bd3 100644 --- a/src/devices/Cargo.toml +++ b/src/devices/Cargo.toml @@ -3,7 +3,7 @@ name = "krun-devices" version = "0.1.0-1.18.0" authors = ["The libkrun Authors"] edition = "2021" -build = "build.rs" + description = "Virtual device emulation for libkrun" license = "Apache-2.0" repository = "https://github.com/containers/libkrun" diff --git a/src/devices/src/virtio/fs/augment_fs.rs b/src/devices/src/virtio/fs/augment_fs.rs new file mode 100644 index 000000000..5e8032f12 --- /dev/null +++ b/src/devices/src/virtio/fs/augment_fs.rs @@ -0,0 +1,747 @@ +// Virtual inode overlay for virtiofs. +// +// `AugmentFs` wraps an inner `FileSystem` implementation and intercepts +// FUSE operations for virtual inodes — synthetic read-only files that exist +// only in memory. All other operations are delegated to the inner filesystem. +// +// Virtual inodes are injected into the root directory (parent = ROOT_ID) and +// are currently only accessible via lookup (they do not appear in readdir). +// +// One-shot files can only be looked up once — the name is removed from the +// directory on first lookup so subsequent lookups return ENOENT. 
+ +#[cfg(target_os = "macos")] +use crossbeam_channel::Sender; +use std::collections::HashMap; +use std::ffi::CStr; +use std::ffi::CString; +use std::io; +use std::mem; +use std::sync::atomic::{AtomicI32, Ordering}; +use std::sync::Arc; +use std::sync::RwLock; +use std::time::Duration; + +#[cfg(target_os = "macos")] +use utils::worker_message::WorkerMessage; + +use super::filesystem::{ + Context, DirEntry, Entry, Extensions, FileSystem, FsOptions, GetxattrReply, ListxattrReply, + OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, +}; +use super::fuse; +use super::inode_alloc::InodeAllocator; +use super::virtual_entry::{VirtualDirEntry, VirtualEntry, VirtualEntryContent, VIRTUAL_BLKSIZE}; +use crate::virtio::bindings; +use crate::virtio::linux_errno; + +type Inode = u64; +type Handle = u64; + +/// Sentinel handle returned for all virtual file opens. This works because +/// virtual file operations dispatch on inode, not handle — there is no +/// per-open state. If per-fd state is ever needed (e.g. writable virtual +/// files), this must be replaced with a real handle allocator. +const VIRTUAL_HANDLE: Handle = 0; + +/// Persistent virtual entries never change. +const VIRTUAL_TIMEOUT: Duration = Duration::MAX; + +/// Overlay that injects virtual inodes into an inner `FileSystem`. +pub struct AugmentFs { + inner: T, + /// Maps (parent_inode, name) → child inode number. One-shot entries + /// are removed on first lookup so the file can only be opened once. + name_to_inode: RwLock>, + /// Maps virtual inode number → (mode, inode data). One-shot entries are + /// removed from this map on release. + inodes: RwLock>, +} + +impl> AugmentFs { + /// Create a new overlay. + /// + /// `entries` are registered as virtual inodes in the root directory. + /// Inode numbers are obtained from `inode_alloc`, the same allocator + /// used by the inner filesystem. 
+ pub fn new(inner: T, inode_alloc: &InodeAllocator, entries: Vec) -> Self { + let mut name_to_inode = HashMap::new(); + let mut inodes = HashMap::new(); + + Self::register_entries( + fuse::ROOT_ID, + entries, + inode_alloc, + &mut name_to_inode, + &mut inodes, + ); + + Self { + inner, + name_to_inode: RwLock::new(name_to_inode), + inodes: RwLock::new(inodes), + } + } + + fn register_entries( + parent: Inode, + entries: Vec, + inode_alloc: &InodeAllocator, + name_to_inode: &mut HashMap<(Inode, CString), Inode>, + inodes: &mut HashMap, + ) { + for entry in entries { + let ino = inode_alloc.next(); + name_to_inode.insert((parent, entry.name), ino); + + // Recurse into directory children before moving the node. + if let VirtualEntryContent::Dir { children } = entry.entry.content { + Self::register_entries(ino, children, inode_alloc, name_to_inode, inodes); + inodes.insert( + ino, + VirtualEntry { + mode: entry.entry.mode, + one_shot: entry.entry.one_shot, + content: VirtualEntryContent::Dir { + children: Vec::new(), + }, + }, + ); + } else { + inodes.insert(ino, entry.entry); + } + } + } + + fn is_virtual(&self, inode: Inode) -> bool { + self.inodes.read().unwrap().contains_key(&inode) + } + + fn virtual_stat(ino: Inode, vnode: &VirtualEntry) -> (bindings::stat64, Duration) { + let mut st: bindings::stat64 = unsafe { mem::zeroed() }; + st.st_ino = ino; + st.st_mode = vnode.st_mode() as _; + st.st_blksize = VIRTUAL_BLKSIZE as _; + let timeout = if vnode.one_shot { + Duration::ZERO + } else { + VIRTUAL_TIMEOUT + }; + match &vnode.content { + VirtualEntryContent::File { data, .. } => { + st.st_size = data.len() as i64; + st.st_nlink = 1; + st.st_blocks = ((data.len() as i64) + 511) / 512; + } + VirtualEntryContent::Dir { .. 
} => {
+                st.st_nlink = 2;
+            }
+        }
+        (st, timeout)
+    }
+}
+
+impl<T: FileSystem<Inode = Inode, Handle = Handle>> FileSystem for AugmentFs<T> {
+    type Inode = Inode;
+    type Handle = Handle;
+
+    /// Delegates feature negotiation to the inner filesystem.
+    fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
+        self.inner.init(capable)
+    }
+
+    /// Delegates teardown to the inner filesystem.
+    fn destroy(&self) {
+        self.inner.destroy()
+    }
+
+    /// Resolves `name` under `parent`.
+    ///
+    /// Names registered in `name_to_inode` are answered from memory with
+    /// attributes built by `virtual_stat`; every other name is forwarded to
+    /// the inner filesystem.
+    ///
+    /// A one-shot name is consumed by the lookup that removes it from
+    /// `name_to_inode`. If another lookup removed the name first, this call
+    /// treats the name as non-virtual and forwards it to the inner filesystem.
+    fn lookup(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
+        let key = (parent, CString::from(name));
+        let inode = self.name_to_inode.read().unwrap().get(&key).copied();
+        if let Some(inode) = inode {
+            // Build the attributes while the `inodes` read guard is alive and let
+            // the guard drop at the end of the statement, so it is not held while
+            // `name_to_inode` is write-locked below.
+            let found = self
+                .inodes
+                .read()
+                .unwrap()
+                .get(&inode)
+                .map(|vnode| (vnode.one_shot, Self::virtual_stat(inode, vnode)));
+
+            if let Some((one_shot, (st, timeout))) = found {
+                // The earlier read-locked `get` does not reserve the name. Only the
+                // caller whose `remove` actually takes the mapping out may hand the
+                // one-shot entry back; otherwise two racing lookups would both
+                // succeed.
+                let granted =
+                    !one_shot || self.name_to_inode.write().unwrap().remove(&key).is_some();
+
+                if granted {
+                    return Ok(Entry {
+                        inode,
+                        generation: 0,
+                        attr: st,
+                        attr_flags: 0,
+                        attr_timeout: timeout,
+                        entry_timeout: timeout,
+                    });
+                }
+            }
+        }
+        self.inner.lookup(ctx, parent, name)
+    }
+
+    /// Forwards the forget to the inner filesystem unless `inode` is virtual.
+    fn forget(&self, ctx: Context, inode: Inode, count: u64) {
+        if !self.is_virtual(inode) {
+            self.inner.forget(ctx, inode, count)
+        }
+    }
+
+    /// Drops virtual inodes from `requests` and forwards the rest.
+    fn batch_forget(&self, ctx: Context, mut requests: Vec<(Inode, u64)>) {
+        requests.retain(|(ino, _)| !self.is_virtual(*ino));
+        self.inner.batch_forget(ctx, requests);
+    }
+
+    /// Returns synthesized attributes for virtual inodes; delegates otherwise.
+    fn getattr(
+        &self,
+        ctx: Context,
+        inode: Inode,
+        handle: Option<Handle>,
+    ) -> io::Result<(bindings::stat64, Duration)> {
+        {
+            let inodes = self.inodes.read().unwrap();
+            if let Some(vnode) = inodes.get(&inode) {
+                return Ok(Self::virtual_stat(inode, vnode));
+            }
+        }
+        self.inner.getattr(ctx, inode, handle)
+    }
+
+    /// Rejects attribute changes on virtual inodes with EPERM; delegates otherwise.
+    fn setattr(
+        &self,
+        ctx: Context,
+        inode: Inode,
+        attr: bindings::stat64,
+        handle: Option<Handle>,
+        valid: SetattrValid,
+    ) -> io::Result<(bindings::stat64, Duration)> {
+        if self.is_virtual(inode) {
+            return Err(linux_errno::eperm());
+        }
+        self.inner.setattr(ctx, inode, attr, handle, valid)
+    }
+
+    fn readlink(&self, ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
+        if self.is_virtual(inode) {
+            return
Err(linux_errno::einval()); + } + self.inner.readlink(ctx, inode) + } + + fn symlink( + &self, + ctx: Context, + linkname: &CStr, + parent: Inode, + name: &CStr, + extensions: Extensions, + ) -> io::Result { + self.inner.symlink(ctx, linkname, parent, name, extensions) + } + + fn mknod( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + mode: u32, + rdev: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result { + self.inner + .mknod(ctx, inode, name, mode, rdev, umask, extensions) + } + + fn mkdir( + &self, + ctx: Context, + parent: Inode, + name: &CStr, + mode: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result { + let key = (parent, CString::from(name)); + if self.name_to_inode.read().unwrap().contains_key(&key) { + return Err(linux_errno::eexist()); + } + self.inner.mkdir(ctx, parent, name, mode, umask, extensions) + } + + fn unlink(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { + self.inner.unlink(ctx, parent, name) + } + + fn rmdir(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { + self.inner.rmdir(ctx, parent, name) + } + + fn rename( + &self, + ctx: Context, + olddir: Inode, + oldname: &CStr, + newdir: Inode, + newname: &CStr, + flags: u32, + ) -> io::Result<()> { + self.inner + .rename(ctx, olddir, oldname, newdir, newname, flags) + } + + fn link( + &self, + ctx: Context, + inode: Inode, + newparent: Inode, + newname: &CStr, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner.link(ctx, inode, newparent, newname) + } + + fn open( + &self, + ctx: Context, + inode: Inode, + kill_priv: bool, + flags: u32, + ) -> io::Result<(Option, OpenOptions)> { + { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + if vnode.is_dir() { + return Err(linux_errno::eisdir()); + } + if (flags as i32 & libc::O_ACCMODE) != libc::O_RDONLY { + return Err(linux_errno::eacces()); + } + return Ok((Some(VIRTUAL_HANDLE), 
OpenOptions::empty())); + } + } + self.inner.open(ctx, inode, kill_priv, flags) + } + + fn create( + &self, + ctx: Context, + parent: Inode, + name: &CStr, + mode: u32, + kill_priv: bool, + flags: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result<(Entry, Option, OpenOptions)> { + self.inner + .create(ctx, parent, name, mode, kill_priv, flags, umask, extensions) + } + + fn read( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + mut w: W, + size: u32, + offset: u64, + lock_owner: Option, + flags: u32, + ) -> io::Result { + { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + let data = vnode.data().ok_or_else(linux_errno::eisdir)?; + let off: usize = offset.try_into().map_err(|_| linux_errno::einval())?; + if off >= data.len() { + return Ok(0); + } + let remaining = data.len() - off; + let len = remaining.min(size as usize); + return w.write(&data[off..(off + len)]); + } + } + self.inner + .read(ctx, inode, handle, w, size, offset, lock_owner, flags) + } + + fn write( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + r: R, + size: u32, + offset: u64, + lock_owner: Option, + delayed_write: bool, + kill_priv: bool, + flags: u32, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner.write( + ctx, + inode, + handle, + r, + size, + offset, + lock_owner, + delayed_write, + kill_priv, + flags, + ) + } + + fn flush(&self, ctx: Context, inode: Inode, handle: Handle, lock_owner: u64) -> io::Result<()> { + if self.is_virtual(inode) { + return Ok(()); + } + self.inner.flush(ctx, inode, handle, lock_owner) + } + + fn fsync(&self, ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> { + if self.is_virtual(inode) { + return Ok(()); + } + self.inner.fsync(ctx, inode, datasync, handle) + } + + fn fallocate( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + mode: u32, + offset: u64, + length: u64, + ) -> io::Result<()> { + if 
self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner + .fallocate(ctx, inode, handle, mode, offset, length) + } + + fn release( + &self, + ctx: Context, + inode: Inode, + flags: u32, + handle: Handle, + flush: bool, + flock_release: bool, + lock_owner: Option, + ) -> io::Result<()> { + { + let mut inodes = self.inodes.write().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + if vnode.one_shot { + inodes.remove(&inode); + } + return Ok(()); + } + } + self.inner + .release(ctx, inode, flags, handle, flush, flock_release, lock_owner) + } + + fn statfs(&self, ctx: Context, inode: Inode) -> io::Result { + self.inner.statfs(ctx, inode) + } + + fn getxattr( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + size: u32, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(linux_errno::enodata()); + } + self.inner.getxattr(ctx, inode, name, size) + } + + fn listxattr(&self, ctx: Context, inode: Inode, size: u32) -> io::Result { + if self.is_virtual(inode) { + if size == 0 { + return Ok(ListxattrReply::Count(0)); + } + return Ok(ListxattrReply::Names(Vec::new())); + } + self.inner.listxattr(ctx, inode, size) + } + + fn setxattr( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + value: &[u8], + flags: u32, + ) -> io::Result<()> { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner.setxattr(ctx, inode, name, value, flags) + } + + fn removexattr(&self, ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> { + if self.is_virtual(inode) { + return Err(linux_errno::eperm()); + } + self.inner.removexattr(ctx, inode, name) + } + + fn opendir( + &self, + ctx: Context, + inode: Inode, + flags: u32, + ) -> io::Result<(Option, OpenOptions)> { + self.inner.opendir(ctx, inode, flags) + } + + fn readdir( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + size: u32, + offset: u64, + add_entry: F, + ) -> io::Result<()> + where + F: FnMut(DirEntry) -> io::Result, + { + self.inner + 
.readdir(ctx, inode, handle, size, offset, add_entry) + } + + fn readdirplus( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + size: u32, + offset: u64, + add_entry: F, + ) -> io::Result<()> + where + F: FnMut(DirEntry, Entry) -> io::Result, + { + self.inner + .readdirplus(ctx, inode, handle, size, offset, add_entry) + } + + fn fsyncdir( + &self, + ctx: Context, + inode: Inode, + datasync: bool, + handle: Handle, + ) -> io::Result<()> { + self.inner.fsyncdir(ctx, inode, datasync, handle) + } + + fn releasedir(&self, ctx: Context, inode: Inode, flags: u32, handle: Handle) -> io::Result<()> { + self.inner.releasedir(ctx, inode, flags, handle) + } + + fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> { + if self.is_virtual(inode) { + if mask & (libc::W_OK as u32) != 0 { + return Err(linux_errno::eacces()); + } + return Ok(()); + } + self.inner.access(ctx, inode, mask) + } + + fn lseek( + &self, + ctx: Context, + inode: Inode, + _handle: Handle, + offset: u64, + whence: u32, + ) -> io::Result { + { + let inodes = self.inodes.read().unwrap(); + if let Some(vnode) = inodes.get(&inode) { + let size = vnode.data().ok_or_else(linux_errno::eisdir)?.len() as u64; + // FUSE lseek is only called for SEEK_DATA/SEEK_HOLE. + return match whence as i32 { + libc::SEEK_DATA => { + if offset < size { + Ok(offset) + } else { + Err(linux_errno::enxio()) + } + } + libc::SEEK_HOLE => { + if offset < size { + Ok(size) + } else { + Err(linux_errno::enxio()) + } + } + _ => Err(linux_errno::einval()), + }; + } + } + self.inner.lseek(ctx, inode, _handle, offset, whence) + } + + fn copyfilerange( + &self, + ctx: Context, + inode_in: Inode, + handle_in: Handle, + offset_in: u64, + inode_out: Inode, + handle_out: Handle, + offset_out: u64, + len: u64, + flags: u64, + ) -> io::Result { + // Virtual inodes don't have real file descriptors, so copy_file_range + // cannot work. Return EXDEV to tell the kernel to fall back to + // read+write. 
+        if self.is_virtual(inode_in) || self.is_virtual(inode_out) {
+            return Err(linux_errno::exdev());
+        }
+        self.inner.copyfilerange(
+            ctx, inode_in, handle_in, offset_in, inode_out, handle_out, offset_out, len, flags,
+        )
+    }
+
+    /// Maps `len` bytes of a file into the shared-memory window at `moffset`.
+    ///
+    /// For a virtual file on Linux, an anonymous private mapping is placed at
+    /// `host_shm_base + moffset` and filled with the file content starting at
+    /// `foffset`; bytes past the end of the content stay zero. Requests for
+    /// non-virtual inodes are forwarded to the inner filesystem.
+    ///
+    /// Returns EISDIR for a virtual directory and EINVAL when the requested
+    /// range overflows or does not fit inside `shm_size`.
+    fn setupmapping(
+        &self,
+        ctx: Context,
+        inode: Inode,
+        handle: Handle,
+        foffset: u64,
+        len: u64,
+        flags: u64,
+        moffset: u64,
+        host_shm_base: u64,
+        shm_size: u64,
+        #[cfg(target_os = "macos")] map_sender: &Option<Sender<WorkerMessage>>,
+    ) -> io::Result<()> {
+        {
+            let inodes = self.inodes.read().unwrap();
+            if let Some(vnode) = inodes.get(&inode) {
+                let data = vnode.data().ok_or_else(linux_errno::eisdir)?;
+                #[cfg(target_os = "linux")]
+                {
+                    // `moffset` and `len` are request-supplied. A wrapping sum could
+                    // pass the window check below and steer the MAP_FIXED mapping
+                    // outside the window, so every addition is checked.
+                    let end = moffset.checked_add(len).ok_or_else(linux_errno::einval)?;
+                    if end > shm_size {
+                        return Err(linux_errno::einval());
+                    }
+                    let addr = host_shm_base
+                        .checked_add(moffset)
+                        .ok_or_else(linux_errno::einval)?;
+                    let map_len = usize::try_from(len).map_err(|_| linux_errno::einval())?;
+
+                    // SAFETY: `[addr, addr + map_len)` was verified above to lie inside
+                    // `[host_shm_base, host_shm_base + shm_size)`.
+                    // NOTE(review): assumes the caller passes a window this process
+                    // reserved for DAX mappings — confirm at the call site.
+                    let ret = unsafe {
+                        libc::mmap(
+                            addr as *mut libc::c_void,
+                            map_len,
+                            libc::PROT_READ | libc::PROT_WRITE,
+                            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_FIXED,
+                            -1,
+                            0,
+                        )
+                    };
+                    if std::ptr::eq(ret, libc::MAP_FAILED) {
+                        return Err(io::Error::last_os_error());
+                    }
+
+                    // An offset that does not fit in `usize` is necessarily past the
+                    // end of the content, so it is clamped and copies nothing.
+                    let foff = usize::try_from(foffset).unwrap_or(usize::MAX);
+                    if foff < data.len() {
+                        let available = data.len() - foff;
+                        let to_copy = map_len.min(available);
+                        // SAFETY: the source range `foff..foff + to_copy` is within
+                        // `data`, and the destination is the fresh mapping of
+                        // `map_len >= to_copy` bytes created above.
+                        unsafe {
+                            libc::memcpy(
+                                addr as *mut libc::c_void,
+                                data.as_ptr().add(foff) as *const _,
+                                to_copy,
+                            )
+                        };
+                    }
+
+                    return Ok(());
+                }
+
+                // TODO: implement DAX for virtual files on macOS.
+                // Needs a shared memory region manager (see setupmapping
+                // in macos/passthrough.rs for the real-file DAX path).
+ #[cfg(target_os = "macos")] + { + let _ = data; + return Err(linux_errno::enosys()); + } + } + } + self.inner.setupmapping( + ctx, + inode, + handle, + foffset, + len, + flags, + moffset, + host_shm_base, + shm_size, + #[cfg(target_os = "macos")] + map_sender, + ) + } + + fn removemapping( + &self, + ctx: Context, + requests: Vec, + host_shm_base: u64, + shm_size: u64, + #[cfg(target_os = "macos")] map_sender: &Option>, + ) -> io::Result<()> { + self.inner.removemapping( + ctx, + requests, + host_shm_base, + shm_size, + #[cfg(target_os = "macos")] + map_sender, + ) + } + + fn ioctl( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + flags: u32, + cmd: u32, + arg: u64, + in_size: u32, + out_size: u32, + exit_code: &Arc, + ) -> io::Result> { + // The ioctl cmd values use Linux encoding regardless of host OS + // because the guest always runs Linux. + const VIRTIO_IOC_EXIT_CODE_REQ: u32 = 0x7602; + + match cmd { + VIRTIO_IOC_EXIT_CODE_REQ => { + exit_code.store(arg as i32, Ordering::SeqCst); + Ok(Vec::new()) + } + _ => self.inner.ioctl( + ctx, inode, handle, flags, cmd, arg, in_size, out_size, exit_code, + ), + } + } +} diff --git a/src/devices/src/virtio/fs/device.rs b/src/devices/src/virtio/fs/device.rs index bc877bc24..e66799c75 100644 --- a/src/devices/src/virtio/fs/device.rs +++ b/src/devices/src/virtio/fs/device.rs @@ -17,6 +17,7 @@ use super::super::{ VirtioShmRegion, }; use super::passthrough; +use super::virtual_entry::VirtualDirEntry; use super::worker::FsWorker; use super::ExportTable; use super::{defs, defs::uapi}; @@ -46,8 +47,9 @@ pub struct Fs { device_state: DeviceState, config: VirtioFsConfig, shm_region: Option, - passthrough_cfg: passthrough::Config, + passthrough_cfg: Option, read_only: bool, + virtual_entries: Vec, worker_thread: Option>, worker_stopfd: EventFd, exit_code: Arc, @@ -58,10 +60,10 @@ pub struct Fs { impl Fs { pub fn new( fs_id: String, - shared_dir: String, + shared_dir: Option, exit_code: Arc, - allow_root_dir_delete: 
bool, read_only: bool, + virtual_entries: Vec, ) -> super::Result { let avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX); @@ -70,11 +72,10 @@ impl Fs { config.tag[..tag.len()].copy_from_slice(tag.as_slice()); config.num_request_queues = 1; - let fs_cfg = passthrough::Config { - root_dir: shared_dir, - allow_root_dir_delete, + let fs_cfg = shared_dir.map(|root_dir| passthrough::Config { + root_dir, ..Default::default() - }; + }); Ok(Fs { avail_features, @@ -84,6 +85,7 @@ impl Fs { shm_region: None, passthrough_cfg: fs_cfg, read_only, + virtual_entries, worker_thread: None, worker_stopfd: EventFd::new(EFD_NONBLOCK).map_err(FsError::EventFd)?, exit_code, @@ -103,10 +105,14 @@ impl Fs { pub fn set_export_table(&mut self, export_table: ExportTable) -> u64 { static FS_UNIQUE_ID: AtomicU64 = AtomicU64::new(0); - self.passthrough_cfg.export_fsid = FS_UNIQUE_ID.fetch_add(1, Ordering::Relaxed); - self.passthrough_cfg.export_table = Some(export_table); + let cfg = self + .passthrough_cfg + .as_mut() + .expect("export_table requires a passthrough filesystem"); + cfg.export_fsid = FS_UNIQUE_ID.fetch_add(1, Ordering::Relaxed); + cfg.export_table = Some(export_table); - self.passthrough_cfg.export_fsid + cfg.export_fsid } #[cfg(target_os = "macos")] @@ -180,6 +186,7 @@ impl VirtioDevice for Fs { queue_evts.push(dq.event); } + let virtual_entries = self.virtual_entries.clone(); let worker = FsWorker::new( worker_queues, queue_evts, @@ -188,6 +195,7 @@ impl VirtioDevice for Fs { self.shm_region.clone(), self.passthrough_cfg.clone(), self.read_only, + virtual_entries, self.worker_stopfd.try_clone().unwrap(), self.exit_code.clone(), #[cfg(target_os = "macos")]
diff --git a/src/devices/src/virtio/fs/inode_alloc.rs b/src/devices/src/virtio/fs/inode_alloc.rs
new file mode 100644
index 000000000..1919b1406
--- /dev/null
+++ b/src/devices/src/virtio/fs/inode_alloc.rs
@@ -0,0 +1,37 @@
+use std::sync::atomic::{AtomicU64, Ordering};
+
+use super::fuse;
+
+/// Allocates unique FUSE inode numbers.
+///
+/// FUSE inode numbers are opaque identifiers with two reserved values:
+/// - `0` — invalid / negative-entry cache sentinel (never allocated)
+/// - `1` (`ROOT_ID`) — the root directory of the filesystem
+///
+/// All other numbers are allocated sequentially starting from `ROOT_ID + 1`.
+/// The allocator is `Send + Sync` and safe to share across threads.
+#[derive(Debug)]
+pub struct InodeAllocator {
+    next: AtomicU64,
+}
+
+impl InodeAllocator {
+    /// Create an allocator whose first allocated number is `ROOT_ID + 1`.
+    pub fn new() -> Self {
+        Self {
+            next: AtomicU64::new(fuse::ROOT_ID + 1),
+        }
+    }
+
+    /// Allocate the next inode number. Each call returns a unique value.
+    pub fn next(&self) -> u64 {
+        self.next.fetch_add(1, Ordering::Relaxed)
+    }
+}
+
+impl Default for InodeAllocator {
+    /// Equivalent to [`InodeAllocator::new`].
+    fn default() -> Self {
+        Self::new()
+    }
+}
diff --git a/src/devices/src/virtio/fs/linux/passthrough.rs b/src/devices/src/virtio/fs/linux/passthrough.rs index e5ca21a03..8272a7e01 100644 --- a/src/devices/src/virtio/fs/linux/passthrough.rs +++ b/src/devices/src/virtio/fs/linux/passthrough.rs @@ -16,7 +16,7 @@ use std::sync::{Arc, RwLock}; use std::time::Duration; use caps::{has_cap, CapSet, Capability}; -use nix::{request_code_none, request_code_read}; +use nix::request_code_read; use vm_memory::ByteValued; @@ -25,15 +25,13 @@ use super::super::filesystem::{ ListxattrReply, OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::super::fuse; +use super::super::inode_alloc::InodeAllocator; use super::super::multikey::MultikeyBTreeMap; const CURRENT_DIR_CSTR: &[u8] = b".\0"; const PARENT_DIR_CSTR: &[u8] = b"..\0"; const EMPTY_CSTR: &[u8] = b"\0"; const PROC_CSTR: &[u8] = b"/proc/self/fd\0"; -const INIT_CSTR: &[u8] = b"init.krun\0"; - -static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); type Inode = u64; type Handle = u64; @@ -327,7 +325,6 @@ pub struct Config { pub export_fsid: u64, /// Table of exported FDs to share with other subsystems.
pub export_table: Option, - pub allow_root_dir_delete: bool, } impl Default for Config { @@ -342,7 +339,6 @@ impl Default for Config { proc_sfd_rawfd: None, export_fsid: 0, export_table: None, - allow_root_dir_delete: false, } } } @@ -358,14 +354,12 @@ pub struct PassthroughFs { // documentation of the `O_PATH` flag in `open(2)` for more details on what one can and cannot // do with an fd opened with this flag. inodes: RwLock>>, - next_inode: AtomicU64, - init_inode: u64, + inode_alloc: Arc, // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be // used for reading and writing data. handles: RwLock>>, next_handle: AtomicU64, - init_handle: u64, // File descriptor pointing to the `/proc/self/fd` directory. This is used to convert an fd from // `inodes` into one that can go into `handles`. This is accomplished by reading the @@ -392,7 +386,7 @@ enum FileOrLink { } impl PassthroughFs { - pub fn new(cfg: Config) -> io::Result { + pub fn new(cfg: Config, inode_alloc: Arc) -> io::Result { let fd = if let Some(fd) = cfg.proc_sfd_rawfd { fd } else { @@ -438,12 +432,10 @@ impl PassthroughFs { Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), - next_inode: AtomicU64::new(fuse::ROOT_ID + 2), - init_inode: fuse::ROOT_ID + 1, + inode_alloc, handles: RwLock::new(BTreeMap::new()), next_handle: AtomicU64::new(1), - init_handle: 0, proc_self_fd, @@ -579,7 +571,7 @@ impl PassthroughFs { // There is a possible race here where 2 threads end up adding the same file // into the inode list. However, since each of those will get a unique Inode // value and unique file descriptors this shouldn't be that much of a problem. 
- let inode = self.next_inode.fetch_add(1, Ordering::Relaxed); + let inode = self.inode_alloc.next(); self.inodes.write().unwrap().insert( inode, InodeAltKey { @@ -992,25 +984,7 @@ impl FileSystem for PassthroughFs { fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result { debug!("do_lookup: {name:?}"); - let init_name = unsafe { CStr::from_bytes_with_nul_unchecked(INIT_CSTR) }; - - if self.init_inode != 0 && name == init_name { - let mut st: libc::stat64 = unsafe { mem::zeroed() }; - st.st_size = INIT_BINARY.len() as i64; - st.st_ino = self.init_inode; - st.st_mode = 0o100_755; - - Ok(Entry { - inode: self.init_inode, - generation: 0, - attr: st, - attr_flags: 0, - attr_timeout: self.cfg.attr_timeout, - entry_timeout: self.cfg.entry_timeout, - }) - } else { - self.do_lookup(parent, name) - } + self.do_lookup(parent, name) } fn forget(&self, _ctx: Context, inode: Inode, count: u64) { @@ -1129,11 +1103,7 @@ impl FileSystem for PassthroughFs { kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { - if inode == self.init_inode { - Ok((Some(self.init_handle), OpenOptions::empty())) - } else { - self.do_open(inode, kill_priv, flags) - } + self.do_open(inode, kill_priv, flags) } fn release( @@ -1234,16 +1204,6 @@ impl FileSystem for PassthroughFs { _flags: u32, ) -> io::Result { debug!("read: {inode:?}"); - if inode == self.init_inode { - let off: usize = offset.try_into().map_err(|_| einval())?; - let len = if off + (size as usize) < INIT_BINARY.len() { - size as usize - } else { - INIT_BINARY.len() - off - }; - return w.write(&INIT_BINARY[off..(off + len)]); - } - let data = self .handles .read() @@ -1824,10 +1784,6 @@ impl FileSystem for PassthroughFs { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } - if inode == self.init_inode { - return Err(io::Error::from_raw_os_error(libc::ENODATA)); - } - let mut buf = vec![0; size as usize]; // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we 
@@ -2087,36 +2043,6 @@ impl FileSystem for PassthroughFs { debug!("setupmapping: ino {inode:?} addr={addr:x} len={len}"); - if inode == self.init_inode { - let ret = unsafe { - libc::mmap( - addr as *mut libc::c_void, - len as usize, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_FIXED, - -1, - 0, - ) - }; - if std::ptr::eq(ret, libc::MAP_FAILED) { - return Err(io::Error::last_os_error()); - } - - let to_copy = if len as usize > INIT_BINARY.len() { - INIT_BINARY.len() - } else { - len as usize - }; - unsafe { - libc::memcpy( - addr as *mut libc::c_void, - INIT_BINARY.as_ptr() as *const _, - to_copy, - ) - }; - return Ok(()); - } - let file = self.open_inode(inode, open_flags)?; let fd = file.as_raw_fd(); @@ -2175,10 +2101,10 @@ impl FileSystem for PassthroughFs { handle: Self::Handle, _flags: u32, cmd: u32, - arg: u64, + _arg: u64, _in_size: u32, out_size: u32, - exit_code: &Arc, + _exit_code: &Arc, ) -> io::Result> { const VIRTIO_IOC_MAGIC: u8 = b'v'; @@ -2190,14 +2116,6 @@ impl FileSystem for PassthroughFs { VIRTIO_IOC_EXPORT_FD_SIZE ) as u32; - const VIRTIO_IOC_TYPE_EXIT_CODE: u8 = 2; - const VIRTIO_IOC_EXIT_CODE_REQ: u32 = - request_code_none!(VIRTIO_IOC_MAGIC, VIRTIO_IOC_TYPE_EXIT_CODE) as u32; - - const VIRTIO_IOC_REMOVE_ROOT_DIR_CODE: u8 = 3; - const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = - request_code_none!(VIRTIO_IOC_MAGIC, VIRTIO_IOC_REMOVE_ROOT_DIR_CODE) as u32; - match cmd { VIRTIO_IOC_EXPORT_FD_REQ => { if out_size as usize != VIRTIO_IOC_EXPORT_FD_SIZE { @@ -2228,14 +2146,6 @@ impl FileSystem for PassthroughFs { ret.extend_from_slice(&handle.to_ne_bytes()); Ok(ret) } - VIRTIO_IOC_EXIT_CODE_REQ => { - exit_code.store(arg as i32, Ordering::SeqCst); - Ok(Vec::new()) - } - VIRTIO_IOC_REMOVE_ROOT_DIR_REQ if self.cfg.allow_root_dir_delete => { - std::fs::remove_dir_all(&self.cfg.root_dir)?; - Ok(Vec::new()) - } _ => Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)), } } diff --git 
a/src/devices/src/virtio/fs/macos/passthrough.rs b/src/devices/src/virtio/fs/macos/passthrough.rs index 53680bd92..cf43e0d0c 100644 --- a/src/devices/src/virtio/fs/macos/passthrough.rs +++ b/src/devices/src/virtio/fs/macos/passthrough.rs @@ -8,11 +8,11 @@ use std::collections::HashMap; use std::ffi::{CStr, CString}; use std::fs::File; use std::io; -use std::mem::{self, MaybeUninit}; +use std::mem::MaybeUninit; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::ptr::null_mut; use std::str::FromStr; -use std::sync::atomic::{AtomicBool, AtomicI32, AtomicI64, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU64, Ordering}; use std::sync::{Arc, Mutex, RwLock}; use std::time::Duration; @@ -29,16 +29,14 @@ use super::super::filesystem::{ ListxattrReply, OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::super::fuse; +use super::super::inode_alloc::InodeAllocator; use super::super::multikey::MultikeyBTreeMap; -const INIT_CSTR: &[u8] = b"init.krun\0"; const XATTR_KEY: &[u8] = b"user.containers.override_stat\0"; const SECURITY_CAPABILITY: &[u8] = b"security.capability\0"; const UID_MAX: u32 = u32::MAX - 1; -static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); - type Inode = u64; type Handle = u64; @@ -516,7 +514,6 @@ pub struct Config { pub export_fsid: u64, /// Table of exported FDs to share with other subsystems. Not supported for macos. pub export_table: Option, - pub allow_root_dir_delete: bool, } impl Default for Config { @@ -531,7 +528,6 @@ impl Default for Config { proc_sfd_rawfd: None, export_fsid: 0, export_table: None, - allow_root_dir_delete: false, } } } @@ -543,12 +539,10 @@ impl Default for Config { /// combination of mount namespaces and the pivot_root system call. 
pub struct PassthroughFs { inodes: RwLock>>, - next_inode: AtomicU64, - init_inode: u64, + inode_alloc: Arc, handles: RwLock>>, next_handle: AtomicU64, - init_handle: u64, map_windows: Mutex>, @@ -560,7 +554,7 @@ pub struct PassthroughFs { } impl PassthroughFs { - pub fn new(cfg: Config) -> io::Result { + pub fn new(cfg: Config, inode_alloc: Arc) -> io::Result { let root = CString::new(cfg.root_dir.as_str()).expect("CString::new failed"); // Safe because this doesn't modify any memory and we check the return value. @@ -579,12 +573,10 @@ impl PassthroughFs { Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), - next_inode: AtomicU64::new(fuse::ROOT_ID + 2), - init_inode: fuse::ROOT_ID + 1, + inode_alloc, handles: RwLock::new(BTreeMap::new()), next_handle: AtomicU64::new(1), - init_handle: 0, map_windows: Mutex::new(HashMap::new()), @@ -723,7 +715,7 @@ impl PassthroughFs { // There is a possible race here where 2 threads end up adding the same file // into the inode list. However, since each of those will get a unique Inode // value and unique file descriptors this shouldn't be that much of a problem. 
- let inode = self.next_inode.fetch_add(1, Ordering::Relaxed); + let inode = self.inode_alloc.next(); self.inodes.write().unwrap().insert( inode, InodeAltKey { @@ -1201,25 +1193,7 @@ impl FileSystem for PassthroughFs { fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result { debug!("lookup: {name:?}"); - let _init_name = unsafe { CStr::from_bytes_with_nul_unchecked(INIT_CSTR) }; - - if self.init_inode != 0 && name == _init_name { - let mut st: bindings::stat64 = unsafe { mem::zeroed() }; - st.st_size = INIT_BINARY.len() as i64; - st.st_ino = self.init_inode; - st.st_mode = 0o100_755; - - Ok(Entry { - inode: self.init_inode, - generation: 0, - attr: st, - attr_flags: 0, - attr_timeout: self.cfg.attr_timeout, - entry_timeout: self.cfg.entry_timeout, - }) - } else { - self.do_lookup(parent, name) - } + self.do_lookup(parent, name) } fn forget(&self, _ctx: Context, inode: Inode, count: u64) { @@ -1339,11 +1313,7 @@ impl FileSystem for PassthroughFs { kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { - if inode == self.init_inode { - Ok((Some(self.init_handle), OpenOptions::empty())) - } else { - self.do_open(inode, kill_priv, flags) - } + self.do_open(inode, kill_priv, flags) } fn release( @@ -1456,18 +1426,6 @@ impl FileSystem for PassthroughFs { _flags: u32, ) -> io::Result { debug!("read: {inode:?}"); - if inode == self.init_inode { - let off: usize = offset - .try_into() - .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?; - let len = if off + (size as usize) < INIT_BINARY.len() { - size as usize - } else { - INIT_BINARY.len() - off - }; - return w.write(&INIT_BINARY[off..(off + len)]); - } - let data = self .handles .read() @@ -2053,10 +2011,6 @@ impl FileSystem for PassthroughFs { return Err(linux_error(io::Error::from_raw_os_error(libc::ENOSYS))); } - if inode == self.init_inode { - return Err(linux_error(io::Error::from_raw_os_error(libc::ENODATA))); - } - if name.to_bytes() == XATTR_KEY { return 
Err(linux_error(io::Error::from_raw_os_error(libc::EACCES))); } @@ -2469,34 +2423,4 @@ impl FileSystem for PassthroughFs { Ok(()) } - - fn ioctl( - &self, - _ctx: Context, - _inode: Self::Inode, - _handle: Self::Handle, - _flags: u32, - cmd: u32, - arg: u64, - _in_size: u32, - _out_size: u32, - exit_code: &Arc, - ) -> io::Result> { - // We can't use nix::request_code_none here since it's system-dependent - // and we need the value from Linux. - const VIRTIO_IOC_EXIT_CODE_REQ: u32 = 0x7602; - const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = 0x7603; - - match cmd { - VIRTIO_IOC_EXIT_CODE_REQ => { - exit_code.store(arg as i32, Ordering::SeqCst); - Ok(Vec::new()) - } - VIRTIO_IOC_REMOVE_ROOT_DIR_REQ if self.cfg.allow_root_dir_delete => { - std::fs::remove_dir_all(&self.cfg.root_dir)?; - Ok(Vec::new()) - } - _ => Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)), - } - } } diff --git a/src/devices/src/virtio/fs/mod.rs b/src/devices/src/virtio/fs/mod.rs index 7ce9d48c2..f8ef63295 100644 --- a/src/devices/src/virtio/fs/mod.rs +++ b/src/devices/src/virtio/fs/mod.rs @@ -1,11 +1,15 @@ +mod augment_fs; mod device; #[allow(dead_code)] mod filesystem; pub mod fuse; +mod inode_alloc; #[allow(dead_code)] mod multikey; +mod null_fs; mod read_only; mod server; +pub mod virtual_entry; mod worker; #[cfg(target_os = "linux")] diff --git a/src/devices/src/virtio/fs/null_fs.rs b/src/devices/src/virtio/fs/null_fs.rs new file mode 100644 index 000000000..0c9d895f4 --- /dev/null +++ b/src/devices/src/virtio/fs/null_fs.rs @@ -0,0 +1,45 @@ +// A minimal filesystem that serves an empty root directory. +// +// Used with AugmentFs to provide a virtual-only filesystem (e.g. for +// booting from a block device where the virtiofs root only needs init.krun). 
+ +use std::io; +use std::mem; +use std::time::Duration; + +use super::filesystem::{Context, FileSystem, FsOptions}; +use super::fuse; +use super::virtual_entry::VIRTUAL_BLKSIZE; +use crate::virtio::bindings; + +/// An empty filesystem with just a root directory and nothing in it. +pub struct NullFs; + +type Inode = u64; +type Handle = u64; + +impl FileSystem for NullFs { + type Inode = Inode; + type Handle = Handle; + + fn init(&self, _capable: FsOptions) -> io::Result { + Ok(FsOptions::empty()) + } + + fn getattr( + &self, + _ctx: Context, + inode: Inode, + _handle: Option, + ) -> io::Result<(bindings::stat64, Duration)> { + if inode == fuse::ROOT_ID { + let mut st: bindings::stat64 = unsafe { mem::zeroed() }; + st.st_ino = fuse::ROOT_ID; + st.st_mode = libc::S_IFDIR | 0o755; + st.st_nlink = 2; + st.st_blksize = VIRTUAL_BLKSIZE as _; + return Ok((st, Duration::MAX)); + } + Err(io::Error::from_raw_os_error(libc::ENOENT)) + } +} diff --git a/src/devices/src/virtio/fs/read_only.rs b/src/devices/src/virtio/fs/read_only.rs index e975f2dda..5495db1ed 100644 --- a/src/devices/src/virtio/fs/read_only.rs +++ b/src/devices/src/virtio/fs/read_only.rs @@ -25,6 +25,7 @@ use super::filesystem::{ OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::fuse; +use super::inode_alloc::InodeAllocator; use super::passthrough::{self, PassthroughFs}; use crate::virtio::bindings; @@ -35,10 +36,6 @@ fn erofs() -> io::Error { io::Error::from_raw_os_error(libc::EROFS) } -// Keep the Linux ioctl number so read-only virtio-fs can still handle -// non-mutating control ioctls while rejecting host-side root deletion. 
-const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = 0x7603; - fn read_only_open_flags(flags: u32) -> io::Result { let f = flags as i32; if f & libc::O_ACCMODE != libc::O_RDONLY { @@ -60,9 +57,9 @@ pub struct PassthroughFsRo { } impl PassthroughFsRo { - pub fn new(cfg: passthrough::Config) -> io::Result { + pub fn new(cfg: passthrough::Config, inode_alloc: Arc) -> io::Result { Ok(Self { - inner: PassthroughFs::new(cfg)?, + inner: PassthroughFs::new(cfg, inode_alloc)?, }) } } @@ -318,10 +315,6 @@ impl FileSystem for PassthroughFsRo { out_size: u32, exit_code: &Arc, ) -> io::Result> { - if cmd == VIRTIO_IOC_REMOVE_ROOT_DIR_REQ { - return Err(erofs()); - } - self.inner.ioctl( ctx, inode, handle, flags, cmd, arg, in_size, out_size, exit_code, ) diff --git a/src/devices/src/virtio/fs/virtual_entry.rs b/src/devices/src/virtio/fs/virtual_entry.rs new file mode 100644 index 000000000..06f6915b3 --- /dev/null +++ b/src/devices/src/virtio/fs/virtual_entry.rs @@ -0,0 +1,56 @@ +// Virtual entry types for the virtiofs overlay. + +use std::ffi::CString; + +/// Block size reported by virtual entries in st_blksize. +pub const VIRTUAL_BLKSIZE: i64 = 4096; + +/// A synthetic filesystem entry that exists only in memory. +#[derive(Clone, Debug)] +pub struct VirtualEntry { + /// Permission bits. File type bits (S_IFMT) are ignored — the type + /// is derived from the `content` variant. + pub mode: u32, + /// If true, the entry can only be looked up once. + pub one_shot: bool, + pub content: VirtualEntryContent, +} + +#[derive(Clone, Debug)] +pub enum VirtualEntryContent { + /// A read-only file backed by a static byte slice. + File { data: &'static [u8] }, + /// A directory containing other virtual entries. + Dir { children: Vec }, +} + +impl VirtualEntry { + pub fn is_dir(&self) -> bool { + matches!(self.content, VirtualEntryContent::Dir { .. }) + } + + /// Returns the full st_mode: file type bits from the variant OR'd + /// with the permission bits from self.mode. 
+ #[allow(clippy::unnecessary_cast)] // libc::S_IF* is u16 on macOS, u32 on Linux + pub fn st_mode(&self) -> u32 { + let file_type = match self.content { + VirtualEntryContent::File { .. } => libc::S_IFREG as u32, + VirtualEntryContent::Dir { .. } => libc::S_IFDIR as u32, + }; + file_type | (self.mode & !(libc::S_IFMT as u32)) + } + + pub fn data(&self) -> Option<&'static [u8]> { + match &self.content { + VirtualEntryContent::File { data } => Some(data), + VirtualEntryContent::Dir { .. } => None, + } + } +} + +/// A named entry in a virtual directory. +#[derive(Clone, Debug)] +pub struct VirtualDirEntry { + pub name: CString, + pub entry: VirtualEntry, +} diff --git a/src/devices/src/virtio/fs/worker.rs b/src/devices/src/virtio/fs/worker.rs index c612b3e9b..b8e722b5d 100644 --- a/src/devices/src/virtio/fs/worker.rs +++ b/src/devices/src/virtio/fs/worker.rs @@ -14,16 +14,21 @@ use utils::eventfd::EventFd; use vm_memory::GuestMemoryMmap; use super::super::{FsError, Queue}; +use super::augment_fs::AugmentFs; use super::defs::{HPQ_INDEX, REQ_INDEX}; use super::descriptor_utils::{Reader, Writer}; +use super::inode_alloc::InodeAllocator; +use super::null_fs::NullFs; use super::passthrough::{self, PassthroughFs}; use super::read_only::PassthroughFsRo; use super::server::Server; +use super::virtual_entry::VirtualDirEntry; use crate::virtio::{InterruptTransport, VirtioShmRegion}; enum FsServer { - ReadWrite(Server), - ReadOnly(Server), + ReadWrite(Server>), + ReadOnly(Server>), + Null(Server>), } impl FsServer { @@ -52,6 +57,14 @@ impl FsServer { #[cfg(target_os = "macos")] map_sender, ), + FsServer::Null(s) => s.handle_message( + r, + w, + shm_region, + exit_code, + #[cfg(target_os = "macos")] + map_sender, + ), } } } @@ -77,16 +90,36 @@ impl FsWorker { interrupt: InterruptTransport, mem: GuestMemoryMmap, shm_region: Option, - passthrough_cfg: passthrough::Config, + passthrough_cfg: Option, read_only: bool, + virtual_entries: Vec, stop_fd: EventFd, exit_code: Arc, 
#[cfg(target_os = "macos")] map_sender: Option>, ) -> Result { - let server = if read_only { - FsServer::ReadOnly(Server::new(PassthroughFsRo::new(passthrough_cfg)?)) - } else { - FsServer::ReadWrite(Server::new(PassthroughFs::new(passthrough_cfg)?)) + let inode_alloc = Arc::new(InodeAllocator::new()); + let server = match passthrough_cfg { + Some(cfg) if read_only => { + let inner = PassthroughFsRo::new(cfg, inode_alloc.clone())?; + FsServer::ReadOnly(Server::new(AugmentFs::new( + inner, + &inode_alloc, + virtual_entries, + ))) + } + Some(cfg) => { + let inner = PassthroughFs::new(cfg, inode_alloc.clone())?; + FsServer::ReadWrite(Server::new(AugmentFs::new( + inner, + &inode_alloc, + virtual_entries, + ))) + } + None => FsServer::Null(Server::new(AugmentFs::new( + NullFs, + &inode_alloc, + virtual_entries, + ))), }; Ok(Self { queues, diff --git a/src/devices/src/virtio/linux_errno.rs b/src/devices/src/virtio/linux_errno.rs index 59aca5789..105f977b5 100644 --- a/src/devices/src/virtio/linux_errno.rs +++ b/src/devices/src/virtio/linux_errno.rs @@ -183,3 +183,37 @@ pub fn linux_errno_raw(errno: i32) -> i32 { _ => LINUX_EIO, } } + +// Helper functions returning io::Error with Linux errno values. 
+use std::io; + +pub fn eperm() -> io::Error { + io::Error::from_raw_os_error(LINUX_EPERM) +} +pub fn enoent() -> io::Error { + io::Error::from_raw_os_error(LINUX_ENOENT) +} +pub fn eacces() -> io::Error { + io::Error::from_raw_os_error(LINUX_EACCES) +} +pub fn eexist() -> io::Error { + io::Error::from_raw_os_error(LINUX_EEXIST) +} +pub fn einval() -> io::Error { + io::Error::from_raw_os_error(LINUX_EINVAL) +} +pub fn eisdir() -> io::Error { + io::Error::from_raw_os_error(LINUX_EISDIR) +} +pub fn exdev() -> io::Error { + io::Error::from_raw_os_error(LINUX_EXDEV) +} +pub fn enosys() -> io::Error { + io::Error::from_raw_os_error(LINUX_ENOSYS) +} +pub fn enodata() -> io::Error { + io::Error::from_raw_os_error(LINUX_ENODATA) +} +pub fn enxio() -> io::Error { + io::Error::from_raw_os_error(LINUX_ENXIO) +} diff --git a/src/init-blob/Cargo.toml b/src/init-blob/Cargo.toml new file mode 100644 index 000000000..c984f1ea6 --- /dev/null +++ b/src/init-blob/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "init-blob" +version = "0.1.0" +edition = "2021" +description = "Default init binary blob for libkrun guests" +license = "Apache-2.0" +repository = "https://github.com/containers/libkrun" +build = "build.rs" + +[lib] +path = "src/lib.rs" diff --git a/src/devices/build.rs b/src/init-blob/build.rs similarity index 75% rename from src/devices/build.rs rename to src/init-blob/build.rs index 49a4346d2..0482edf3d 100644 --- a/src/devices/build.rs +++ b/src/init-blob/build.rs @@ -46,20 +46,31 @@ fn build_default_init() -> PathBuf { .unwrap_or_else(|e| panic!("failed to execute {cc}: {e}")); if !status.success() { - panic!("failed to compile init/init.c: {status}"); + panic!("failed to compile init/init.c with {cc}: {status}"); } + init_bin } fn main() { + let manifest_dir = PathBuf::from(std::env::var_os("CARGO_MANIFEST_DIR").unwrap()); + let repo_init_bin = manifest_dir.join("../..").join("init/init"); + println!("cargo:rerun-if-changed={}", repo_init_bin.display()); + let 
init_binary_path = std::env::var_os("KRUN_INIT_BINARY_PATH") .map(PathBuf::from) - .unwrap_or_else(|| { - let init_path = build_default_init(); - // SAFETY: The build script is single threaded. - unsafe { std::env::set_var("KRUN_INIT_BINARY_PATH", &init_path) }; - init_path - }); + .or_else(|| { + if repo_init_bin.exists() { + Some(repo_init_bin) + } else { + None + } + }) + .unwrap_or_else(build_default_init); + + // SAFETY: The build script is single threaded. + unsafe { std::env::set_var("KRUN_INIT_BINARY_PATH", &init_binary_path) }; + println!( "cargo:rustc-env=KRUN_INIT_BINARY_PATH={}", init_binary_path.display() diff --git a/src/init-blob/src/lib.rs b/src/init-blob/src/lib.rs new file mode 100644 index 000000000..4397da679 --- /dev/null +++ b/src/init-blob/src/lib.rs @@ -0,0 +1 @@ +pub static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); diff --git a/src/libkrun/Cargo.toml b/src/libkrun/Cargo.toml index 24db7a9ff..3aa4402de 100644 --- a/src/libkrun/Cargo.toml +++ b/src/libkrun/Cargo.toml @@ -32,6 +32,7 @@ krun_display = { package = "krun-display", version = "0.1.0", path = "../display krun_input = { package = "krun-input", version = "0.1.0", path = "../input", optional = true, features = ["bindgen_clang_runtime"] } devices = { package = "krun-devices", version = "=0.1.0-1.18.0", path = "../devices" } +init-blob = { path = "../init-blob" } polly = { package = "krun-polly", version = "=0.1.0-1.18.0", path = "../polly" } utils = { package = "krun-utils", version = "=0.1.0-1.18.0", path = "../utils" } vmm = { package = "krun-vmm", version = "=0.1.0-1.18.0", path = "../vmm" } diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 1d8b3fcb1..d7e5f6100 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -14,16 +14,15 @@ use env_logger::{Env, Target}; #[cfg(feature = "gpu")] use krun_display::DisplayBackend; +#[cfg(not(feature = "tee"))] +use devices::virtio::fs::virtual_entry::{VirtualDirEntry, VirtualEntry, 
VirtualEntryContent}; use libc::{c_char, c_int, size_t}; use once_cell::sync::Lazy; use polly::event_manager::EventManager; -#[cfg(all(feature = "blk", not(feature = "tee")))] -use rand::distr::{Alphanumeric, SampleString}; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::convert::TryInto; use std::env; -#[cfg(target_os = "linux")] use std::ffi::CString; use std::ffi::{c_void, CStr}; use std::fs::File; @@ -90,6 +89,23 @@ static KRUN_NITRO_DEBUG: Mutex = Mutex::new(false); // Path to the init binary to be executed inside the VM. const INIT_PATH: &str = "/init.krun"; +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +const DEFAULT_INIT_PAYLOAD: &[u8] = init_blob::INIT_BINARY; + +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +fn init_virtual_entry() -> VirtualDirEntry { + VirtualDirEntry { + name: CString::new("init.krun").unwrap(), + entry: VirtualEntry { + mode: 0o755, + one_shot: true, + content: VirtualEntryContent::File { + data: DEFAULT_INIT_PAYLOAD, + }, + }, + } +} + static KRUNFW: LazyLock> = LazyLock::new(|| unsafe { libloading::Library::new(KRUNFW_NAME).ok() }); @@ -167,6 +183,8 @@ struct ContextConfig { console_output: Option, vmm_uid: Option, vmm_gid: Option, + #[cfg(not(feature = "tee"))] + disable_implicit_init: bool, } impl ContextConfig { @@ -594,11 +612,17 @@ pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) let cfg = ctx_cfg.get_mut(); cfg.vmr.add_fs_device(FsDeviceConfig { fs_id, - shared_dir, + shared_dir: Some(shared_dir), // Default to a conservative 512 MB window. 
shm_size: Some(1 << 29), - allow_root_dir_delete: false, read_only: false, + virtual_entries: { + let mut v = Vec::new(); + if !cfg.disable_implicit_init { + v.push(init_virtual_entry()); + } + v + }, }); } Entry::Vacant(_) => return -libc::ENOENT, @@ -640,7 +664,7 @@ pub unsafe extern "C" fn krun_add_virtiofs3( shm_size: u64, read_only: bool, ) -> i32 { - if c_tag.is_null() || c_path.is_null() { + if c_tag.is_null() { return -libc::EINVAL; } @@ -648,9 +672,15 @@ pub unsafe extern "C" fn krun_add_virtiofs3( Ok(tag) => tag, Err(_) => return -libc::EINVAL, }; - let path = match CStr::from_ptr(c_path).to_str() { - Ok(path) => path, - Err(_) => return -libc::EINVAL, + + // NULL path means NullFs (virtual-only filesystem, no host directory). + let path = if c_path.is_null() { + None + } else { + match CStr::from_ptr(c_path).to_str() { + Ok(path) => Some(path), + Err(_) => return -libc::EINVAL, + } }; let shm = if shm_size > 0 { @@ -665,12 +695,16 @@ pub unsafe extern "C" fn krun_add_virtiofs3( match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { let cfg = ctx_cfg.get_mut(); + let mut virtual_entries = Vec::new(); + if tag == "/dev/root" && !cfg.disable_implicit_init { + virtual_entries.push(init_virtual_entry()); + } cfg.vmr.add_fs_device(FsDeviceConfig { fs_id: tag.to_string(), - shared_dir: path.to_string(), + shared_dir: path.map(|p| p.to_string()), shm_size: shm, - allow_root_dir_delete: false, read_only, + virtual_entries, }); } Entry::Vacant(_) => return -libc::ENOENT, @@ -2396,25 +2430,35 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( return -libc::EINVAL; } - // To boot from a filesystem other than virtiofs, - // we need to setup a temporary root from which init.krun can be executed. - // Otherwise, it would have to be copied to the target filesystem beforehand. - // Instead, init.krun will run from virtiofs and then switch to the real root. 
- let root_dir_suffix = Alphanumeric.sample_string(&mut rand::rng(), 6); - let empty_root = env::temp_dir().join(format!("krun-empty-root-{root_dir_suffix}")); - - if let Err(e) = std::fs::create_dir_all(&empty_root) { - error!("Failed to create empty root directory: {e:?}"); - return -libc::EINVAL; + // Boot from a block device: the virtiofs root only needs to + // serve init.krun and provide mount points for /dev, /proc, /sys. + // Use a NullFs (no host directory) with the inode overlay. + let mut virtual_entries = Vec::new(); + if !ctx_cfg.disable_implicit_init { + virtual_entries.push(init_virtual_entry()); + } + // init.c needs these directories as mount points before + // pivoting to the block device root. + for name in ["dev", "proc", "sys", "newroot"] { + virtual_entries.push(VirtualDirEntry { + name: CString::new(name).unwrap(), + entry: VirtualEntry { + mode: 0o755, + one_shot: false, + content: VirtualEntryContent::Dir { + children: Vec::new(), + }, + }, + }); } ctx_cfg.vmr.add_fs_device(FsDeviceConfig { fs_id: "/dev/root".into(), - shared_dir: empty_root.to_string_lossy().into(), + shared_dir: None, // Default to a conservative 512 MB window. 
shm_size: Some(1 << 29), - allow_root_dir_delete: true, read_only: false, + virtual_entries, }); ctx_cfg.set_block_root(device, fstype, options); @@ -2425,6 +2469,135 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( KRUN_SUCCESS } +#[no_mangle] +#[cfg(not(feature = "tee"))] +pub extern "C" fn krun_disable_implicit_init(ctx_id: u32) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + ctx_cfg.get_mut().disable_implicit_init = true; + } + Entry::Vacant(_) => return -libc::ENOENT, + } + + KRUN_SUCCESS +} + +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +pub unsafe extern "C" fn krun_get_default_init( + data_out: *mut *const u8, + len_out: *mut size_t, +) -> i32 { + if data_out.is_null() || len_out.is_null() { + return -libc::EINVAL; + } + *data_out = DEFAULT_INIT_PAYLOAD.as_ptr(); + *len_out = DEFAULT_INIT_PAYLOAD.len(); + KRUN_SUCCESS +} + +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(not(feature = "tee"))] +pub unsafe extern "C" fn krun_fs_add_overlay_file( + ctx_id: u32, + c_fs_tag: *const c_char, + c_filename: *const c_char, + data: *const u8, + data_len: size_t, + mode: u32, + one_shot: bool, +) -> i32 { + if c_fs_tag.is_null() || c_filename.is_null() || data.is_null() || data_len == 0 { + return -libc::EINVAL; + } + + let fs_tag = match CStr::from_ptr(c_fs_tag).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + let filename = match CString::new(CStr::from_ptr(c_filename).to_bytes()) { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + // SAFETY: The caller guarantees the memory remains valid for the VM + // lifetime (see the C header contract). 
+ let payload: &'static [u8] = slice::from_raw_parts(data, data_len); + + let entry = VirtualDirEntry { + name: filename, + entry: VirtualEntry { + mode, + one_shot, + content: VirtualEntryContent::File { data: payload }, + }, + }; + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + match cfg.vmr.fs.iter_mut().find(|fs| fs.fs_id == fs_tag) { + Some(fs_cfg) => fs_cfg.virtual_entries.push(entry), + None => return -libc::ENOENT, + } + } + Entry::Vacant(_) => return -libc::ENOENT, + } + + KRUN_SUCCESS +} + +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(not(feature = "tee"))] +pub unsafe extern "C" fn krun_fs_add_overlay_dir( + ctx_id: u32, + c_fs_tag: *const c_char, + c_dirname: *const c_char, + mode: u32, +) -> i32 { + if c_fs_tag.is_null() || c_dirname.is_null() { + return -libc::EINVAL; + } + + let fs_tag = match CStr::from_ptr(c_fs_tag).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + let dirname = match CString::new(CStr::from_ptr(c_dirname).to_bytes()) { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + let entry = VirtualDirEntry { + name: dirname, + entry: VirtualEntry { + mode, + one_shot: false, + content: VirtualEntryContent::Dir { + children: Vec::new(), + }, + }, + }; + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + match cfg.vmr.fs.iter_mut().find(|fs| fs.fs_id == fs_tag) { + Some(fs_cfg) => fs_cfg.virtual_entries.push(entry), + None => return -libc::ENOENT, + } + } + Entry::Vacant(_) => return -libc::ENOENT, + } + + KRUN_SUCCESS +} + #[no_mangle] pub extern "C" fn krun_disable_implicit_console(ctx_id: u32) -> i32 { match CTX_MAP.lock().unwrap().entry(ctx_id) { @@ -2874,3 +3047,28 @@ fn krun_start_enter_nitro(ctx_id: u32) -> i32 { } } } + +#[cfg(all(test, not(feature = "tee")))] +mod test_disable_implicit_init { + use super::*; + + #[test] + fn test_disable_implicit_init() { + let 
ctx = unsafe { krun_create_ctx() } as u32; + unsafe { + krun_disable_implicit_init(ctx); + krun_set_root(ctx, c"/tmp".as_ptr()); + } + + let ctx_map = CTX_MAP.lock().unwrap(); + let cfg = ctx_map.get(&ctx).unwrap(); + assert_eq!(cfg.vmr.fs.len(), 1); + assert!( + cfg.vmr.fs[0].virtual_entries.is_empty(), + "root virtiofs should not inject init.krun after krun_disable_implicit_init()" + ); + drop(ctx_map); + + assert_eq!(krun_free_ctx(ctx), KRUN_SUCCESS); + } +} diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 1aa9c5c48..d22d26f8e 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -2055,8 +2055,8 @@ fn attach_fs_devices( config.fs_id.clone(), config.shared_dir.clone(), exit_code.clone(), - config.allow_root_dir_delete, config.read_only, + config.virtual_entries.clone(), ) .unwrap(), )); diff --git a/src/vmm/src/vmm_config/fs.rs b/src/vmm/src/vmm_config/fs.rs index ccf86f5cd..0a2064587 100644 --- a/src/vmm/src/vmm_config/fs.rs +++ b/src/vmm/src/vmm_config/fs.rs @@ -1,8 +1,12 @@ +use devices::virtio::fs::virtual_entry::VirtualDirEntry; + #[derive(Clone, Debug)] pub struct FsDeviceConfig { pub fs_id: String, - pub shared_dir: String, + /// Host directory to pass through. None means a virtual-only filesystem + /// (NullFs + AugmentFs, no host directory). 
+ pub shared_dir: Option, pub shm_size: Option, - pub allow_root_dir_delete: bool, pub read_only: bool, + pub virtual_entries: Vec, } diff --git a/tests/test_cases/src/lib.rs b/tests/test_cases/src/lib.rs index 83f3b6b14..0f0b88290 100644 --- a/tests/test_cases/src/lib.rs +++ b/tests/test_cases/src/lib.rs @@ -22,6 +22,12 @@ use test_multiport_console::TestMultiportConsole; mod test_virtiofs_root_ro; use test_virtiofs_root_ro::TestVirtiofsRootRo; +mod test_augmentfs; +use test_augmentfs::TestAugmentFs; + +mod test_root_disk_remount; +use test_root_disk_remount::TestRootDiskRemount; + mod test_pjdfstest; use test_pjdfstest::TestPjdfstest; @@ -84,6 +90,8 @@ pub fn test_cases() -> Vec { TestCase::new("net-vmnet-helper", Box::new(TestNet::new_vmnet_helper())), TestCase::new("multiport-console", Box::new(TestMultiportConsole)), TestCase::new("virtiofs-root-ro", Box::new(TestVirtiofsRootRo)), + TestCase::new("augmentfs", Box::new(TestAugmentFs)), + TestCase::new("root-disk-remount", Box::new(TestRootDiskRemount)), TestCase::new("virtiofs-misc", Box::new(TestVirtioFsMisc)), TestCase::new("pjdfstest", Box::new(TestPjdfstest)), TestCase::new("perf-net-passt-tx", Box::new(TestNetPerf::new_passt_tx())), diff --git a/tests/test_cases/src/test_augmentfs.rs b/tests/test_cases/src/test_augmentfs.rs new file mode 100644 index 000000000..d402fdfdc --- /dev/null +++ b/tests/test_cases/src/test_augmentfs.rs @@ -0,0 +1,274 @@ +// Test the AugmentFs overlay over a NullFs. +// +// Boots a VM with NO host filesystem — the root virtiofs is backed entirely +// by virtual inodes: init.krun (one-shot), the guest-agent binary (one-shot), +// a .krun_config.json (one-shot), persistent test files, and virtual +// directories as mount points for /dev, /proc, /sys. 
+
+use macros::{guest, host};
+
+/// Test case that boots a VM whose root virtiofs has no host directory and is
+/// populated only through overlay files and directories.
+pub struct TestAugmentFs;
+
+/// Builds the deterministic 8 KiB payload shared by host and guest.
+///
+/// Byte `i` is `i % 251`, so the guest can regenerate the expected content
+/// and compare arbitrary ranges of `/testdata.bin` against it.
+fn make_test_payload() -> Vec<u8> {
+    // `i % 251` is always < 251, so the narrowing cast cannot truncate.
+    (0..8192u32).map(|i| (i % 251) as u8).collect()
+}
+
+#[host]
+mod host {
+    use super::*;
+
+    use crate::{krun_call, krun_call_u32};
+    use crate::{Test, TestSetup};
+    use krun_sys::*;
+    use std::ffi::CString;
+    use std::ptr::null;
+
+    impl Test for TestAugmentFs {
+        /// Configures the VM and enters it.
+        ///
+        /// Disables the implicit init, adds a virtiofs root with a NULL host
+        /// path, registers the overlay directories and files the guest side
+        /// checks for, and finally calls `krun_start_enter`.
+        ///
+        /// Every buffer handed to `krun_fs_add_overlay_file` is `'static`
+        /// (leaked or a literal) so it stays valid after this function's
+        /// locals would otherwise have been dropped.
+        fn start_vm(self: Box<Self>, test_setup: TestSetup) -> anyhow::Result<()> {
+            let test_case = CString::new(test_setup.test_case)?;
+
+            // Read the guest-agent binary into memory. Leaked because
+            // krun_start_enter never returns.
+            let guest_agent_path = std::env::var("KRUN_TEST_GUEST_AGENT_PATH")
+                .expect("KRUN_TEST_GUEST_AGENT_PATH not set");
+            let guest_agent_bytes: &'static [u8] =
+                Vec::leak(std::fs::read(&guest_agent_path).expect("Failed to read guest-agent"));
+
+            // Build JSON config: exec the guest-agent with our test name.
+            let json = format!(
+                r#"{{"args": ["/guest-agent", "{}"], "cwd": "/"}}"#,
+                test_case.to_str()?
+            );
+            let json_bytes: &'static [u8] = Vec::leak(json.into_bytes());
+
+            // Deterministic test payload for range-read tests.
+            let payload: &'static [u8] = Vec::leak(make_test_payload());
+
+            // A small marker file to test persistent reads.
+            let marker: &'static [u8] = b"virtual-file-marker-content-12345";
+
+            unsafe {
+                krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?;
+                let ctx = krun_call_u32!(krun_create_ctx())?;
+                krun_call!(krun_set_vm_config(ctx, 1, 512))?;
+
+                // Disable the implicit init — we'll inject it ourselves.
+                krun_call!(krun_disable_implicit_init(ctx))?;
+
+                // Get the default init binary.
+                let mut init_data: *const u8 = null();
+                let mut init_len: usize = 0;
+                krun_call!(krun_get_default_init(&mut init_data, &mut init_len))?;
+
+                // Set up root with NO host directory (NullFs).
+                krun_call!(krun_add_virtiofs3(
+                    ctx,
+                    c"/dev/root".as_ptr(),
+                    null(), // NULL path → NullFs
+                    0,      // no SHM window
+                    false,  // not read-only
+                ))?;
+
+                // Virtual directories needed by init as mount points.
+                for dir in [c"dev", c"proc", c"sys"] {
+                    krun_call!(krun_fs_add_overlay_dir(
+                        ctx,
+                        c"/dev/root".as_ptr(),
+                        dir.as_ptr(),
+                        0o040_755,
+                    ))?;
+                }
+
+                // Overlay init.krun (one-shot, executable).
+                krun_call!(krun_fs_add_overlay_file(
+                    ctx,
+                    c"/dev/root".as_ptr(),
+                    c"init.krun".as_ptr(),
+                    init_data,
+                    init_len,
+                    0o100_755,
+                    true,
+                ))?;
+
+                // Overlay guest-agent (one-shot, executable). After init
+                // execs it, the file should no longer be visible.
+                krun_call!(krun_fs_add_overlay_file(
+                    ctx,
+                    c"/dev/root".as_ptr(),
+                    c"guest-agent".as_ptr(),
+                    guest_agent_bytes.as_ptr(),
+                    guest_agent_bytes.len(),
+                    0o100_755,
+                    true,
+                ))?;
+
+                // Overlay .krun_config.json (one-shot).
+                krun_call!(krun_fs_add_overlay_file(
+                    ctx,
+                    c"/dev/root".as_ptr(),
+                    c".krun_config.json".as_ptr(),
+                    json_bytes.as_ptr(),
+                    json_bytes.len(),
+                    0o100_644,
+                    true,
+                ))?;
+
+                // Overlay a persistent marker file.
+                krun_call!(krun_fs_add_overlay_file(
+                    ctx,
+                    c"/dev/root".as_ptr(),
+                    c"marker.txt".as_ptr(),
+                    marker.as_ptr(),
+                    marker.len(),
+                    0o100_644,
+                    false,
+                ))?;
+
+                // Overlay a deterministic 8 KiB payload for range-read tests.
+                krun_call!(krun_fs_add_overlay_file(
+                    ctx,
+                    c"/dev/root".as_ptr(),
+                    c"testdata.bin".as_ptr(),
+                    payload.as_ptr(),
+                    payload.len(),
+                    0o100_444,
+                    false,
+                ))?;
+
+                krun_call!(krun_set_workdir(ctx, c"/".as_ptr()))?;
+                krun_call!(krun_start_enter(ctx))?;
+            }
+            Ok(())
+        }
+    }
+}
+
+#[guest]
+mod guest {
+    use super::*;
+    use crate::Test;
+    use std::fs;
+    use std::io::{ErrorKind, Read, Seek, SeekFrom};
+    use std::path::Path;
+
+    impl Test for TestAugmentFs {
+        /// Guest-side checks: one-shot entries are no longer visible, the
+        /// overlay directories serve as mount points, and the persistent
+        /// overlay files are read-only with the expected size and content.
+        ///
+        /// Prints `OK` once every assertion has passed.
+        fn in_guest(self: Box<Self>) {
+            // --- One-shot files should be gone ---
+            assert!(
+                !Path::new("/.krun_config.json").exists(),
+                ".krun_config.json should be gone (one-shot)"
+            );
+            assert!(
+                !Path::new("/init.krun").exists(),
+                "init.krun should be gone (one-shot)"
+            );
+
+            // --- One-shot guest-agent can't see itself ---
+            assert!(
+                !Path::new("/guest-agent").exists(),
+                "guest-agent should be gone (one-shot)"
+            );
+
+            // --- Virtual directories should be accessible ---
+            // init already mounted over these, but let's verify they
+            // exist as directories (the mount points came from our
+            // virtual dir overlay).
+            for dir in ["/dev", "/proc", "/sys"] {
+                let meta = fs::metadata(dir).unwrap_or_else(|e| panic!("{dir} should exist: {e}"));
+                assert!(meta.is_dir(), "{dir} should be a directory");
+            }
+
+            // Verify the mounts actually worked by checking known entries.
+            assert!(
+                Path::new("/dev/null").exists(),
+                "/dev/null should exist (devtmpfs)"
+            );
+            assert!(
+                Path::new("/proc/self").exists(),
+                "/proc/self should exist (procfs)"
+            );
+            assert!(
+                Path::new("/sys/kernel").exists(),
+                "/sys/kernel should exist (sysfs)"
+            );
+
+            // Verify directory listing works on each mounted fs.
+            let dev_entries: Vec<_> = fs::read_dir("/dev").expect("read_dir /dev").collect();
+            assert!(!dev_entries.is_empty(), "/dev listing should not be empty");
+
+            let proc_entries: Vec<_> = fs::read_dir("/proc").expect("read_dir /proc").collect();
+            assert!(
+                !proc_entries.is_empty(),
+                "/proc listing should not be empty"
+            );
+
+            let sys_entries: Vec<_> = fs::read_dir("/sys").expect("read_dir /sys").collect();
+            assert!(!sys_entries.is_empty(), "/sys listing should not be empty");
+
+            // --- Persistent files should still exist ---
+            assert!(Path::new("/marker.txt").exists(), "marker.txt should exist");
+            assert!(
+                Path::new("/testdata.bin").exists(),
+                "testdata.bin should exist"
+            );
+
+            // --- Read + verify marker content ---
+            let content = fs::read_to_string("/marker.txt").expect("read marker.txt");
+            assert_eq!(content, "virtual-file-marker-content-12345");
+
+            // --- Repeated reads return the same data ---
+            let content2 = fs::read_to_string("/marker.txt").expect("re-read marker.txt");
+            assert_eq!(content, content2, "repeated reads differ");
+
+            // --- Write should fail ---
+            let err = fs::OpenOptions::new()
+                .write(true)
+                .open("/marker.txt")
+                .expect_err("write-open should fail");
+            assert_eq!(err.kind(), ErrorKind::PermissionDenied);
+
+            // --- stat reports correct size ---
+            let meta = fs::metadata("/testdata.bin").expect("stat testdata.bin");
+            assert_eq!(meta.len(), 8192, "testdata.bin size mismatch");
+
+            // --- Range reads on the 8 KiB payload ---
+            let expected = make_test_payload();
+            let mut f = fs::File::open("/testdata.bin").expect("open testdata.bin");
+
+            // Full read.
+            let got = fs::read("/testdata.bin").expect("full read");
+            assert_eq!(got, expected, "full read mismatch");
+
+            // Read first 256 bytes.
+            let mut buf = vec![0u8; 256];
+            f.read_exact(&mut buf).expect("read first 256");
+            assert_eq!(buf, &expected[..256], "first 256 bytes mismatch");
+
+            // Seek to offset 4000, read 512 bytes.
+            f.seek(SeekFrom::Start(4000)).expect("seek to 4000");
+            let mut buf = vec![0u8; 512];
+            f.read_exact(&mut buf).expect("read at offset 4000");
+            assert_eq!(buf, &expected[4000..4512], "range [4000..4512] mismatch");
+
+            // Seek to last 10 bytes.
+            f.seek(SeekFrom::End(-10)).expect("seek to end-10");
+            let mut buf = vec![0u8; 10];
+            f.read_exact(&mut buf).expect("read last 10");
+            assert_eq!(buf, &expected[8182..8192], "last 10 bytes mismatch");
+
+            // Read past EOF should return 0 bytes.
+            f.seek(SeekFrom::Start(8192)).expect("seek to EOF");
+            let mut buf = vec![0u8; 100];
+            let n = f.read(&mut buf).expect("read past EOF");
+            assert_eq!(n, 0, "read past EOF should return 0");
+
+            // Seek back to start, re-read, verify consistency.
+            f.seek(SeekFrom::Start(0)).expect("seek to start");
+            let mut full = Vec::new();
+            f.read_to_end(&mut full).expect("read_to_end");
+            assert_eq!(full, expected, "read_to_end mismatch");
+
+            println!("OK");
+        }
+    }
+}
diff --git a/tests/test_cases/src/test_root_disk_remount.rs b/tests/test_cases/src/test_root_disk_remount.rs
new file mode 100644
index 000000000..31db6e8f3
--- /dev/null
+++ b/tests/test_cases/src/test_root_disk_remount.rs
@@ -0,0 +1,162 @@
+// Test that krun_set_root_disk_remount works with NullFs.
+//
+// Creates a tiny ext4 disk image containing only the guest-agent binary,
+// boots from it via krun_set_root_disk_remount (which uses NullFs for the
+// initial virtiofs root with init.krun overlaid), and verifies the guest
+// successfully pivoted to the block device root.
+
+use macros::{guest, host};
+
+/// Test case that boots from an ext4 block device configured through
+/// `krun_set_root_disk_remount`.
+pub struct TestRootDiskRemount;
+
+#[host]
+mod host {
+    use super::*;
+
+    use crate::{krun_call, krun_call_u32, ShouldRun};
+    use crate::{Test, TestSetup};
+    use krun_sys::*;
+    use nix::libc;
+    use std::ffi::{CStr, CString};
+    use std::process::Command;
+    use std::ptr::null;
+
+    /// C signature used to call `krun_add_disk` once resolved via `dlsym`.
+    type KrunAddDiskFn = unsafe extern "C" fn(
+        ctx_id: u32,
+        block_id: *const std::ffi::c_char,
+        disk_path: *const std::ffi::c_char,
+        read_only: bool,
+    ) -> i32;
+
+    /// C signature used to call `krun_set_root_disk_remount` once resolved
+    /// via `dlsym`.
+    type KrunSetRootDiskRemountFn = unsafe extern "C" fn(
+        ctx_id: u32,
+        device: *const std::ffi::c_char,
+        fstype: *const std::ffi::c_char,
+        options: *const std::ffi::c_char,
+    ) -> i32;
+
+    /// Looks up `name` with `dlsym(RTLD_DEFAULT, ..)`.
+    ///
+    /// Panics with `"<name> not found"` when the lookup returns NULL.
+    fn lookup_symbol(name: &CStr) -> *mut libc::c_void {
+        // SAFETY: `name` is a valid NUL-terminated C string for the duration
+        // of the call.
+        let ptr = unsafe { libc::dlsym(libc::RTLD_DEFAULT, name.as_ptr()) };
+        assert!(!ptr.is_null(), "{} not found", name.to_string_lossy());
+        ptr
+    }
+
+    /// Resolves `krun_add_disk` at runtime; panics if it is not exported.
+    fn get_krun_add_disk() -> KrunAddDiskFn {
+        let ptr = lookup_symbol(c"krun_add_disk");
+        // SAFETY: `ptr` is non-null (checked by `lookup_symbol`). Soundness
+        // relies on the exported symbol having the `KrunAddDiskFn` signature.
+        unsafe { std::mem::transmute::<*mut libc::c_void, KrunAddDiskFn>(ptr) }
+    }
+
+    /// Resolves `krun_set_root_disk_remount` at runtime; panics if it is not
+    /// exported.
+    fn get_krun_set_root_disk_remount() -> KrunSetRootDiskRemountFn {
+        let ptr = lookup_symbol(c"krun_set_root_disk_remount");
+        // SAFETY: `ptr` is non-null (checked by `lookup_symbol`). Soundness
+        // relies on the exported symbol having the
+        // `KrunSetRootDiskRemountFn` signature.
+        unsafe { std::mem::transmute::<*mut libc::c_void, KrunSetRootDiskRemountFn>(ptr) }
+    }
+
+    /// Creates a 32M ext4 image at `output_path` containing the guest-agent
+    /// binary and a `block-marker` file.
+    ///
+    /// The content is staged in `<output_path>.staging`, which is removed
+    /// again after `mke2fs` has succeeded. Panics on any I/O failure or when
+    /// `mke2fs` exits unsuccessfully.
+    fn create_disk_image(guest_agent_path: &str, output_path: &str) {
+        // Populate from a staging directory using mke2fs -d (no root needed).
+        let staging = format!("{output_path}.staging");
+        std::fs::create_dir_all(&staging).expect("mkdir staging");
+
+        std::fs::copy(guest_agent_path, format!("{staging}/guest-agent"))
+            .expect("copy guest-agent");
+
+        // Marker file to verify the guest booted from the block device.
+        std::fs::write(
+            format!("{staging}/block-marker"),
+            "booted-from-block-device",
+        )
+        .expect("write marker");
+
+        let status = Command::new("mke2fs")
+            .args(["-q", "-t", "ext4", "-d", &staging, output_path, "32M"])
+            .status()
+            .expect("mke2fs failed");
+        assert!(status.success(), "mke2fs failed");
+
+        std::fs::remove_dir_all(&staging).expect("cleanup staging");
+    }
+
+    impl Test for TestRootDiskRemount {
+        /// Skips the test unless `krun_has_feature(KRUN_FEATURE_BLK)`
+        /// reports 1.
+        fn should_run(&self) -> ShouldRun {
+            if unsafe { krun_call_u32!(krun_has_feature(KRUN_FEATURE_BLK.into())) }.ok() != Some(1)
+            {
+                return ShouldRun::No("libkrun compiled without BLK");
+            }
+            ShouldRun::Yes
+        }
+
+        /// Builds the disk image, attaches it as `vda`, configures it as the
+        /// root via `krun_set_root_disk_remount`, and enters the VM with
+        /// `/guest-agent` as the executable.
+        fn start_vm(self: Box<Self>, test_setup: TestSetup) -> anyhow::Result<()> {
+            let krun_add_disk = get_krun_add_disk();
+            let krun_set_root_disk_remount = get_krun_set_root_disk_remount();
+
+            let guest_agent_path = std::env::var("KRUN_TEST_GUEST_AGENT_PATH")
+                .expect("KRUN_TEST_GUEST_AGENT_PATH not set");
+
+            let disk_path = format!("{}/rootfs.ext4", test_setup.tmp_dir.display());
+            create_disk_image(&guest_agent_path, &disk_path);
+
+            let c_disk_path = CString::new(disk_path)?;
+            let test_case = CString::new(test_setup.test_case)?;
+
+            unsafe {
+                krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?;
+                let ctx = krun_call_u32!(krun_create_ctx())?;
+                krun_call!(krun_set_vm_config(ctx, 1, 512))?;
+
+                // NULL-terminated argument and environment vectors.
+                let argv = [test_case.as_ptr(), null()];
+                let envp = [null()];
+                krun_call!(krun_set_exec(
+                    ctx,
+                    c"/guest-agent".as_ptr(),
+                    argv.as_ptr(),
+                    envp.as_ptr(),
+                ))?;
+
+                krun_call!(krun_set_workdir(ctx, c"/".as_ptr()))?;
+
+                // Add a block device with the ext4 image.
+                krun_call!(krun_add_disk(
+                    ctx,
+                    c"vda".as_ptr(),
+                    c_disk_path.as_ptr(),
+                    false,
+                ))?;
+
+                // Configure block device as root, pivot from NullFs.
+                krun_call!(krun_set_root_disk_remount(
+                    ctx,
+                    c"/dev/vda".as_ptr(),
+                    c"ext4".as_ptr(),
+                    null(), // no options
+                ))?;
+
+                krun_call!(krun_start_enter(ctx))?;
+            }
+            Ok(())
+        }
+    }
+}
+
+#[guest]
+mod guest {
+    use super::*;
+    use crate::Test;
+    use std::fs;
+    use std::path::Path;
+
+    impl Test for TestRootDiskRemount {
+        /// Guest-side checks: the marker file from the disk image is
+        /// readable, `/init.krun` is not visible, and `/proc` and `/dev` are
+        /// populated.
+        ///
+        /// Prints `OK` once every assertion has passed.
+        fn in_guest(self: Box<Self>) {
+            // Verify we're running from the block device root.
+            let marker = fs::read_to_string("/block-marker")
+                .expect("Failed to read /block-marker — not on block device root?");
+            assert_eq!(marker, "booted-from-block-device");
+
+            // The init.krun virtual file should be gone (one-shot, and we
+            // pivoted away from the NullFs root anyway).
+            assert!(!Path::new("/init.krun").exists());
+
+            // /proc and /dev should be mounted (init re-mounts after pivot).
+            assert!(Path::new("/proc/self").exists(), "/proc/self missing");
+            assert!(Path::new("/dev/null").exists(), "/dev/null missing");
+
+            println!("OK");
+        }
+    }
+}