From 00000efe400156fa41d0acc3ae4c18f6ad5e0a36 Mon Sep 17 00:00:00 2001
From: brian049
Date: Fri, 8 May 2026 21:24:14 +0800
Subject: [PATCH] Shadow-fd: cache sealed memfds across repeated opens

kbox_shadow_create() rebuilt a fresh memfd on every read-only open by
copying the entire file contents through LKL via a pread64 loop. For
hot files that are re-opened by every fork+exec (the dynamic linker,
libc, and other shared objects), this redundant copy dominated the open
path.

This commit adds an inode-keyed cache that reuses sealed memfds across
opens. The cache holds up to 64 entries in an LRU list, and each lookup
is a linear scan of a small array. The key is
(dev, ino, mtime_sec, mtime_nsec, size).

Cached fds are always sealed (F_SEAL_WRITE | GROW | SHRINK | SEAL)
before insertion, and only the read-only path enters the cache.

The new entry point kbox_shadow_create_cached() replaces every
read-only and seal-bound call site. The two main-binary exec_memfd
paths in dispatch-exec.c and image.c keep the original
kbox_shadow_create() because they patch PT_INTERP via pwrite() and must
own a private, writable memfd.

Callers never receive the cached fd itself, nor a dup() of it: a dup()
would share the cached fd's open file description, so one tracee's
read() or lseek() would move the file offset seen by every other
holder. Instead each caller gets a descriptor re-opened through
/proc/self/fd, which has a private offset and can be closed without
affecting the cache. If the re-open fails, the freshly built memfd is
returned uncached.

Change-Id: I31c148d2513983562628feaccdf0abfba6e70ed5
---
 src/dispatch-exec.c    |   9 +-
 src/image.c            |   3 +-
 src/seccomp-dispatch.c |   5 +-
 src/shadow-fd.c        | 318 +++++++++++++++++++++++++++++++++++++++++
 src/shadow-fd.h        |  14 +++
 5 files changed, 342 insertions(+), 7 deletions(-)

diff --git a/src/dispatch-exec.c b/src/dispatch-exec.c
index fda6da6..d3a2907 100644
--- a/src/dispatch-exec.c
+++ b/src/dispatch-exec.c
@@ -118,10 +118,10 @@ struct kbox_dispatch forward_mmap(const struct kbox_syscall_request *req,
     if ((mmap_flags & MAP_SHARED) && (mmap_prot & PROT_WRITE))
         return kbox_dispatch_errno(ENODEV);
 
-    int memfd = kbox_shadow_create(ctx->sysnrs, lkl_fd);
+    int memfd = kbox_shadow_create_cached(ctx->sysnrs, lkl_fd);
     if (memfd < 0)
         return kbox_dispatch_errno(ENODEV);
-    kbox_shadow_seal(memfd);
+    /* kbox_shadow_create_cached() returns a sealed fd. */
     int injected = request_addfd_at(ctx, req, memfd, (int) fd, 0);
     if (injected < 0) {
         close(memfd);
@@ -542,7 +542,7 @@ static struct kbox_dispatch trap_userspace_exec(
             goto fail_early;
         }
 
-        interp_memfd = kbox_shadow_create(ctx->sysnrs, interp_lkl);
+        interp_memfd = kbox_shadow_create_cached(ctx->sysnrs, interp_lkl);
         lkl_close_and_invalidate(ctx, interp_lkl);
 
         if (interp_memfd < 0) {
@@ -872,7 +872,8 @@ struct kbox_dispatch forward_execve(const struct kbox_syscall_request *req,
                 return kbox_dispatch_errno((int) (-interp_lkl));
             }
 
-            int interp_memfd = kbox_shadow_create(ctx->sysnrs, interp_lkl);
+            int interp_memfd =
+                kbox_shadow_create_cached(ctx->sysnrs, interp_lkl);
             lkl_close_and_invalidate(ctx, interp_lkl);
 
             if (interp_memfd < 0) {
diff --git a/src/image.c b/src/image.c
index 94160d9..3aee7d2 100644
--- a/src/image.c
+++ b/src/image.c
@@ -1147,7 +1147,8 @@ int kbox_run_image(const struct kbox_image_args *args)
                         goto err_net;
                     }
 
-                    interp_memfd = kbox_shadow_create(sysnrs, interp_lkl_fd);
+                    interp_memfd =
+                        kbox_shadow_create_cached(sysnrs, interp_lkl_fd);
                     kbox_lkl_close(sysnrs, interp_lkl_fd);
 
                     if (interp_memfd < 0) {
diff --git a/src/seccomp-dispatch.c b/src/seccomp-dispatch.c
index 3eb2b35..00af17b 100644
--- a/src/seccomp-dispatch.c
+++ b/src/seccomp-dispatch.c
@@ -1217,10 +1217,10 @@ int ensure_same_fd_shadow(struct kbox_supervisor_ctx *ctx,
     if (flags < 0 || (flags & O_ACCMODE) != O_RDONLY)
         return 0;
 
-    memfd = kbox_shadow_create(ctx->sysnrs, lkl_fd);
+    memfd = kbox_shadow_create_cached(ctx->sysnrs, lkl_fd);
     if (memfd < 0)
         return -1;
-    kbox_shadow_seal(memfd);
+    /* kbox_shadow_create_cached() returns sealed fds. */
 
     cur_off = (off_t) kbox_lkl_lseek(ctx->sysnrs, lkl_fd, 0, SEEK_CUR);
     if (cur_off >= 0 && lseek(memfd, cur_off, SEEK_SET) < 0) {
@@ -1451,6 +1451,7 @@ void invalidate_path_shadow_cache(struct kbox_supervisor_ctx *ctx)
         ctx->path_shadow_cache[i].memfd = -1;
     }
     invalidate_translated_path_cache(ctx);
+    kbox_shadow_cache_reset();
 }
 
 static struct kbox_path_shadow_cache_entry *find_path_shadow_cache(
diff --git a/src/shadow-fd.c b/src/shadow-fd.c
index 2ccf374..ecd87b2 100644
--- a/src/shadow-fd.c
+++ b/src/shadow-fd.c
@@ -5,6 +5,11 @@
  * containing the file's contents and inject it into the tracee.
  * This lets the host kernel handle mmap natively; critical for
  * dynamic linkers that mmap .so files with MAP_PRIVATE.
+ *
+ * For read-only shadows, kbox_shadow_create_cached() reuses sealed
+ * memfds across repeated opens of the same (inode, mtime, size)
+ * tuple. Hot files (ld-musl, libc.so ...) become a single re-open
+ * instead of a full pread64 copy loop.
  */
 
 #include
@@ -40,6 +45,204 @@
 /* Read chunk size: 128 KB, matches KBOX_IO_CHUNK_LEN. */
 #define SHADOW_CHUNK_LEN (128 * 1024)
 
+/* Array of KBOX_SHADOW_CACHE_MAX entries with an embedded LRU
+ * doubly-linked list.
+ */
+struct shadow_cache_entry {
+    int memfd; /* sealed read-only memfd, or -1 if free */
+    uint64_t dev;
+    uint64_t ino;
+    int64_t mtime_sec;
+    int64_t mtime_nsec;
+    int64_t size;
+    /* LRU list: lru_prev/lru_next are indices into cache.entries,
+     * or -1 for list ends. Free slots are not on the list.
+     */
+    int lru_prev;
+    int lru_next;
+};
+
+static struct {
+    struct shadow_cache_entry entries[KBOX_SHADOW_CACHE_MAX];
+    int lru_head;  /* most recently used, -1 if empty */
+    int lru_tail;  /* least recently used, -1 if empty */
+    unsigned size; /* number of occupied slots */
+    unsigned long hits;
+    unsigned long misses;
+    int initialised;
+} cache;
+
+static void cache_lazy_init(void)
+{
+    if (cache.initialised)
+        return;
+    for (int i = 0; i < KBOX_SHADOW_CACHE_MAX; i++) {
+        cache.entries[i].memfd = -1;
+        cache.entries[i].lru_prev = -1;
+        cache.entries[i].lru_next = -1;
+    }
+    cache.lru_head = -1;
+    cache.lru_tail = -1;
+    cache.size = 0;
+    cache.hits = 0;
+    cache.misses = 0;
+    cache.initialised = 1;
+}
+
+static void lru_unlink(int idx)
+{
+    struct shadow_cache_entry *e = &cache.entries[idx];
+    if (e->lru_prev >= 0)
+        cache.entries[e->lru_prev].lru_next = e->lru_next;
+    else if (cache.lru_head == idx)
+        cache.lru_head = e->lru_next;
+    if (e->lru_next >= 0)
+        cache.entries[e->lru_next].lru_prev = e->lru_prev;
+    else if (cache.lru_tail == idx)
+        cache.lru_tail = e->lru_prev;
+    e->lru_prev = -1;
+    e->lru_next = -1;
+}
+
+static void lru_push_front(int idx)
+{
+    struct shadow_cache_entry *e = &cache.entries[idx];
+    e->lru_prev = -1;
+    e->lru_next = cache.lru_head;
+    if (cache.lru_head >= 0)
+        cache.entries[cache.lru_head].lru_prev = idx;
+    cache.lru_head = idx;
+    if (cache.lru_tail < 0)
+        cache.lru_tail = idx;
+}
+
+static void cache_evict_slot(int idx)
+{
+    struct shadow_cache_entry *e = &cache.entries[idx];
+    if (e->memfd < 0)
+        return;
+    close(e->memfd);
+    e->memfd = -1;
+    lru_unlink(idx);
+    cache.size--;
+}
+
+static int cache_lookup(uint64_t dev,
+                        uint64_t ino,
+                        int64_t mtime_sec,
+                        int64_t mtime_nsec,
+                        int64_t size)
+{
+    for (int i = 0; i < KBOX_SHADOW_CACHE_MAX; i++) {
+        const struct shadow_cache_entry *e = &cache.entries[i];
+        if (e->memfd < 0)
+            continue;
+        if (e->dev == dev && e->ino == ino && e->mtime_sec == mtime_sec &&
+            e->mtime_nsec == mtime_nsec && e->size == size) {
+            return i;
+        }
+    }
+    return -1;
+}
+
+static int cache_find_free_slot(void)
+{
+    for (int i = 0; i < KBOX_SHADOW_CACHE_MAX; i++) {
+        if (cache.entries[i].memfd < 0)
+            return i;
+    }
+    return -1;
+}
+
+/* Insert (memfd, key) into the cache; on success the cache owns memfd. */
+static int cache_insert(int memfd,
+                        uint64_t dev,
+                        uint64_t ino,
+                        int64_t mtime_sec,
+                        int64_t mtime_nsec,
+                        int64_t size)
+{
+    int slot;
+
+    /* If a stale entry for the same key still lingers (e.g. mtime
+     * matched but content was replaced atomically with identical
+     * stat), evict it first.
+     */
+    slot = cache_lookup(dev, ino, mtime_sec, mtime_nsec, size);
+    if (slot >= 0)
+        cache_evict_slot(slot);
+
+    if (cache.size >= KBOX_SHADOW_CACHE_MAX) {
+        if (cache.lru_tail < 0)
+            return -1;
+        cache_evict_slot(cache.lru_tail);
+    }
+
+    slot = cache_find_free_slot();
+    if (slot < 0)
+        return -1;
+
+    struct shadow_cache_entry *e = &cache.entries[slot];
+    e->memfd = memfd;
+    e->dev = dev;
+    e->ino = ino;
+    e->mtime_sec = mtime_sec;
+    e->mtime_nsec = mtime_nsec;
+    e->size = size;
+    lru_push_front(slot);
+    cache.size++;
+    return 0;
+}
+
+/* Evict any cached entry whose (dev, ino) matches but whose
+ * (mtime, size) does not. Called on a cache miss for that inode.
+ */
+static void cache_evict_stale(uint64_t dev,
+                              uint64_t ino,
+                              int64_t mtime_sec,
+                              int64_t mtime_nsec,
+                              int64_t size)
+{
+    for (int i = 0; i < KBOX_SHADOW_CACHE_MAX; i++) {
+        struct shadow_cache_entry *e = &cache.entries[i];
+        if (e->memfd < 0)
+            continue;
+        if (e->dev == dev && e->ino == ino &&
+            (e->mtime_sec != mtime_sec || e->mtime_nsec != mtime_nsec ||
+             e->size != size)) {
+            cache_evict_slot(i);
+        }
+    }
+}
+
+void kbox_shadow_cache_reset(void)
+{
+    if (!cache.initialised)
+        return;
+    for (int i = 0; i < KBOX_SHADOW_CACHE_MAX; i++)
+        cache_evict_slot(i);
+    cache.lru_head = -1;
+    cache.lru_tail = -1;
+    cache.size = 0;
+    cache.hits = 0;
+    cache.misses = 0;
+}
+
+
+unsigned kbox_shadow_cache_size(void)
+{
+    return cache.size;
+}
+unsigned long kbox_shadow_cache_hits(void)
+{
+    return cache.hits;
+}
+unsigned long kbox_shadow_cache_misses(void)
+{
+    return cache.misses;
+}
+
+
 int kbox_shadow_create(const struct kbox_sysnrs *s, long lkl_fd)
 {
     /* Use kbox_lkl_stat (generic-arch layout) instead of struct stat
@@ -127,3 +330,118 @@ int kbox_shadow_seal(int memfd)
     return fcntl(memfd, F_ADD_SEALS,
                  F_SEAL_WRITE | F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL);
 }
+
+/* Open a new read-only description of MEMFD via /proc/self/fd.
+ *
+ * dup()/F_DUPFD would share the cached fd's open file description, and
+ * with it the file offset: a read() or lseek() through one handed-out
+ * fd would move the offset seen by every other holder. Re-opening the
+ * magic link gives each caller a private offset over the same sealed
+ * memfd. Returns the new fd, or -1 if the re-open fails.
+ */
+static int shadow_reopen_private(int memfd)
+{
+    static const char prefix[] = "/proc/self/fd/";
+    char digits[3 * sizeof(int)];
+    char path[sizeof(prefix) + sizeof(digits)];
+    size_t ndigits = 0;
+    size_t len = sizeof(prefix) - 1;
+    unsigned v;
+
+    if (memfd < 0)
+        return -1;
+
+    /* Format the fd number by hand; no stdio dependency is needed. */
+    v = (unsigned) memfd;
+    do {
+        digits[ndigits++] = (char) ('0' + v % 10);
+        v /= 10;
+    } while (v != 0);
+
+    memcpy(path, prefix, len);
+    while (ndigits > 0)
+        path[len++] = digits[--ndigits];
+    path[len] = '\0';
+
+    return open(path, O_RDONLY | O_CLOEXEC);
+}
+
+/* Return a read-only shadow memfd for LKL_FD, reusing a cached sealed
+ * memfd when the file's (dev, ino, mtime, size) key is unchanged.
+ *
+ * The returned fd never shares an open file description with the
+ * cache, so the caller owns its file offset and may close it freely.
+ * If sealing or the private re-open fails, the freshly built memfd is
+ * returned uncached instead (unsealed in the former case).
+ *
+ * Returns the fd, or a negative value on failure: -ENODEV for
+ * non-regular files, -EFBIG above KBOX_SHADOW_MAX_SIZE, otherwise the
+ * error from kbox_lkl_fstat() or kbox_shadow_create().
+ */
+int kbox_shadow_create_cached(const struct kbox_sysnrs *s, long lkl_fd)
+{
+    struct kbox_lkl_stat kst;
+    long ret;
+
+    cache_lazy_init();
+
+    /* One fstat serves both the eligibility check and the cache key. */
+    memset(&kst, 0, sizeof(kst));
+    ret = kbox_lkl_fstat(s, lkl_fd, &kst);
+    if (ret < 0)
+        return (int) ret;
+
+    if (!S_ISREG(kst.st_mode))
+        return -ENODEV;
+    if (kst.st_size > KBOX_SHADOW_MAX_SIZE)
+        return -EFBIG;
+
+    uint64_t dev = (uint64_t) kst.st_dev;
+    uint64_t ino = (uint64_t) kst.st_ino;
+    int64_t msec = (int64_t) kst.st_mtime_sec;
+    int64_t mns = (int64_t) kst.st_mtime_nsec;
+    int64_t sz = (int64_t) kst.st_size;
+
+    /* Cache hit fast path. */
+    int slot = cache_lookup(dev, ino, msec, mns, sz);
+    if (slot >= 0) {
+        int fd = shadow_reopen_private(cache.entries[slot].memfd);
+        if (fd >= 0) {
+            /* Promote to MRU. */
+            lru_unlink(slot);
+            lru_push_front(slot);
+            cache.hits++;
+            return fd;
+        }
+        /* Re-open failed: keep the entry intact and fall through to
+         * the miss path. */
+    }
+
+    /* Cache miss: evict any stale entry for this inode before
+     * creating a new shadow. */
+    cache_evict_stale(dev, ino, msec, mns, sz);
+    cache.misses++;
+
+    int memfd = kbox_shadow_create(s, lkl_fd);
+    if (memfd < 0)
+        return memfd;
+    if (kbox_shadow_seal(memfd) < 0) {
+        /* An unsealed memfd must not be shared; hand it to the caller
+         * as a private shadow and skip caching. */
+        return memfd;
+    }
+
+    /* Give the caller its own description so neither its offset nor
+     * its close() can affect the cached fd. */
+    int fd = shadow_reopen_private(memfd);
+    if (fd < 0) {
+        /* No private handle available: return the original uncached. */
+        return memfd;
+    }
+
+    if (cache_insert(memfd, dev, ino, msec, mns, sz) < 0) {
+        /* Insertion failed: caller owns fd, drop the original. */
+        close(memfd);
+    }
+    return fd;
+}
diff --git a/src/shadow-fd.h b/src/shadow-fd.h
index 51342bd..7873aa3 100644
--- a/src/shadow-fd.h
+++ b/src/shadow-fd.h
@@ -6,7 +6,21 @@ struct kbox_sysnrs;
 
 #define KBOX_SHADOW_MAX_SIZE (256L * 1024 * 1024)
 
+/* Maximum number of memfds the shadow cache will retain. */
+#define KBOX_SHADOW_CACHE_MAX 64
+
 int kbox_shadow_create(const struct kbox_sysnrs *s, long lkl_fd);
 int kbox_shadow_seal(int memfd);
 
+/* Cached variant for read-only shadow promotion. */
+int kbox_shadow_create_cached(const struct kbox_sysnrs *s, long lkl_fd);
+
+/* Drop all cached entries and close their memfds. */
+void kbox_shadow_cache_reset(void);
+
+/* Cache introspection, used by make check-unit. */
+unsigned kbox_shadow_cache_size(void);
+unsigned long kbox_shadow_cache_hits(void);
+unsigned long kbox_shadow_cache_misses(void);
+
 #endif /* KBOX_SHADOW_FD_H */