From 6080aadab029e2e58f5073a4bc6a4bc05d88276b Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Thu, 26 Mar 2026 10:51:40 +0800 Subject: [PATCH] Add syscall binary rewriting infrastructure This implements syscall rewriting fast-path, adding a complete binary rewriting layer alongside the existing seccomp-unotify (Tier 3) and SIGSYS trap (Tier 1) interception paths. Core additions: - x86-64 instruction length decoder (x86-decode.c) that walks true instruction boundaries, eliminating false-positive 0F 05/0F 34 matches inside immediates and displacements - Site-aware rewrite classification (WRAPPER vs COMPLEX) to gate the inline fast path: only simple syscall+ret wrapper sites return virtualized PID values (getpid=1, gettid=1, getppid=0) directly; complex sites (raise->gettid->tgkill) use full dispatch - aarch64 binary rewriting: SVC #0 patched to B-branch trampolines with veneer pages (LDR+BR indirect stubs) for sites beyond 128MB - Userspace ELF loader for in-process trap/rewrite launch - Unified syscall request abstraction across all three tiers - Per-architecture AUTO mode: seccomp on x86_64, rewrite/trap on aarch64 for non-shell commands without fork wrappers - W^X enforcement (PROT_WRITE|PROT_EXEC blocked in trap/rewrite mode) - CLONE_THREAD blocked in trap/rewrite (EPERM); multi-threaded guests must use --syscall-mode=seccomp - Shadow FD cache with dup() per open for independent file offsets - Safe argv/envp validation in trap-mode execve via process_vm_readv - Execve re-translation: rewrite runtime re-installed on new binary - Fork-site scanner for AUTO mode safety (detects fork/clone wrappers) Benchmark (bench-test, 200 iterations, release build, us/call): x86_64 (AMD Threadripper 2990WX, Linux 6.8.0): syscall native seccomp trap rewrite auto stat 2.6 13.5 22.0 20.2 13.5 open+close 5.4 25.2 89.6 58.9 25.3 lseek+read 1.9 2.3 2.8 2.1 2.4 write 2.7 2.2 2.5 2.0 2.4 getpid 0.3 0.0 0.1 0.1 0.0 aarch64 (Marvell ThunderX2, Linux 6.14.0): syscall native seccomp trap 
rewrite auto stat 1.5 21.7 25.1 1.0 21.4 open+close 3.4 40.3 138.4 117.0 39.2 lseek+read 1.0 1.3 1.6 1.6 1.4 write 1.1 1.4 1.8 1.9 1.4 getpid 0.3 0.0 0.1 0.1 0.0 * AUTO selects seccomp on x86_64 (lower USER_NOTIF overhead), rewrite on aarch64 (stat 1.0us via in-process LKL inode cache vs 21.7us * seccomp round-trip). aarch64 rewrite stat at 0.7x native is real: LKL serves from memory without hitting the block layer. Change-Id: I9052fc27388027b9147348c0998bc8fa4e1e123e --- README.md | 93 +- docs/gdb-workflow.md | 6 +- docs/syscall-parity-spec.md | 12 +- include/kbox/cli.h | 8 + include/kbox/elf.h | 83 + include/kbox/probe.h | 20 +- include/kbox/x86-decode.h | 16 + mk/features.mk | 20 + mk/tests.mk | 39 +- scripts/pre-commit.hook | 6 + src/cli.c | 15 + src/elf.c | 523 +++- src/fd-table.c | 74 +- src/fd-table.h | 26 +- src/image.c | 1116 +++++++- src/io-util.h | 32 + src/lkl-wrap.h | 6 +- src/loader-entry.c | 41 + src/loader-entry.h | 24 + src/loader-handoff.c | 109 + src/loader-handoff.h | 25 + src/loader-image.c | 187 ++ src/loader-image.h | 32 + src/loader-launch.c | 170 ++ src/loader-launch.h | 60 + src/loader-layout.c | 274 ++ src/loader-layout.h | 76 + src/loader-stack.c | 322 +++ src/loader-stack.h | 53 + src/loader-transfer.c | 89 + src/loader-transfer.h | 25 + src/net-slirp.c | 5 +- src/probe.c | 108 +- src/procmem.c | 388 ++- src/procmem.h | 60 + src/rewrite.c | 4060 +++++++++++++++++++++++++++++ src/rewrite.h | 308 +++ src/seccomp-bpf.c | 401 ++- src/seccomp-defs.h | 6 + src/seccomp-dispatch.c | 4033 +++++++++++++++++++++------- src/seccomp-supervisor.c | 71 +- src/seccomp.h | 111 + src/shadow-fd.c | 6 +- src/syscall-nr.c | 2 +- src/syscall-request.c | 50 + src/syscall-trap-signal.h | 57 + src/syscall-trap.c | 1549 +++++++++++ src/syscall-trap.h | 104 + src/web-events.c | 31 - src/web-server.c | 2 +- src/x86-decode.c | 968 +++++++ src/x86-decode.h | 30 + tests/guest/bench-test.c | 156 ++ tests/guest/trap-bench.S | 79 + tests/unit/test-elf.c | 226 ++ 
tests/unit/test-loader-entry.c | 148 ++ tests/unit/test-loader-handoff.c | 230 ++ tests/unit/test-loader-image.c | 180 ++ tests/unit/test-loader-launch.c | 293 +++ tests/unit/test-loader-layout.c | 349 +++ tests/unit/test-loader-stack.c | 199 ++ tests/unit/test-loader-transfer.c | 74 + tests/unit/test-procmem.c | 100 + tests/unit/test-rewrite.c | 1425 ++++++++++ tests/unit/test-runner.c | 51 +- tests/unit/test-runner.h | 1 + tests/unit/test-seccomp-stubs.c | 38 + tests/unit/test-syscall-nr.c | 6 + tests/unit/test-syscall-request.c | 94 + tests/unit/test-syscall-trap.c | 521 ++++ tests/unit/test-x86-decode.c | 405 +++ 71 files changed, 19350 insertions(+), 1157 deletions(-) create mode 100644 include/kbox/x86-decode.h create mode 100644 src/io-util.h create mode 100644 src/loader-entry.c create mode 100644 src/loader-entry.h create mode 100644 src/loader-handoff.c create mode 100644 src/loader-handoff.h create mode 100644 src/loader-image.c create mode 100644 src/loader-image.h create mode 100644 src/loader-launch.c create mode 100644 src/loader-launch.h create mode 100644 src/loader-layout.c create mode 100644 src/loader-layout.h create mode 100644 src/loader-stack.c create mode 100644 src/loader-stack.h create mode 100644 src/loader-transfer.c create mode 100644 src/loader-transfer.h create mode 100644 src/rewrite.c create mode 100644 src/rewrite.h create mode 100644 src/syscall-request.c create mode 100644 src/syscall-trap-signal.h create mode 100644 src/syscall-trap.c create mode 100644 src/syscall-trap.h create mode 100644 src/x86-decode.c create mode 100644 src/x86-decode.h create mode 100644 tests/guest/bench-test.c create mode 100644 tests/guest/trap-bench.S create mode 100644 tests/unit/test-loader-entry.c create mode 100644 tests/unit/test-loader-handoff.c create mode 100644 tests/unit/test-loader-image.c create mode 100644 tests/unit/test-loader-launch.c create mode 100644 tests/unit/test-loader-layout.c create mode 100644 tests/unit/test-loader-stack.c 
create mode 100644 tests/unit/test-loader-transfer.c create mode 100644 tests/unit/test-procmem.c create mode 100644 tests/unit/test-rewrite.c create mode 100644 tests/unit/test-seccomp-stubs.c create mode 100644 tests/unit/test-syscall-request.c create mode 100644 tests/unit/test-syscall-trap.c create mode 100644 tests/unit/test-x86-decode.c diff --git a/README.md b/README.md index 2aa002c..b13806c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # kbox -kbox boots a real Linux kernel as an in-process library ([LKL](https://github.com/lkl/linux)) and routes intercepted syscalls to it via [`seccomp_unotify`](https://man7.org/linux/man-pages/man2/seccomp_unotify.2.html). It provides a rootless chroot/proot alternative with kernel-level syscall accuracy. +kbox boots a real Linux kernel as an in-process library ([LKL](https://github.com/lkl/linux)) and routes intercepted syscalls to it. Three interception tiers are available: seccomp-unotify (most compatible), SIGSYS trap (lower latency), and binary rewriting (near-native for process-info syscalls). The default `auto` mode selects the fastest tier that works for a given workload. kbox provides a rootless chroot/proot alternative with kernel-level syscall accuracy. ## Why kbox @@ -13,44 +13,70 @@ Running Linux userspace programs in a rootless, unprivileged environment require kbox takes a fundamentally different approach: boot the actual Linux kernel as an in-process library and route intercepted syscalls to it. The kernel that handles your `open()` is the same kernel that runs on servers in production. No reimplementation, no approximation. -The interception mechanism matters too. seccomp-unotify delivers syscall notifications to a supervisor without requiring ptrace attachment or parent-child tracing relationships. The supervisor is just another process with a file descriptor. The tracee's syscall blocks in the kernel until the supervisor responds -- no TOCTOU window, no signal races, no thread-group confusion. 
+The interception mechanism matters too. kbox offers three tiers, each trading isolation for speed: -The result: programs get real VFS, real ext4, real procfs -- without root privileges, containers, VMs, or ptrace. +- **Seccomp-unotify** (Tier 3): syscall notifications delivered to a separate supervisor process via `SECCOMP_RET_USER_NOTIF`. Strongest isolation, lowest overhead for file I/O. The supervisor dispatches to LKL and injects results back via two ioctl round-trips per syscall. +- **SIGSYS trap** (Tier 1): in-process signal handler intercepts syscalls via `SECCOMP_RET_TRAP`. No cross-process round-trip, but the signal frame build/restore and a service-thread hand-off (eventfd + futex) add overhead. Best for metadata operations on aarch64 where the USER_NOTIF round-trip cost is proportionally higher. +- **Binary rewriting** (Tier 2): syscall instructions patched to call a trampoline at load time. On aarch64, `SVC #0` is replaced with a `B` branch into a per-site trampoline that calls the dispatch function directly on the guest thread, with zero signal overhead, zero context switches, and zero FS base switching. Stat from the LKL inode cache completes in-process without any kernel round-trip. On x86_64, only 8-byte wrapper sites (`mov $NR; syscall; ret`) are patched; bare 2-byte `syscall` instructions cannot currently be rewritten in-place (the only same-width replacement, `call *%rax`, would jump to the syscall number in RAX), so unpatched sites fall through to the SIGSYS trap path. Process-info syscalls (getpid, gettid) at wrapper sites return virtualized values inline at native speed. + +The default `--syscall-mode=auto` selects the fastest tier per architecture. On x86_64, auto uses seccomp (lower per-syscall overhead than trap mode for file I/O). On aarch64, auto uses rewrite/trap for non-shell direct commands (faster metadata via in-process LKL) and seccomp for shell invocations. 
The selection is based on binary analysis: the main executable is scanned for fork/clone wrapper sites, and binaries that can fork fall back to seccomp to preserve cross-process filesystem coherence. + +The result: programs get real VFS, real ext4, real procfs, at near-native syscall speed, without root privileges, containers, VMs, or ptrace. ## How it works ``` + Seccomp mode (--syscall-mode=seccomp, shell commands in auto) + ┌────────────────┐ - │ guest child │ (seccomp BPF installed) + │ guest child │ (seccomp BPF: USER_NOTIF) └──────┬─────────┘ │ syscall notification ┌──────▼──────────┐ ┌──────────────────┐ │ supervisor │────────▶ │ web observatory │ │ (dispatch) │ counters │ (HTTP + SSE) │ └────┬───────┬────┘ events └────────┬─────────┘ - LKL path │ │ host path │ /api/snapshot - ┌───────────▼──┐ ┌──▼──────────┐ │ /api/events - │ LKL kernel │ │ host kernel │ ▼ - │ (in-proc) │ │ │ ┌──────────────┐ - └──────────────┘ └─────────────┘ │ web browser │ - └──────────────┘ + LKL path │ │ host path │ + ┌───────────▼──┐ ┌──▼──────────┐ ▼ + │ LKL kernel │ │ host kernel │ ┌──────────────┐ + │ (in-proc) │ │ │ │ web browser │ + └──────────────┘ └─────────────┘ └──────────────┘ + + Trap mode (--syscall-mode=trap, direct binaries in auto) + + ┌─────────────────────────────────────────┐ + │ single process │ + │ ┌─────────────┐ ┌──────────────────┐ │ + │ │ guest code │──▶│ SIGSYS handler │ │ + │ │ (loaded ELF)│ │ (dispatch thread)│ │ + │ └─────────────┘ └───┬────────┬─────┘ │ + │ LKL path │ │ host │ + │ ┌─────────────▼──┐ ┌───▼─────┐ │ + │ │ LKL kernel │ │ host │ │ + │ │ (in-proc) │ │ kernel │ │ + │ └────────────────┘ └─────────┘ │ + └─────────────────────────────────────────┘ ``` 1. The supervisor opens a rootfs disk image and registers it as an LKL block device. 2. LKL boots a real Linux kernel inside the process (no VM, no separate process tree). -3. The filesystem is mounted via LKL, and the supervisor sets the guest's virtual root via LKL's internal `chroot`. -4. 
A child process is forked with a seccomp BPF filter that delivers all syscalls (except a minimal allow-list: `sendmsg`, `exit`, `exit_group`) as user notifications. -5. The supervisor receives each notification via `SECCOMP_IOCTL_NOTIF_RECV`, translates paths and file descriptors, and forwards the syscall to either LKL or the host kernel. -6. Results are injected back via `SECCOMP_IOCTL_NOTIF_SEND`. For FD-returning syscalls (open, pipe, dup), `SECCOMP_IOCTL_NOTIF_ADDFD` injects file descriptors directly into the tracee's FD table. +3. The filesystem is mounted via LKL, and the supervisor sets the guest's virtual root via LKL's internal chroot. +4. The launch path depends on the syscall mode: + - **Seccomp**: a child process is forked with a BPF filter that delivers syscalls as user notifications. The supervisor receives each notification, dispatches to LKL or the host kernel, and injects results back. + - **Trap**: the guest binary is loaded into the current process via a userspace ELF loader. A BPF filter traps guest-range syscalls via `SECCOMP_RET_TRAP`, delivering SIGSYS. A service thread runs the dispatch; the signal handler captures the request and spins until the result is ready. No cross-process round-trip. + - **Rewrite**: same as trap, but additionally patches syscall instructions to branch directly into dispatch trampolines, eliminating the SIGSYS signal overhead entirely for patched sites. On **aarch64**, `SVC #0` (4 bytes, fixed-width) is replaced with a `B` branch to a per-site trampoline past the segment end; veneer pages with `LDR+BR` indirect stubs bridge sites beyond ±128MB. The trampoline saves registers, calls the C dispatch function on the guest thread, and returns. No signal frame, no service thread, no context switch. 
On **x86_64**, only 8-byte wrapper sites (`mov $NR, %eax; syscall; ret`) can be safely patched (to `jmp rel32` targeting a wrapper trampoline); bare 2-byte `syscall`/`sysenter` instructions cannot be rewritten in-place because the replacement `call *%rax` would jump to the syscall number, not a code address. Unpatched x86_64 sites fall through to the SIGSYS trap path. An instruction-boundary-aware length decoder (`x86-decode.c`) ensures the scanner never matches `0F 05` bytes that appear inside longer instructions (immediates, displacements). Site-aware classification labels each site as WRAPPER (eligible for inline virtualized getpid=1, gettid=1) or COMPLEX (must use full dispatch). W^X enforcement blocks simultaneous `PROT_WRITE|PROT_EXEC` in guest memory. + - **Auto** (default): selects the fastest tier per architecture. On aarch64, auto uses rewrite/trap for non-shell direct commands whose main executable has no fork/clone wrapper sites (21x faster stat via in-process LKL inode cache). On x86_64, auto uses seccomp (USER_NOTIF overhead is low enough that it beats trap mode's signal + service-thread chain for every measured syscall). Shell invocations always use seccomp. If the selected tier fails at install time, auto falls through to the next tier. ### Syscall routing Every intercepted syscall is dispatched to one of three dispositions: -- **LKL forward** (~74 handlers): filesystem operations (open, read, write, stat, getdents, mkdir, unlink, rename), metadata (chmod, chown, utimensat), identity (getuid, setuid, getgroups), and networking (socket, connect). The supervisor reads arguments from tracee memory via `process_vm_readv`, calls into LKL, and writes results back via `process_vm_writev`. 
-- **Host CONTINUE** (~34 entries): scheduling (sched_yield, sched_setscheduler), signals (rt_sigaction, kill, tgkill), memory management (mmap, mprotect, brk), I/O multiplexing (epoll, poll, select), threading (futex, clone, set_tid_address), and time (nanosleep, clock_gettime). These work correctly with the host kernel and incur no supervisor overhead. +- **LKL forward** (~74 handlers): filesystem operations (open, read, write, stat, getdents, mkdir, unlink, rename), metadata (chmod, chown, utimensat), identity (getuid, setuid, getgroups), and networking (socket, connect). In seccomp mode, the supervisor reads arguments from tracee memory via `process_vm_readv` and writes results via `process_vm_writev`. In trap mode, guest memory is accessed directly (same address space) via `process_vm_readv` on self. +- **Host CONTINUE** (~34 entries): scheduling (sched_yield, sched_setscheduler), signals (rt_sigaction, kill, tgkill), memory management (mmap, mprotect, brk), I/O multiplexing (epoll, poll, select), threading (futex, clone, set_tid_address), and time (nanosleep, clock_gettime). In seccomp mode, the kernel replays the syscall. In trap mode, the dispatch thread re-issues the syscall via an asm trampoline whose instruction pointer is in the BPF allow range. - **Emulated**: process identity (getpid returns 1, gettid returns 1), uname (synthetic LKL values), getrandom (LKL `/dev/urandom`), clock_gettime/gettimeofday (host clock, direct passthrough for latency). +All three tiers share the same dispatch engine (`kbox_dispatch_request`). The `kbox_syscall_request` abstraction decouples the dispatch logic from the notification transport: seccomp notifications, SIGSYS signal info, and rewrite trampoline calls all produce the same request struct. + Unknown syscalls receive `ENOSYS`. ~50 dangerous syscalls (mount, reboot, init_module, bpf, ptrace, etc.) are rejected with `EPERM` directly in the BPF filter before reaching the supervisor. 
### Key subsystems @@ -63,7 +89,17 @@ Unknown syscalls receive `ENOSYS`. ~50 dangerous syscalls (mount, reboot, init_m **ELF extraction** (`elf.c`, `image.c`): binaries are extracted from the LKL filesystem into memfds for `fexecve`. For dynamically-linked binaries, the PT_INTERP segment names an interpreter (e.g., `/lib/ld-musl-x86_64.so.1`) that does not exist on the host. The supervisor extracts the interpreter into a second memfd and patches PT_INTERP in the main binary to `/proc/self/fd/N`. The host kernel resolves this during `load_elf_binary`, before close-on-exec runs. -**Pipe architecture**: `pipe()`/`pipe2()` create real host pipes injected into the tracee via `SECCOMP_IOCTL_NOTIF_ADDFD`. No LKL involvement -- the host kernel manages fork inheritance and close semantics natively. This is why shell pipelines work: both parent and child share real pipe FDs that the host kernel handles. +**Pipe architecture**: `pipe()`/`pipe2()` create real host pipes injected into the tracee via `SECCOMP_IOCTL_NOTIF_ADDFD`. No LKL involvement; the host kernel manages fork inheritance and close semantics natively. This is why shell pipelines work: both parent and child share real pipe FDs that the host kernel handles. + +**Trap fast path** (`syscall-trap.c`, `loader-*.c`): for direct binary commands, kbox loads the guest ELF into the current process via a userspace loader (7 modules: entry, handoff, image, layout, launch, stack, transfer). A BPF filter traps guest-range instruction pointers via `SECCOMP_RET_TRAP`, delivering SIGSYS. The signal handler saves/restores the FS base (FSGSBASE instructions on kernel 5.9+, arch_prctl fallback) so kbox and guest each use their own TLS. A service thread runs the full dispatch; the handler captures the request and spins until the result is ready, keeping heap-allocating code out of signal context. `arch_prctl(SET_FS)` is intercepted to maintain dual TLS state. 
+ +**Rewrite engine** (`rewrite.c`, `x86-decode.c`): scans executable PT_LOAD segments for syscall instructions and patches them to branch directly into dispatch trampolines, eliminating the SIGSYS signal overhead for patched sites. + +On **aarch64**, `SVC #0` (4 bytes, fixed-width) is replaced with a `B` branch to a per-site trampoline allocated past the segment end. The trampoline saves registers, loads the origin address, and calls the C dispatch function directly on the guest thread. No signal frame, no service thread context switch. Veneer pages with `LDR x16, [PC+8]; BR x16` indirect stubs bridge sites beyond the ±128MB `B`-instruction range, with slot reuse to avoid wasting a full page per veneer. This is why aarch64 rewrite achieves 1us stat (vs 22us in seccomp): the dispatch runs in-process with LKL serving from the inode cache. + +On **x86_64**, an instruction-boundary-aware length decoder (`x86-decode.c`) walks true instruction boundaries, eliminating false matches of `0F 05`/`0F 34` bytes inside immediates, displacements, and SIB encodings. Only 8-byte wrapper sites (`mov $NR, %eax; syscall; ret`) are patched to `jmp rel32` targeting a wrapper trampoline that encodes the syscall number and origin address. Bare 2-byte `syscall` instructions are not rewritten because the only same-width replacement (`call *%rax`, `FF D0`) would jump to the syscall number in RAX rather than a code address. Unpatched sites fall through to the SIGSYS trap path. This is why x86_64 rewrite currently offers no advantage over seccomp: most syscalls still take the signal path. + +Each site is classified as WRAPPER (simple `syscall; ret` pattern, eligible for inline virtualized return: getpid=1, gettid=1, getppid=0) or COMPLEX (result consumed internally by helpers like `raise()` that feed gettid into tgkill; must use full dispatch). An origin map validates dispatch calls against known rewrite sites and carries the per-site classification. 
During re-exec (`trap_userspace_exec`), the rewrite runtime is re-installed on the new binary. Multi-threaded guests (`CLONE_THREAD`) are blocked in trap/rewrite mode; use `--syscall-mode=seccomp` for threaded workloads. ### ABI translation @@ -75,7 +111,7 @@ On aarch64, four `O_*` flags differ between the host and asm-generic: `O_DIRECTO ## Building -Linux only (host kernel 5.0+ for seccomp-unotify). Requires GCC, GNU Make, and a pre-built `liblkl.a`. No `libseccomp` dependency -- the BPF filter is compiled natively. +Linux only (host kernel 5.0+ for seccomp-unotify, 5.9+ for FSGSBASE trap optimization). Requires GCC, GNU Make, and a pre-built `liblkl.a`. No `libseccomp` dependency; the BPF filter is compiled natively. ```bash make # debug build (ASAN + UBSAN enabled) @@ -133,6 +169,25 @@ make ARCH=aarch64 CC=aarch64-linux-gnu-gcc rootfs Note: use `/bin/sh -i` for interactive sessions. The `-i` flag forces the shell into interactive mode regardless of terminal detection. +### Syscall mode selection + +The `--syscall-mode` option controls the interception mechanism: + +```bash +# Auto (default): seccomp on x86_64, rewrite/trap on aarch64 for direct commands +./kbox image -S alpine.ext4 -- /bin/ls / + +# Force seccomp for all workloads (most compatible, handles fork+exec) +./kbox image -S alpine.ext4 --syscall-mode=seccomp -- /bin/sh -i + +# Force trap for single-exec commands (SIGSYS dispatch, no binary patching) +./kbox image -r alpine.ext4 --syscall-mode=trap -- /bin/cat /etc/hostname + +# Force rewrite (aarch64: patches SVC to branch trampolines, fastest stat; +# x86_64: patches wrapper sites, bare syscalls fall back to trap) +./kbox image -r alpine.ext4 --syscall-mode=rewrite -- /opt/tests/bench-test 200 +``` + Run `./kbox image --help` for the full option list. 
## Web-based kernel observatory @@ -196,7 +251,7 @@ make check-integration # integration tests against a rootfs image make check-stress # stress test programs ``` -Unit tests (82 tests) have no LKL dependency and run on any Linux host. Integration tests (43 tests) run guest binaries inside kbox against an Alpine ext4 image. Stress tests exercise fork storms, FD exhaustion, concurrent I/O, signal races, and long-running processes. +Unit tests (portable subset runs on macOS, full suite on Linux) have no LKL dependency. Linux-only tests cover the trap runtime, userspace loader, rewrite engine, x86-64 instruction decoder, site classification, procmem, and syscall request decoding. The x86 decoder tests verify instruction length correctness across all major encoding formats and validate that embedded `0F 05` bytes inside longer instructions are not misidentified as syscalls. Integration tests run guest binaries inside kbox against an Alpine ext4 image. Stress tests exercise fork storms, FD exhaustion, concurrent I/O, signal races, and long-running processes. All tests run clean under ASAN and UBSAN. Guest binaries are compiled without sanitizers (shadow memory interferes with `process_vm_readv`). diff --git a/docs/gdb-workflow.md b/docs/gdb-workflow.md index cd87f09..90f18ce 100644 --- a/docs/gdb-workflow.md +++ b/docs/gdb-workflow.md @@ -207,9 +207,9 @@ ASAN_OPTIONS=detect_leaks=0 gdb --args ./kbox image -S alpine.ext4 -c /bin/sh ## Coordinated Syscall Tracing The `kbox-syscall-trace` command sets breakpoints on three points: -1. `kbox_dispatch_syscall` -- seccomp notification entry -2. `lkl_syscall` -- LKL kernel entry -3. `lkl_syscall6` -- LKL wrapper +1. `kbox_dispatch_syscall`: seccomp dispatch entry +2. `lkl_syscall`: LKL kernel entry +3. 
`lkl_syscall6`: LKL wrapper On each hit, it prints the syscall number, decoded name, arguments, virtual FD translation (if applicable), and LKL parameters: diff --git a/docs/syscall-parity-spec.md b/docs/syscall-parity-spec.md index daf35fb..2019246 100644 --- a/docs/syscall-parity-spec.md +++ b/docs/syscall-parity-spec.md @@ -4,15 +4,17 @@ Acceptance test definition for the C rewrite. For each syscall in the MVP set, documents: arguments, return value, errno, side effects, and any deviation from the Rust implementation. -Status: all syscalls below are implemented in seccomp_dispatch.c. +Status: all syscalls below are implemented in seccomp-dispatch.c via +kbox_dispatch_request(). The same dispatch engine handles all three +interception tiers (seccomp-unotify, SIGSYS trap, binary rewriting). ## Notation -- `vfd`: virtual file descriptor (4096+), mapped to LKL-internal fd +- `vfd`: virtual file descriptor (32768+), mapped to LKL-internal fd - `LKL(...)`: forwarded to LKL via lkl_syscall6() -- `CONTINUE`: seccomp_notif_resp with FLAG_CONTINUE (host kernel handles) -- `RETURN(val)`: seccomp_notif_resp with injected return value -- `ERRNO(e)`: seccomp_notif_resp with error = e +- `CONTINUE`: host kernel handles (seccomp: FLAG_CONTINUE; trap: re-issue via asm trampoline) +- `RETURN(val)`: injected return value +- `ERRNO(e)`: return with error = e --- diff --git a/include/kbox/cli.h b/include/kbox/cli.h index fc56ca2..8a2da74 100644 --- a/include/kbox/cli.h +++ b/include/kbox/cli.h @@ -15,6 +15,13 @@ enum kbox_mode { KBOX_MODE_IMAGE, }; +enum kbox_syscall_mode { + KBOX_SYSCALL_MODE_SECCOMP, + KBOX_SYSCALL_MODE_TRAP, + KBOX_SYSCALL_MODE_REWRITE, + KBOX_SYSCALL_MODE_AUTO, +}; + struct kbox_image_args { const char *root_dir; /* -r: image file path */ bool recommended; /* -R: enable recommended mounts */ @@ -34,6 +41,7 @@ struct kbox_image_args { bool verbose; /* --forward-verbose */ bool net; /* --net: enable SLIRP networking */ enum kbox_mount_profile mount_profile; /* 
--mount-profile */ + enum kbox_syscall_mode syscall_mode; /* --syscall-mode */ bool web; /* --web: enable web observatory */ int web_port; /* --web=PORT (default 8080) */ const char *web_bind; /* --web-bind ADDR */ diff --git a/include/kbox/elf.h b/include/kbox/elf.h index a7a6fed..f29ce0e 100644 --- a/include/kbox/elf.h +++ b/include/kbox/elf.h @@ -39,5 +39,88 @@ int kbox_find_elf_interp_loc(const unsigned char *buf, size_t out_size, uint64_t *offset_out, uint64_t *filesz_out); +int kbox_read_elf_header_window_fd(int fd, + unsigned char **buf_out, + size_t *buf_len_out); + +struct kbox_elf_exec_segment { + uint64_t file_offset; + uint64_t file_size; + uint64_t vaddr; + uint64_t mem_size; +}; + +#define KBOX_ELF_MAX_LOAD_SEGMENTS 16 + +struct kbox_elf_load_segment { + uint64_t file_offset; + uint64_t file_size; + uint64_t vaddr; + uint64_t mem_size; + uint64_t align; + uint64_t map_align; + uint64_t map_offset; + uint64_t map_start; + uint64_t map_size; + uint32_t flags; +}; + +struct kbox_elf_load_plan { + uint16_t machine; + uint16_t type; + uint64_t entry; + uint64_t phoff; + uint16_t phentsize; + uint16_t phnum; + uint64_t phdr_vaddr; + uint64_t phdr_size; + uint64_t min_vaddr; + uint64_t max_vaddr; + uint64_t load_size; + uint64_t interp_offset; + uint64_t interp_size; + uint32_t stack_flags; + size_t segment_count; + int has_interp; + int pie; + struct kbox_elf_load_segment segments[KBOX_ELF_MAX_LOAD_SEGMENTS]; +}; + +int kbox_build_elf_load_plan(const unsigned char *buf, + size_t buf_len, + uint64_t page_size, + struct kbox_elf_load_plan *plan); + +typedef int (*kbox_elf_exec_segment_cb)(const struct kbox_elf_exec_segment *seg, + const unsigned char *segment_bytes, + void *opaque); +typedef int (*kbox_elf_exec_segment_header_cb)( + const struct kbox_elf_exec_segment *seg, + void *opaque); + +/* Return the ELF machine type (e_machine) for a 64-bit little-endian image. 
*/ +int kbox_elf_machine(const unsigned char *buf, + size_t buf_len, + uint16_t *machine_out); + +/* Visit every PT_LOAD|PF_X segment with file-backed bytes (read-only). + * The callback receives a const pointer to segment bytes, suitable for + * analysis/scanning but not for in-place rewriting. A mutable variant + * will be needed when actual instruction replacement is implemented. + * Returns the number of visited segments on success or -1 on malformed ELF. + */ +int kbox_visit_elf_exec_segments(const unsigned char *buf, + size_t buf_len, + kbox_elf_exec_segment_cb cb, + void *opaque); + +/* Visit executable PT_LOAD segment metadata. Only the ELF header and program + * header table need to be present in @buf; the segment payload bytes are not + * dereferenced. + */ +int kbox_visit_elf_exec_segment_headers(const unsigned char *buf, + size_t buf_len, + kbox_elf_exec_segment_header_cb cb, + void *opaque); #endif /* KBOX_ELF_H */ diff --git a/include/kbox/probe.h b/include/kbox/probe.h index ace2aa9..f5a3e8c 100644 --- a/include/kbox/probe.h +++ b/include/kbox/probe.h @@ -3,16 +3,32 @@ #ifndef KBOX_PROBE_H #define KBOX_PROBE_H +#include "kbox/cli.h" + +struct kbox_probe_result { + int no_new_privs_ok; + int seccomp_filter_ok; + int seccomp_listener_ok; + int process_vm_readv_ok; +}; + /* Runtime host feature probing. * * Verify at startup that the host kernel supports the features kbox depends on. * Fail fast with a clear diagnostic if any check fails. + * + * All modes require basic seccomp filter support plus no_new_privs. + * SECCOMP and AUTO additionally require seccomp-unotify + process_vm_readv + * because they run the supervisor path today. TRAP and REWRITE skip those + * supervisor-specific probes. */ -/* Run all probes. +/* Run all probes for the given syscall mode. * Returns 0 on success, -1 if a required feature is unavailable. * Prints diagnostics to stderr. 
*/ -int kbox_probe_host_features(void); +int kbox_probe_host_features(enum kbox_syscall_mode mode); +int kbox_collect_probe_result(enum kbox_syscall_mode mode, + struct kbox_probe_result *out); #endif /* KBOX_PROBE_H */ diff --git a/include/kbox/x86-decode.h b/include/kbox/x86-decode.h new file mode 100644 index 0000000..4ab3bcf --- /dev/null +++ b/include/kbox/x86-decode.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef KBOX_X86_DECODE_H +#define KBOX_X86_DECODE_H + +#include +#include + +/* + * Minimal x86-64 instruction length decoder. + * + * Used by the rewrite scanner so syscall opcodes are only matched at real + * instruction boundaries, not inside immediates or other instruction bytes. + */ +int kbox_x86_insn_length(const unsigned char *code, size_t max_len); + +#endif /* KBOX_X86_DECODE_H */ diff --git a/mk/features.mk b/mk/features.mk index 5f6693e..b4863bf 100644 --- a/mk/features.mk +++ b/mk/features.mk @@ -11,9 +11,20 @@ SRCS = $(SRC_DIR)/main.c \ $(SRC_DIR)/lkl-wrap.c \ $(SRC_DIR)/fd-table.c \ $(SRC_DIR)/procmem.c \ + $(SRC_DIR)/syscall-request.c \ + $(SRC_DIR)/syscall-trap.c \ $(SRC_DIR)/path.c \ $(SRC_DIR)/identity.c \ $(SRC_DIR)/elf.c \ + $(SRC_DIR)/loader-entry.c \ + $(SRC_DIR)/loader-handoff.c \ + $(SRC_DIR)/loader-image.c \ + $(SRC_DIR)/loader-layout.c \ + $(SRC_DIR)/loader-launch.c \ + $(SRC_DIR)/loader-stack.c \ + $(SRC_DIR)/loader-transfer.c \ + $(SRC_DIR)/x86-decode.c \ + $(SRC_DIR)/rewrite.c \ $(SRC_DIR)/mount.c \ $(SRC_DIR)/probe.c \ $(SRC_DIR)/image.c \ @@ -34,7 +45,16 @@ ifeq ($(CONFIG_HAS_SLIRP),y) CFLAGS += -DKBOX_HAS_SLIRP -I$(SLIRP_DIR)/src SLIRP_SRCS = $(wildcard $(SLIRP_DIR)/src/*.c) SLIRP_OBJS = $(SLIRP_SRCS:.c=.o) + SLIRP_CFLAGS = $(filter-out -Wpedantic -Wshadow,$(CFLAGS)) + SLIRP_CFLAGS += -Wno-sign-compare -Wno-unused-variable -Wno-comment + SLIRP_CFLAGS += -Wno-return-type -Wno-pedantic SRCS += $(SLIRP_SRCS) + # Use a directory-specific pattern rule instead of target-specific CFLAGS. 
+ # $(SLIRP_OBJS): CFLAGS := ... would expand SLIRP_OBJS at parse time, + # before deps.mk has cloned minislirp, producing an empty target list. + $(SLIRP_DIR)/src/%.o: $(SLIRP_DIR)/src/%.c + @echo " CC $<" + $(Q)$(CC) $(SLIRP_CFLAGS) -MMD -MP -c -o $@ $< endif # Web observatory diff --git a/mk/tests.mk b/mk/tests.mk index 02855fb..267db9b 100644 --- a/mk/tests.mk +++ b/mk/tests.mk @@ -1,20 +1,53 @@ # mk/tests.mk - Test targets (unit, integration, stress, guest binaries) # Unit test files (no LKL dependency) +# Portable tests (compile on any host): TEST_DIR = tests/unit TEST_SRCS = $(TEST_DIR)/test-runner.c \ $(TEST_DIR)/test-fd-table.c \ $(TEST_DIR)/test-path.c \ $(TEST_DIR)/test-identity.c \ $(TEST_DIR)/test-syscall-nr.c \ - $(TEST_DIR)/test-elf.c + $(TEST_DIR)/test-elf.c \ + $(TEST_DIR)/test-x86-decode.c + +# Linux-only tests (depend on inline asm, siginfo_t/ucontext, memfd_create): +ifeq ($(shell uname -s),Linux) +TEST_SRCS += $(TEST_DIR)/test-rewrite.c \ + $(TEST_DIR)/test-procmem.c \ + $(TEST_DIR)/test-syscall-request.c \ + $(TEST_DIR)/test-syscall-trap.c \ + $(TEST_DIR)/test-loader-entry.c \ + $(TEST_DIR)/test-loader-handoff.c \ + $(TEST_DIR)/test-loader-image.c \ + $(TEST_DIR)/test-loader-layout.c \ + $(TEST_DIR)/test-loader-launch.c \ + $(TEST_DIR)/test-loader-stack.c \ + $(TEST_DIR)/test-loader-transfer.c +endif # Unit tests link only the pure-computation sources (no LKL) TEST_SUPPORT_SRCS = $(SRC_DIR)/fd-table.c \ $(SRC_DIR)/path.c \ $(SRC_DIR)/identity.c \ $(SRC_DIR)/syscall-nr.c \ - $(SRC_DIR)/elf.c + $(SRC_DIR)/elf.c \ + $(SRC_DIR)/x86-decode.c + +ifeq ($(shell uname -s),Linux) +TEST_SUPPORT_SRCS += $(SRC_DIR)/rewrite.c \ + $(TEST_DIR)/test-seccomp-stubs.c \ + $(SRC_DIR)/procmem.c \ + $(SRC_DIR)/syscall-request.c \ + $(SRC_DIR)/syscall-trap.c \ + $(SRC_DIR)/loader-entry.c \ + $(SRC_DIR)/loader-handoff.c \ + $(SRC_DIR)/loader-image.c \ + $(SRC_DIR)/loader-layout.c \ + $(SRC_DIR)/loader-launch.c \ + $(SRC_DIR)/loader-stack.c \ + 
$(SRC_DIR)/loader-transfer.c +endif TEST_TARGET = tests/unit/test-runner @@ -43,7 +76,7 @@ check-unit: $(TEST_TARGET) # We define LKL stubs for functions referenced by test support code. $(TEST_TARGET): $(TEST_SRCS) $(TEST_SUPPORT_SRCS) $(wildcard .config) @echo " LD $@" - $(Q)$(CC) $(CFLAGS) -DKBOX_UNIT_TEST -o $@ $(TEST_SRCS) $(TEST_SUPPORT_SRCS) $(LDFLAGS) + $(Q)$(CC) $(CFLAGS) -DKBOX_UNIT_TEST -o $@ $(TEST_SRCS) $(TEST_SUPPORT_SRCS) $(LDFLAGS) -lpthread check-integration: $(TARGET) guest-bins stress-bins $(ROOTFS) @echo " RUN check-integration" diff --git a/scripts/pre-commit.hook b/scripts/pre-commit.hook index 50d86c2..ec793fa 100755 --- a/scripts/pre-commit.hook +++ b/scripts/pre-commit.hook @@ -56,7 +56,9 @@ cppcheck_suppressions() { "unusedFunction" "syntaxError" "constParameterPointer" + "constParameterCallback" "constVariablePointer" + "constParameter" "unusedStructMember" "redundantAssignment" "staticFunction" @@ -64,6 +66,10 @@ cppcheck_suppressions() { "variableScope" "compareValueOutOfTypeRangeError" "constVariable" + "knownConditionTrueFalse" + "unreadVariable" + "redundantInitialization" + "shadowVariable" ) local out="--inline-suppr " diff --git a/src/cli.c b/src/cli.c index 83a4eba..f8f926b 100644 --- a/src/cli.c +++ b/src/cli.c @@ -6,6 +6,7 @@ #include #include "kbox/cli.h" +#include "rewrite.h" /* Long option codes for options without short equivalents */ enum { @@ -15,6 +16,7 @@ enum { OPT_NET, OPT_WEB, OPT_WEB_BIND, + OPT_SYSCALL_MODE, OPT_TRACE_FORMAT, OPT_HELP, }; @@ -38,6 +40,7 @@ static const struct option image_longopts[] = { {"net", no_argument, NULL, OPT_NET}, {"web", optional_argument, NULL, OPT_WEB}, {"web-bind", required_argument, NULL, OPT_WEB_BIND}, + {"syscall-mode", required_argument, NULL, OPT_SYSCALL_MODE}, {"trace-format", required_argument, NULL, OPT_TRACE_FORMAT}, {"help", no_argument, NULL, OPT_HELP}, {NULL, 0, NULL, 0}, @@ -71,6 +74,8 @@ void kbox_usage(const char *argv0) " --forward-verbose Verbose syscall forwarding\n" 
" --net Enable SLIRP user-mode networking\n" " --mount-profile P Mount profile: full (default), minimal\n" + " --syscall-mode MODE Syscall path: auto (default), " + "seccomp, trap, rewrite\n" " --web[=PORT] Enable web observatory (default: 8080)\n" " --web-bind ADDR Bind address for web (default: " "127.0.0.1)\n" @@ -88,6 +93,7 @@ static void image_defaults(struct kbox_image_args *img) img->command = "/bin/sh"; img->cmdline = "mem=1024M loglevel=4"; img->mount_profile = KBOX_MOUNT_FULL; + img->syscall_mode = KBOX_SYSCALL_MODE_AUTO; } static int parse_image_args(int argc, @@ -205,6 +211,15 @@ static int parse_image_args(int argc, return -1; #endif break; + case OPT_SYSCALL_MODE: + if (kbox_parse_syscall_mode(optarg, &img->syscall_mode) < 0) { + fprintf(stderr, + "unknown syscall mode: %s " + "(use 'seccomp', 'trap', 'rewrite', or 'auto')\n", + optarg); + return -1; + } + break; case OPT_TRACE_FORMAT: #ifdef KBOX_HAS_WEB if (strcmp(optarg, "json") != 0) { diff --git a/src/elf.c b/src/elf.c index ba9e84f..2072142 100644 --- a/src/elf.c +++ b/src/elf.c @@ -1,10 +1,16 @@ /* SPDX-License-Identifier: MIT */ +#include #include +#include #include +#include +#include #include "kbox/elf.h" +#include "io-util.h" + /* Little-endian readers. Use memcpy to avoid unaligned access on architectures * that trap on it (ARMv7 without SCTLR.A clear, etc.). 
*/ @@ -41,11 +47,22 @@ static uint64_t read_le64(const unsigned char *p) /* ELF magic: 0x7f 'E' 'L' 'F' */ static const unsigned char elf_magic[4] = {0x7f, 'E', 'L', 'F'}; -#define EI_CLASS 4 /* File class byte index */ -#define EI_DATA 5 /* Data encoding byte index */ -#define ELFCLASS64 2 /* 64-bit objects */ -#define ELFDATA2LSB 1 /* 2's complement, little endian */ -#define PT_INTERP 3 /* Program interpreter */ +#define EI_CLASS 4 /* File class byte index */ +#define EI_DATA 5 /* Data encoding byte index */ +#define ELFCLASS64 2 /* 64-bit objects */ +#define ELFDATA2LSB 1 /* 2's complement, little endian */ +#define E_TYPE_OFF 16 /* e_type: uint16 */ +#define EM_OFF 18 /* e_machine: uint16 */ +#define E_ENTRY_OFF 24 /* e_entry: uint64 */ +#define PT_INTERP 3 /* Program interpreter */ +#define PT_LOAD 1 /* Loadable program segment */ +#define PT_PHDR 6 /* Program header table */ +#define PT_GNU_STACK 0x6474e551 +#define ET_EXEC 2 +#define ET_DYN 3 +#define PF_R 0x4 +#define PF_W 0x2 +#define PF_X 0x1 /* Executable segment */ /* ELF64 header field offsets */ #define E_PHOFF_OFF 32 /* e_phoff: uint64 */ @@ -54,11 +71,54 @@ static const unsigned char elf_magic[4] = {0x7f, 'E', 'L', 'F'}; /* ELF64 program header field offsets (relative to phdr start) */ #define P_TYPE_OFF 0 /* p_type: uint32 */ +#define P_FLAGS_OFF 4 /* p_flags: uint32 */ #define P_OFFSET_OFF 8 /* p_offset: uint64 */ +#define P_VADDR_OFF 16 /* p_vaddr: uint64 */ #define P_FILESZ_OFF 32 /* p_filesz: uint64 */ +#define P_MEMSZ_OFF 40 /* p_memsz: uint64 */ +#define P_ALIGN_OFF 48 /* p_align: uint64 */ #define MIN_ELF_HDR 64 /* Minimum ELF64 header size */ #define MIN_PHENTSIZE 56 /* Minimum phdr entry size */ +#define MAX_ELF_HDR_WINDOW (256u * 1024) + +static int is_power_of_two_u64(uint64_t v) +{ + return v != 0 && (v & (v - 1)) == 0; +} + +static uint64_t align_down_u64(uint64_t value, uint64_t align) +{ + return value & ~(align - 1); +} + +static int align_up_u64(uint64_t value, uint64_t align, 
uint64_t *out) +{ + uint64_t sum; + + if (__builtin_add_overflow(value, align - 1, &sum)) + return -1; + *out = align_down_u64(sum, align); + return 0; +} + +static int segment_map_align(uint64_t page_size, + uint64_t p_align, + uint64_t *out) +{ + uint64_t align = page_size; + + if (!out || !is_power_of_two_u64(page_size)) + return -1; + if (p_align > 1) { + if (!is_power_of_two_u64(p_align)) + return -1; + if (p_align > align) + align = p_align; + } + *out = align; + return 0; +} int kbox_parse_elf_interp(const unsigned char *buf, size_t buf_len, @@ -91,9 +151,19 @@ int kbox_find_elf_interp_loc(const unsigned char *buf, if (phentsize < MIN_PHENTSIZE) return -1; + /* Reject if phdr table starts outside the buffer (bogus e_phoff). */ + if (phnum > 0 && phoff >= buf_len) + return -1; + for (uint16_t i = 0; i < phnum; i++) { - uint64_t off = phoff + (uint64_t) i * phentsize; - if (off + MIN_PHENTSIZE > buf_len) + uint64_t off; + uint64_t off_end; + + if (__builtin_add_overflow(phoff, (uint64_t) i * phentsize, &off)) + return -1; + if (__builtin_add_overflow(off, (uint64_t) MIN_PHENTSIZE, &off_end)) + return -1; + if (off_end > buf_len) break; uint32_t p_type = read_le32(buf + off + P_TYPE_OFF); @@ -134,3 +204,442 @@ int kbox_find_elf_interp_loc(const unsigned char *buf, return 0; } + +int kbox_elf_machine(const unsigned char *buf, + size_t buf_len, + uint16_t *machine_out) +{ + if (!buf || buf_len < MIN_ELF_HDR || !machine_out) + return -1; + + if (memcmp(buf, elf_magic, 4) != 0) + return -1; + + if (buf[EI_CLASS] != ELFCLASS64 || buf[EI_DATA] != ELFDATA2LSB) + return -1; + + *machine_out = read_le16(buf + EM_OFF); + return 0; +} + +int kbox_read_elf_header_window_fd(int fd, + unsigned char **buf_out, + size_t *buf_len_out) +{ + unsigned char hdr[MIN_ELF_HDR]; + uint64_t phoff; + uint16_t phentsize; + uint16_t phnum; + uint64_t ph_end; + size_t size; + unsigned char *buf; + ssize_t nr; + uint64_t interp_end = 0; + + if (fd < 0 || !buf_out || !buf_len_out) + return 
-1; + + nr = pread_full(fd, hdr, sizeof(hdr), 0); + if (nr < (ssize_t) sizeof(hdr)) { + if (nr >= 0) + errno = EIO; + return -1; + } + + if (memcmp(hdr, elf_magic, 4) != 0) + return -1; + if (hdr[EI_CLASS] != ELFCLASS64 || hdr[EI_DATA] != ELFDATA2LSB) + return -1; + + phoff = read_le64(hdr + E_PHOFF_OFF); + phentsize = read_le16(hdr + E_PHENTSIZE_OFF); + phnum = read_le16(hdr + E_PHNUM_OFF); + + if (phentsize < MIN_PHENTSIZE) + return -1; + if (__builtin_add_overflow(phoff, (uint64_t) phentsize * phnum, &ph_end)) + return -1; + + size = (size_t) ph_end; + if (size < sizeof(hdr)) + size = sizeof(hdr); + if (size > MAX_ELF_HDR_WINDOW) { + errno = EFBIG; + return -1; + } + + /* Use mmap(MAP_ANONYMOUS) instead of malloc. In trap mode, this + * function may run from a SIGSYS signal handler where the guest + * holds glibc heap locks, making malloc unsafe. + */ + buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, + -1, 0); + if (buf == MAP_FAILED) + return -1; + + nr = pread_full(fd, buf, size, 0); + if (nr < 0 || (size_t) nr != size) { + munmap(buf, size); + if (nr >= 0) + errno = EIO; + return -1; + } + + for (uint16_t i = 0; i < phnum; i++) { + uint64_t off = phoff + (uint64_t) i * phentsize; + uint32_t p_type; + uint64_t p_offset; + uint64_t p_filesz; + uint64_t end; + + if (off > size || MIN_PHENTSIZE > size - off) { + munmap(buf, size); + errno = EIO; + return -1; + } + p_type = read_le32(buf + off + P_TYPE_OFF); + if (p_type != PT_INTERP) + continue; + p_offset = read_le64(buf + off + P_OFFSET_OFF); + p_filesz = read_le64(buf + off + P_FILESZ_OFF); + if (__builtin_add_overflow(p_offset, p_filesz, &end)) { + munmap(buf, size); + return -1; + } + interp_end = end; + break; + } + + if (interp_end > size) { + unsigned char *grown; + size_t old_size = size; + + if (interp_end > MAX_ELF_HDR_WINDOW) { + munmap(buf, size); + errno = EFBIG; + return -1; + } + grown = mmap(NULL, (size_t) interp_end, PROT_READ | PROT_WRITE, + MAP_PRIVATE | 
MAP_ANONYMOUS, -1, 0); + if (grown == MAP_FAILED) { + munmap(buf, size); + return -1; + } + munmap(buf, old_size); + buf = grown; + nr = pread_full(fd, buf, (size_t) interp_end, 0); + if (nr < 0 || (uint64_t) nr != interp_end) { + munmap(buf, (size_t) interp_end); + if (nr >= 0) + errno = EIO; + return -1; + } + size = (size_t) interp_end; + } + + *buf_out = buf; + *buf_len_out = size; + return 0; +} + +int kbox_build_elf_load_plan(const unsigned char *buf, + size_t buf_len, + uint64_t page_size, + struct kbox_elf_load_plan *plan) +{ + uint64_t phoff; + uint16_t phentsize; + uint16_t phnum; + uint64_t phdr_vaddr = 0; + int phdr_vaddr_known = 0; + + if (!buf || !plan || buf_len < MIN_ELF_HDR || + !is_power_of_two_u64(page_size)) + return -1; + + if (memcmp(buf, elf_magic, 4) != 0) + return -1; + + if (buf[EI_CLASS] != ELFCLASS64 || buf[EI_DATA] != ELFDATA2LSB) + return -1; + + memset(plan, 0, sizeof(*plan)); + plan->type = read_le16(buf + E_TYPE_OFF); + plan->machine = read_le16(buf + EM_OFF); + plan->entry = read_le64(buf + E_ENTRY_OFF); + plan->phoff = read_le64(buf + E_PHOFF_OFF); + plan->phentsize = read_le16(buf + E_PHENTSIZE_OFF); + plan->phnum = read_le16(buf + E_PHNUM_OFF); + plan->pie = plan->type == ET_DYN; + + phoff = plan->phoff; + phentsize = plan->phentsize; + phnum = plan->phnum; + + if (plan->type != ET_EXEC && plan->type != ET_DYN) + return -1; + if (phentsize < MIN_PHENTSIZE) + return -1; + if (phnum > 0 && phoff >= buf_len) + return -1; + if (__builtin_mul_overflow((uint64_t) phentsize, (uint64_t) phnum, + &plan->phdr_size)) + return -1; + + for (uint16_t i = 0; i < phnum; i++) { + uint64_t off; + uint64_t off_end; + uint32_t p_type; + uint32_t p_flags; + uint64_t p_offset; + uint64_t p_vaddr; + uint64_t p_filesz; + uint64_t p_memsz; + uint64_t p_align; + uint64_t end; + + if (__builtin_add_overflow(phoff, (uint64_t) i * phentsize, &off)) + return -1; + if (__builtin_add_overflow(off, (uint64_t) MIN_PHENTSIZE, &off_end)) + return -1; + if (off_end 
> buf_len) + return -1; + + p_type = read_le32(buf + off + P_TYPE_OFF); + p_flags = read_le32(buf + off + P_FLAGS_OFF); + p_offset = read_le64(buf + off + P_OFFSET_OFF); + p_vaddr = read_le64(buf + off + P_VADDR_OFF); + p_filesz = read_le64(buf + off + P_FILESZ_OFF); + p_memsz = read_le64(buf + off + P_MEMSZ_OFF); + p_align = read_le64(buf + off + P_ALIGN_OFF); + + if (p_filesz > p_memsz) + return -1; + + /* Validate that p_offset + p_filesz does not overflow. We do + * NOT check against buf_len here because the caller may pass + * only the ELF header window (phdr table + interp), not the + * full file. Segment file-content bounds are validated at map + * time by the loader. + */ + if (p_filesz > 0) { + if (__builtin_add_overflow(p_offset, p_filesz, &end)) + return -1; + } + + if (p_type == PT_INTERP) { + plan->has_interp = 1; + plan->interp_offset = p_offset; + plan->interp_size = p_filesz; + continue; + } + + if (p_type == PT_GNU_STACK) { + plan->stack_flags = p_flags; + continue; + } + + if (p_type == PT_PHDR) { + phdr_vaddr = p_vaddr; + phdr_vaddr_known = 1; + continue; + } + + if (p_type != PT_LOAD || p_memsz == 0) + continue; + + if (plan->segment_count >= KBOX_ELF_MAX_LOAD_SEGMENTS) + return -1; + + { + struct kbox_elf_load_segment *seg = + &plan->segments[plan->segment_count]; + uint64_t map_align; + uint64_t map_start; + uint64_t map_offset; + uint64_t map_end; + + if (segment_map_align(page_size, p_align, &map_align) < 0) + return -1; + map_start = align_down_u64(p_vaddr, map_align); + map_offset = align_down_u64(p_offset, map_align); + if (__builtin_add_overflow(p_vaddr, p_memsz, &end)) + return -1; + if (align_up_u64(end, map_align, &map_end) < 0) + return -1; + if (map_end < map_start) + return -1; + + seg->file_offset = p_offset; + seg->file_size = p_filesz; + seg->vaddr = p_vaddr; + seg->mem_size = p_memsz; + seg->align = p_align; + seg->map_align = map_align; + seg->map_offset = map_offset; + seg->map_start = map_start; + seg->map_size = map_end - 
map_start; + seg->flags = p_flags; + + if (plan->segment_count == 0 || map_start < plan->min_vaddr) + plan->min_vaddr = map_start; + if (plan->segment_count == 0 || map_end > plan->max_vaddr) + plan->max_vaddr = map_end; + plan->segment_count++; + } + + if (!phdr_vaddr_known && plan->phdr_size > 0 && + p_filesz >= plan->phdr_size && phoff >= p_offset && + phoff - p_offset <= p_filesz - plan->phdr_size) { + phdr_vaddr = p_vaddr + (phoff - p_offset); + phdr_vaddr_known = 1; + } + } + + if (plan->segment_count == 0) + return -1; + if (plan->max_vaddr < plan->min_vaddr) + return -1; + + plan->load_size = plan->max_vaddr - plan->min_vaddr; + if (phdr_vaddr_known) + plan->phdr_vaddr = phdr_vaddr; + return 0; +} + +int kbox_visit_elf_exec_segments(const unsigned char *buf, + size_t buf_len, + kbox_elf_exec_segment_cb cb, + void *opaque) +{ + uint64_t phoff; + uint16_t phentsize; + uint16_t phnum; + int visited = 0; + + if (!buf || !cb || buf_len < MIN_ELF_HDR) + return -1; + + if (memcmp(buf, elf_magic, 4) != 0) + return -1; + + if (buf[EI_CLASS] != ELFCLASS64 || buf[EI_DATA] != ELFDATA2LSB) + return -1; + + phoff = read_le64(buf + E_PHOFF_OFF); + phentsize = read_le16(buf + E_PHENTSIZE_OFF); + phnum = read_le16(buf + E_PHNUM_OFF); + + if (phentsize < MIN_PHENTSIZE) + return -1; + + if (phnum > 0 && phoff >= buf_len) + return -1; + + for (uint16_t i = 0; i < phnum; i++) { + uint64_t off; + uint64_t off_end; + uint32_t p_type; + uint32_t p_flags; + uint64_t p_offset; + uint64_t p_filesz; + uint64_t p_vaddr; + uint64_t p_memsz; + uint64_t end; + struct kbox_elf_exec_segment seg; + + if (__builtin_add_overflow(phoff, (uint64_t) i * phentsize, &off)) + return -1; + if (__builtin_add_overflow(off, (uint64_t) MIN_PHENTSIZE, &off_end)) + return -1; + if (off_end > buf_len) + return -1; + + p_type = read_le32(buf + off + P_TYPE_OFF); + p_flags = read_le32(buf + off + P_FLAGS_OFF); + if (p_type != PT_LOAD || (p_flags & PF_X) == 0) + continue; + + p_offset = read_le64(buf + off + 
P_OFFSET_OFF); + p_vaddr = read_le64(buf + off + P_VADDR_OFF); + p_filesz = read_le64(buf + off + P_FILESZ_OFF); + p_memsz = read_le64(buf + off + P_MEMSZ_OFF); + + if (p_filesz == 0) + continue; + if (p_offset >= buf_len) + return -1; + if (__builtin_add_overflow(p_offset, p_filesz, &end) || end > buf_len) + return -1; + + seg.file_offset = p_offset; + seg.file_size = p_filesz; + seg.vaddr = p_vaddr; + seg.mem_size = p_memsz; + if (cb(&seg, buf + p_offset, opaque) < 0) + return -1; + visited++; + } + + return visited; +} + +int kbox_visit_elf_exec_segment_headers(const unsigned char *buf, + size_t buf_len, + kbox_elf_exec_segment_header_cb cb, + void *opaque) +{ + uint64_t phoff; + uint16_t phentsize; + uint16_t phnum; + int visited = 0; + + if (!buf || !cb || buf_len < MIN_ELF_HDR) + return -1; + + if (memcmp(buf, elf_magic, 4) != 0) + return -1; + + if (buf[EI_CLASS] != ELFCLASS64 || buf[EI_DATA] != ELFDATA2LSB) + return -1; + + phoff = read_le64(buf + E_PHOFF_OFF); + phentsize = read_le16(buf + E_PHENTSIZE_OFF); + phnum = read_le16(buf + E_PHNUM_OFF); + + if (phentsize < MIN_PHENTSIZE) + return -1; + if (phnum > 0 && phoff >= buf_len) + return -1; + + for (uint16_t i = 0; i < phnum; i++) { + uint64_t off; + uint64_t off_end; + uint32_t p_type; + uint32_t p_flags; + struct kbox_elf_exec_segment seg; + + if (__builtin_add_overflow(phoff, (uint64_t) i * phentsize, &off)) + return -1; + if (__builtin_add_overflow(off, (uint64_t) MIN_PHENTSIZE, &off_end)) + return -1; + if (off_end > buf_len) + return -1; + + p_type = read_le32(buf + off + P_TYPE_OFF); + p_flags = read_le32(buf + off + P_FLAGS_OFF); + if (p_type != PT_LOAD || (p_flags & PF_X) == 0) + continue; + + seg.file_offset = read_le64(buf + off + P_OFFSET_OFF); + seg.file_size = read_le64(buf + off + P_FILESZ_OFF); + seg.vaddr = read_le64(buf + off + P_VADDR_OFF); + seg.mem_size = read_le64(buf + off + P_MEMSZ_OFF); + + if (cb(&seg, opaque) < 0) + return -1; + visited++; + } + + return visited; +} diff --git 
a/src/fd-table.c b/src/fd-table.c index 9cf8e97..d91d10e 100644 --- a/src/fd-table.c +++ b/src/fd-table.c @@ -36,6 +36,7 @@ void kbox_fd_table_init(struct kbox_fd_table *t) t->entries[i].lkl_fd = -1; t->entries[i].host_fd = -1; t->entries[i].shadow_sp = -1; + t->entries[i].shadow_writeback = 0; t->entries[i].mirror_tty = 0; t->entries[i].cloexec = 0; } @@ -43,10 +44,13 @@ void kbox_fd_table_init(struct kbox_fd_table *t) t->low_fds[i].lkl_fd = -1; t->low_fds[i].host_fd = -1; t->low_fds[i].shadow_sp = -1; + t->low_fds[i].shadow_writeback = 0; t->low_fds[i].mirror_tty = 0; t->low_fds[i].cloexec = 0; } t->next_fd = KBOX_FD_BASE; + t->next_fast_fd = KBOX_FD_FAST_BASE; + t->next_hostonly_fd = KBOX_FD_HOSTONLY_BASE; } /* Auto-allocate: always from the high range (>= KBOX_FD_BASE). @@ -55,20 +59,22 @@ void kbox_fd_table_init(struct kbox_fd_table *t) long kbox_fd_table_insert(struct kbox_fd_table *t, long lkl_fd, int mirror_tty) { long start_idx = t->next_fd - KBOX_FD_BASE; + long limit_idx = KBOX_FD_FAST_BASE - KBOX_FD_BASE; long idx; if (start_idx < 0) start_idx = 0; - if (start_idx >= KBOX_FD_TABLE_MAX) + if (start_idx >= limit_idx) start_idx = 0; - for (idx = start_idx; idx < KBOX_FD_TABLE_MAX; idx++) { + for (idx = start_idx; idx < limit_idx; idx++) { if (t->entries[idx].lkl_fd == -1) { long vfd = idx + KBOX_FD_BASE; t->entries[idx].lkl_fd = lkl_fd; t->entries[idx].host_fd = -1; t->entries[idx].shadow_sp = -1; + t->entries[idx].shadow_writeback = 0; t->entries[idx].mirror_tty = mirror_tty; t->entries[idx].cloexec = 0; t->next_fd = vfd + 1; @@ -84,6 +90,7 @@ long kbox_fd_table_insert(struct kbox_fd_table *t, long lkl_fd, int mirror_tty) t->entries[idx].lkl_fd = lkl_fd; t->entries[idx].host_fd = -1; t->entries[idx].shadow_sp = -1; + t->entries[idx].shadow_writeback = 0; t->entries[idx].mirror_tty = mirror_tty; t->entries[idx].cloexec = 0; t->next_fd = vfd + 1; @@ -94,6 +101,53 @@ long kbox_fd_table_insert(struct kbox_fd_table *t, long lkl_fd, int mirror_tty) return -1; 
/* table truly full */ } +long kbox_fd_table_insert_fast(struct kbox_fd_table *t, + long lkl_fd, + int mirror_tty) +{ + long start_idx = t->next_fast_fd - KBOX_FD_BASE; + long base_idx = KBOX_FD_FAST_BASE - KBOX_FD_BASE; + long limit_idx = KBOX_FD_HOSTONLY_BASE - KBOX_FD_BASE; + long idx; + + if (start_idx < base_idx) + start_idx = base_idx; + if (start_idx >= limit_idx) + start_idx = base_idx; + + for (idx = start_idx; idx < limit_idx; idx++) { + if (t->entries[idx].lkl_fd == -1) { + long vfd = idx + KBOX_FD_BASE; + + t->entries[idx].lkl_fd = lkl_fd; + t->entries[idx].host_fd = -1; + t->entries[idx].shadow_sp = -1; + t->entries[idx].shadow_writeback = 0; + t->entries[idx].mirror_tty = mirror_tty; + t->entries[idx].cloexec = 0; + t->next_fast_fd = vfd + 1; + return vfd; + } + } + + for (idx = base_idx; idx < start_idx; idx++) { + if (t->entries[idx].lkl_fd == -1) { + long vfd = idx + KBOX_FD_BASE; + + t->entries[idx].lkl_fd = lkl_fd; + t->entries[idx].host_fd = -1; + t->entries[idx].shadow_sp = -1; + t->entries[idx].shadow_writeback = 0; + t->entries[idx].mirror_tty = mirror_tty; + t->entries[idx].cloexec = 0; + t->next_fast_fd = vfd + 1; + return vfd; + } + } + + return -1; +} + int kbox_fd_table_insert_at(struct kbox_fd_table *t, long fd, long lkl_fd, @@ -106,6 +160,7 @@ int kbox_fd_table_insert_at(struct kbox_fd_table *t, e->lkl_fd = lkl_fd; e->host_fd = -1; e->shadow_sp = -1; + e->shadow_writeback = 0; e->mirror_tty = mirror_tty; e->cloexec = 0; @@ -145,9 +200,23 @@ long kbox_fd_table_remove(struct kbox_fd_table *t, long fd) #endif e->host_fd = -1; e->shadow_sp = -1; + e->shadow_writeback = 0; e->lkl_fd = -1; e->mirror_tty = 0; e->cloexec = 0; + if (fd >= KBOX_FD_HOSTONLY_BASE && fd < KBOX_FD_BASE + KBOX_FD_TABLE_MAX && + (t->next_hostonly_fd < KBOX_FD_HOSTONLY_BASE || + fd < t->next_hostonly_fd)) { + t->next_hostonly_fd = fd; + } + if (fd >= KBOX_FD_FAST_BASE && fd < KBOX_FD_HOSTONLY_BASE && + (t->next_fast_fd < KBOX_FD_FAST_BASE || fd < t->next_fast_fd)) { + 
t->next_fast_fd = fd; + } + if (fd >= KBOX_FD_BASE && fd < KBOX_FD_FAST_BASE && + (t->next_fd < KBOX_FD_BASE || fd < t->next_fd)) { + t->next_fd = fd; + } return old; } @@ -181,6 +250,7 @@ static void clear_entry(struct kbox_fd_entry *e) e->lkl_fd = -1; e->host_fd = -1; e->shadow_sp = -1; + e->shadow_writeback = 0; e->mirror_tty = 0; e->cloexec = 0; } diff --git a/src/fd-table.h b/src/fd-table.h index d2e7456..ff9f024 100644 --- a/src/fd-table.h +++ b/src/fd-table.h @@ -16,29 +16,37 @@ struct kbox_sysnrs; /* forward declaration */ #define KBOX_FD_BASE 32768 #define KBOX_FD_TABLE_MAX 4096 +#define KBOX_FD_FAST_BASE (KBOX_FD_BASE + (KBOX_FD_TABLE_MAX / 2)) +#define KBOX_FD_HOSTONLY_BASE (KBOX_FD_BASE + ((KBOX_FD_TABLE_MAX * 3) / 4)) /* redirect slots for FDs 0..1023 (dup2 targets) */ #define KBOX_LOW_FD_MAX 1024 #define KBOX_FD_TABLE_CAPACITY (KBOX_FD_TABLE_MAX + KBOX_LOW_FD_MAX) struct kbox_fd_entry { - long lkl_fd; /* LKL-internal FD, -1 if slot is free */ - long host_fd; /* host memfd shadow / tracee FD number, -1 if none */ - int shadow_sp; /* supervisor's dup of shadow socket sp[1], -1 if none. - * Kept alive so dup/dup2/dup3 can inject new copies into - * the tracee via ADDFD. - */ - int mirror_tty; /* 1 if this FD mirrors a host TTY */ - int cloexec; /* O_CLOEXEC tracking */ + long lkl_fd; /* LKL-internal FD, -1 if slot is free */ + long host_fd; /* host memfd shadow / tracee FD number, -1 if none */ + int shadow_sp; /* supervisor's dup of shadow socket sp[1], -1 if none. + * Kept alive so dup/dup2/dup3 can inject new copies into + * the tracee via ADDFD. 
+ */ + int shadow_writeback; /* 1 if shadow_sp must be synced back to lkl_fd */ + int mirror_tty; /* 1 if this FD mirrors a host TTY */ + int cloexec; /* O_CLOEXEC tracking */ }; struct kbox_fd_table { struct kbox_fd_entry entries[KBOX_FD_TABLE_MAX]; struct kbox_fd_entry low_fds[KBOX_LOW_FD_MAX]; /* dup2 redirect slots */ - long next_fd; /* Next virtual FD to allocate */ + long next_fd; /* Next virtual FD to allocate */ + long next_fast_fd; /* Next host-shadow fast FD to allocate */ + long next_hostonly_fd; /* Next host-only cached-shadow FD to allocate */ }; void kbox_fd_table_init(struct kbox_fd_table *t); long kbox_fd_table_insert(struct kbox_fd_table *t, long lkl_fd, int mirror_tty); +long kbox_fd_table_insert_fast(struct kbox_fd_table *t, + long lkl_fd, + int mirror_tty); int kbox_fd_table_insert_at(struct kbox_fd_table *t, long fd, long lkl_fd, diff --git a/src/image.c b/src/image.c index 4f28a72..1d0d164 100644 --- a/src/image.c +++ b/src/image.c @@ -10,7 +10,12 @@ #include #include #include +#include #include +#include +#include +#include +#include #include #include "kbox/elf.h" @@ -19,13 +24,19 @@ #include "kbox/mount.h" #include "kbox/probe.h" #include "lkl-wrap.h" +#include "loader-launch.h" #include "net.h" +#include "rewrite.h" #include "seccomp.h" #include "shadow-fd.h" +#include "syscall-trap.h" #ifdef KBOX_HAS_WEB #include "web.h" #endif +int kbox_rewrite_has_fork_sites_memfd(int fd, + const struct kbox_host_nrs *host_nrs); + /* Determine the root image path from the three mutually exclusive options. * Returns the path, or NULL on error. */ @@ -69,6 +80,641 @@ static const char *join_mount_opts(const struct kbox_image_args *a, return buf; } +extern char **environ; + +/* AUTO fast-path selection: on aarch64, the in-process trap/rewrite path + * delivers 21x faster stat (LKL inode cache, no USER_NOTIF round-trip). 
+ * On x86_64, seccomp is faster across the board because the USER_NOTIF + * overhead is lower (~10us vs ~20us on aarch64) and the SIGSYS service + * thread round-trip makes open+close slower in trap mode. + */ +#if defined(__aarch64__) +#define KBOX_AUTO_ENABLE_USERSPACE_FAST_PATH 1 +#else +#define KBOX_AUTO_ENABLE_USERSPACE_FAST_PATH 0 +#endif + +static int is_shell_command(const char *command) +{ + static const char *const shells[] = {"sh", "bash", "ash", "zsh", "dash", + "fish", "csh", "tcsh", "ksh", NULL}; + const char *base = strrchr(command, '/'); + + base = base ? base + 1 : command; + for (const char *const *s = shells; *s; s++) { + if (strcmp(base, *s) == 0) + return 1; + } + return 0; +} + +#if KBOX_AUTO_ENABLE_USERSPACE_FAST_PATH +/* Decide whether AUTO mode should prefer the userspace fast path (trap/rewrite) + * over the seccomp supervisor path for a given binary. + * + * Considers the combined syscall site count from both the main binary and its + * interpreter (if dynamic). The fast path is viable as long as ANY executable + * segment has rewritable sites -- trap mode catches everything via SIGSYS, and + * rewrite mode patches sites in both the main binary and the interpreter. + * + * Returns 1 to use trap/rewrite, 0 to fall back to seccomp. + */ +static int auto_prefers_userspace_fast_path( + const struct kbox_rewrite_report *exec_report, + const struct kbox_rewrite_report *interp_report, + int has_fork_sites) +{ + if (!exec_report) + return 0; + + /* Trap/rewrite mode duplicates the in-process LKL state on fork, so + * parent and child see independent filesystem state. AUTO selects the + * fast path for non-shell commands that do not contain fork/clone + * wrapper sites in the main executable. The interpreter (libc) is + * NOT scanned because it always contains fork wrappers regardless of + * whether the specific program uses them. + * + * Dynamic binaries whose main executable has no fork wrappers get + * the fast path. 
If the program does fork through libc, children + * inherit their own LKL copy and run independently. This is a known + * trade-off: cross-process filesystem coherence is not guaranteed in + * trap/rewrite mode. + */ + if (has_fork_sites) + return 0; + + if (exec_report->candidate_count == 0 && + (!interp_report || interp_report->candidate_count == 0)) + return 0; + + return 1; +} +#endif + +static void maybe_apply_virtual_procinfo_fast_path(int fd, + const char *label, + int verbose) +{ + struct kbox_rewrite_report report; + size_t applied = 0; + + if (fd < 0) + return; + if (kbox_rewrite_analyze_memfd(fd, &report) < 0) + return; + if (report.arch != KBOX_REWRITE_ARCH_X86_64 && + report.arch != KBOX_REWRITE_ARCH_AARCH64) + return; + if (kbox_rewrite_apply_virtual_procinfo_memfd(fd, &applied, &report) < 0) + return; + if (verbose && applied > 0) { + fprintf(stderr, + "kbox: seccomp procinfo fast path: %s: patched %zu wrapper%s\n", + label ? label : "memfd", applied, applied == 1 ? "" : "s"); + } +} + +static uint32_t memfd_wrapper_family_mask(int fd, + const struct kbox_host_nrs *host_nrs) +{ + uint32_t mask = 0; + + if (fd < 0 || !host_nrs) + return 0; + if (kbox_rewrite_wrapper_family_mask_memfd(fd, host_nrs, &mask) < 0) + return 0; + return mask; +} + +static int wrapper_family_mask_has_stat(uint32_t mask) +{ + return (mask & KBOX_REWRITE_WRAPPER_FAMILY_STAT) != 0; +} + +static int wrapper_family_mask_has_open(uint32_t mask) +{ + return (mask & KBOX_REWRITE_WRAPPER_FAMILY_OPEN) != 0; +} + +static int memfd_has_stat_wrapper_fast_candidates( + int fd, + const struct kbox_host_nrs *host_nrs) +{ + return wrapper_family_mask_has_stat( + memfd_wrapper_family_mask(fd, host_nrs)); +} + +static int memfd_has_open_wrapper_fast_candidates( + int fd, + const struct kbox_host_nrs *host_nrs) +{ + return wrapper_family_mask_has_open( + memfd_wrapper_family_mask(fd, host_nrs)); +} + +struct wrapper_candidate_count_ctx { + size_t count; + int filter_enabled; + enum 
kbox_rewrite_wrapper_candidate_kind kind_filter; +}; + +static int count_wrapper_candidate_cb( + const struct kbox_rewrite_wrapper_candidate *candidate, + void *opaque) +{ + struct wrapper_candidate_count_ctx *ctx = opaque; + + if (!candidate || !ctx) + return -1; + if (ctx->filter_enabled && candidate->kind != ctx->kind_filter) + return 0; + ctx->count++; + return 0; +} + +static size_t memfd_count_wrapper_family_candidates( + int fd, + const struct kbox_host_nrs *host_nrs, + uint32_t family_mask) +{ + struct wrapper_candidate_count_ctx ctx; + + memset(&ctx, 0, sizeof(ctx)); + ctx.filter_enabled = 0; + if (fd < 0 || !host_nrs || family_mask == 0) + return 0; + if (kbox_rewrite_visit_memfd_wrapper_candidates( + fd, host_nrs, family_mask, count_wrapper_candidate_cb, &ctx) < 0) { + return 0; + } + return ctx.count; +} + +static size_t memfd_count_wrapper_family_candidates_by_kind( + int fd, + const struct kbox_host_nrs *host_nrs, + uint32_t family_mask, + enum kbox_rewrite_wrapper_candidate_kind kind) +{ + struct wrapper_candidate_count_ctx ctx; + + memset(&ctx, 0, sizeof(ctx)); + ctx.filter_enabled = 1; + ctx.kind_filter = kind; + if (fd < 0 || !host_nrs || family_mask == 0) + return 0; + if (kbox_rewrite_visit_memfd_wrapper_candidates( + fd, host_nrs, family_mask, count_wrapper_candidate_cb, &ctx) < 0) { + return 0; + } + return ctx.count; +} + +static size_t memfd_count_phase1_path_candidates( + int fd, + const struct kbox_host_nrs *host_nrs) +{ + struct kbox_rewrite_wrapper_candidate candidates[16]; + size_t count = 0; + + if (fd < 0 || !host_nrs) + return 0; + if (kbox_rewrite_collect_memfd_phase1_path_candidates( + fd, host_nrs, candidates, + sizeof(candidates) / sizeof(candidates[0]), &count) < 0) { + return 0; + } + return count; +} + +struct wrapper_candidate_log_ctx { + const char *label; + const char *family_name; + const char *prefix; +}; + +static const char *wrapper_candidate_kind_name( + enum kbox_rewrite_wrapper_candidate_kind kind) +{ + switch (kind) 
{ + case KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT: + return "direct"; + case KBOX_REWRITE_WRAPPER_CANDIDATE_SYSCALL_CANCEL: + return "cancel"; + default: + return "unknown"; + } +} + +static int log_wrapper_candidate_cb( + const struct kbox_rewrite_wrapper_candidate *candidate, + void *opaque) +{ + const struct wrapper_candidate_log_ctx *ctx = opaque; + + if (!candidate || !ctx) + return -1; + fprintf(stderr, + "kbox: %s %s%s: off=0x%llx vaddr=0x%llx " + "nr=%llu kind=%s\n", + ctx->label ? ctx->label : "memfd", + ctx->family_name ? ctx->family_name : "path", + ctx->prefix ? ctx->prefix : "-wrapper candidate", + (unsigned long long) candidate->file_offset, + (unsigned long long) candidate->vaddr, + (unsigned long long) candidate->nr, + wrapper_candidate_kind_name(candidate->kind)); + return 0; +} + +static void maybe_log_wrapper_family_candidates( + const char *label, + int fd, + const struct kbox_host_nrs *host_nrs, + uint32_t family_mask, + const char *family_name, + int verbose) +{ + struct kbox_rewrite_wrapper_candidate candidates[16]; + struct wrapper_candidate_log_ctx ctx; + size_t count = 0; + + if (!verbose || fd < 0 || !host_nrs || family_mask == 0) + return; + if (kbox_rewrite_collect_memfd_wrapper_candidates( + fd, host_nrs, family_mask, candidates, + sizeof(candidates) / sizeof(candidates[0]), &count) < 0) { + return; + } + ctx.label = label; + ctx.family_name = family_name; + ctx.prefix = "-wrapper candidate"; + for (size_t i = 0; + i < count && i < (sizeof(candidates) / sizeof(candidates[0])); i++) { + (void) log_wrapper_candidate_cb(&candidates[i], &ctx); + } +} + +static void maybe_log_phase1_path_candidates( + const char *label, + int fd, + const struct kbox_host_nrs *host_nrs, + int verbose) +{ + struct kbox_rewrite_wrapper_candidate candidates[16]; + struct wrapper_candidate_log_ctx ctx; + size_t count = 0; + + if (!verbose || fd < 0 || !host_nrs) + return; + if (kbox_rewrite_collect_memfd_phase1_path_candidates( + fd, host_nrs, candidates, + 
sizeof(candidates) / sizeof(candidates[0]), &count) < 0) { + return; + } + ctx.label = label; + ctx.family_name = "phase1-path"; + ctx.prefix = " target"; + for (size_t i = 0; + i < count && i < (sizeof(candidates) / sizeof(candidates[0])); i++) { + (void) log_wrapper_candidate_cb(&candidates[i], &ctx); + } +} + +static size_t count_envp(char *const *envp) +{ + size_t n = 0; + + if (!envp) + return 0; + while (envp[n]) + n++; + return n; +} + +static const char **build_loader_argv(const char *command, + const char *const *extra_args, + int extra_argc) +{ + size_t argc = (size_t) extra_argc + 1; + const char **argv = calloc(argc + 1, sizeof(*argv)); + + if (!argv) + return NULL; + argv[0] = command; + for (int i = 0; i < extra_argc; i++) + argv[i + 1] = extra_args[i]; + return argv; +} + +static int prepare_userspace_launch(const struct kbox_image_args *args, + const char *command, + int exec_memfd, + int interp_memfd, + uid_t override_uid, + gid_t override_gid, + struct kbox_loader_launch *launch) +{ + unsigned char launch_random[KBOX_LOADER_RANDOM_SIZE]; + struct kbox_loader_launch_spec spec; + const char **argv = NULL; + size_t argc = (size_t) args->extra_argc + 1; + uint32_t uid = (uint32_t) (args->root_id || args->system_root + ? 0 + : (override_uid != (uid_t) -1 ? override_uid + : getuid())); + uint32_t gid = (uint32_t) (args->root_id || args->system_root + ? 0 + : (override_gid != (gid_t) -1 ? override_gid + : getgid())); + int rc; + + if (!launch || exec_memfd < 0) + return -1; + + /* Fill AT_RANDOM with real entropy for stack canary and libc PRNG seeding. + * Fall back to zeros only if getrandom is unavailable. 
+ */ + memset(launch_random, 0, sizeof(launch_random)); + (void) getrandom(launch_random, sizeof(launch_random), 0); + + argv = build_loader_argv(command, args->extra_args, args->extra_argc); + if (!argv) + return -1; + + memset(&spec, 0, sizeof(spec)); + spec.exec_fd = exec_memfd; + spec.interp_fd = interp_memfd; + spec.argv = argv; + spec.argc = argc; + spec.envp = (const char *const *) environ; + spec.envc = count_envp(environ); + spec.execfn = command; + spec.random_bytes = launch_random; + spec.page_size = (uint64_t) sysconf(_SC_PAGESIZE); + spec.stack_top = 0x700000010000ULL; + spec.main_load_bias = 0x600000000000ULL; + spec.interp_load_bias = 0x610000000000ULL; + spec.uid = uid; + spec.euid = uid; + spec.gid = gid; + spec.egid = gid; + spec.secure = 0; + + rc = kbox_loader_prepare_launch(&spec, launch); + free(argv); + return rc; +} + +static const struct kbox_host_nrs *select_host_nrs(void) +{ +#if defined(__x86_64__) + return &HOST_NRS_X86_64; +#elif defined(__aarch64__) + return &HOST_NRS_AARCH64; +#else + return NULL; +#endif +} + +static int collect_trap_exec_ranges(const struct kbox_loader_launch *launch, + struct kbox_syscall_trap_ip_range *ranges, + size_t range_cap, + size_t *range_count) +{ + struct kbox_loader_exec_range exec_ranges[KBOX_LOADER_MAX_MAPPINGS]; + size_t exec_count = 0; + + if (!launch || !ranges || !range_count) + return -1; + if (kbox_loader_collect_exec_ranges( + launch, exec_ranges, KBOX_LOADER_MAX_MAPPINGS, &exec_count) < 0) { + return -1; + } + if (exec_count > range_cap) + return -1; + + for (size_t i = 0; i < exec_count; i++) { + ranges[i].start = (uintptr_t) exec_ranges[i].start; + ranges[i].end = (uintptr_t) exec_ranges[i].end; + } + *range_count = exec_count; + return 0; +} + +static void drop_launch_caps(void) +{ + prctl(47 /* PR_CAP_AMBIENT */, 4 /* PR_CAP_AMBIENT_CLEAR_ALL */, 0, 0, 0); + for (int cap = 0; cap <= 63; cap++) + prctl(24 /* PR_CAPBSET_DROP */, cap, 0, 0, 0); +} + +static int set_launch_rlimits(void) +{ + 
struct rlimit nofile = {65536, 65536}; + struct rlimit rtprio = {0, 0}; + struct rlimit current; + rlim_t required_nofile = (rlim_t) (KBOX_FD_BASE + KBOX_FD_TABLE_MAX); + + if (setrlimit(RLIMIT_NOFILE, &nofile) != 0) + return -1; + if (getrlimit(RLIMIT_NOFILE, &current) != 0) + return -1; + if (current.rlim_cur < required_nofile) { + errno = EMFILE; + return -1; + } + if (setrlimit(RLIMIT_RTPRIO, &rtprio) != 0) + return -1; + return 0; +} + +static void dump_loader_launch(const struct kbox_loader_launch *launch) +{ + if (!launch) + return; + + fprintf(stderr, "kbox: trap launch: pc=0x%llx sp=0x%llx mappings=%zu\n", + (unsigned long long) launch->transfer.pc, + (unsigned long long) launch->transfer.sp, + launch->layout.mapping_count); + for (size_t i = 0; i < launch->layout.mapping_count; i++) { + const struct kbox_loader_mapping *mapping = &launch->layout.mappings[i]; + + fprintf(stderr, + "kbox: trap launch: map[%zu] src=%d addr=0x%llx size=0x%llx " + "prot=%d flags=0x%x file_off=0x%llx file_size=0x%llx " + "zero_fill=0x%llx+0x%llx\n", + i, mapping->source, (unsigned long long) mapping->addr, + (unsigned long long) mapping->size, mapping->prot, + mapping->flags, (unsigned long long) mapping->file_offset, + (unsigned long long) mapping->file_size, + (unsigned long long) mapping->zero_fill_start, + (unsigned long long) mapping->zero_fill_size); + } +} + +static void init_launch_ctx(struct kbox_supervisor_ctx *ctx, + struct kbox_fd_table *fd_table, + const struct kbox_image_args *args, + const struct kbox_sysnrs *sysnrs, + const struct kbox_host_nrs *host_nrs, + struct kbox_web_ctx *web_ctx) +{ + kbox_fd_table_init(fd_table); + memset(ctx, 0, sizeof(*ctx)); + ctx->sysnrs = sysnrs; + ctx->host_nrs = host_nrs; + ctx->fd_table = fd_table; + ctx->listener_fd = -1; + ctx->proc_self_fd_dirfd = -1; + ctx->proc_mem_fd = -1; + ctx->child_pid = getpid(); + ctx->host_root = NULL; + ctx->verbose = args->verbose; + ctx->root_identity = args->root_id || args->system_root; + 
ctx->override_uid = (uid_t) -1; + ctx->override_gid = (gid_t) -1; + ctx->normalize = args->normalize; + ctx->guest_mem_ops = &kbox_current_guest_mem_ops; + ctx->active_guest_mem.ops = &kbox_current_guest_mem_ops; + ctx->active_guest_mem.opaque = 0; + ctx->fd_inject_ops = NULL; + ctx->web = web_ctx; +} + +static int run_trap_launch(const struct kbox_image_args *args, + const struct kbox_sysnrs *sysnrs, + struct kbox_loader_launch *launch, + struct kbox_web_ctx *web_ctx) +{ + struct kbox_syscall_trap_ip_range ranges[KBOX_LOADER_MAX_MAPPINGS]; + const struct kbox_host_nrs *host_nrs = select_host_nrs(); + struct kbox_fd_table fd_table; + struct kbox_supervisor_ctx ctx; + struct kbox_syscall_trap_runtime runtime; + size_t range_count = 0; + + if (!host_nrs || !launch) + return -1; + if (collect_trap_exec_ranges(launch, ranges, KBOX_LOADER_MAX_MAPPINGS, + &range_count) < 0) { + fprintf(stderr, + "kbox: trap launch failed: cannot collect guest exec ranges\n"); + return -1; + } + if (args->verbose) + dump_loader_launch(launch); + + init_launch_ctx(&ctx, &fd_table, args, sysnrs, host_nrs, web_ctx); + + if (kbox_syscall_trap_runtime_install(&runtime, &ctx) < 0) { + fprintf(stderr, + "kbox: trap launch failed: cannot install SIGSYS handler\n"); + return -1; + } + + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) { + fprintf(stderr, "prctl(PR_SET_NO_NEW_PRIVS): %s\n", strerror(errno)); + kbox_syscall_trap_runtime_uninstall(&runtime); + return -1; + } + + drop_launch_caps(); + if (set_launch_rlimits() < 0) { + fprintf(stderr, "kbox: trap launch failed: RLIMIT_NOFILE too low\n"); + kbox_syscall_trap_runtime_uninstall(&runtime); + return -1; + } + + if (kbox_install_seccomp_trap_ranges(host_nrs, ranges, range_count) < 0) { + fprintf(stderr, + "kbox: trap launch failed: cannot install guest trap filter\n"); + kbox_syscall_trap_runtime_uninstall(&runtime); + return -1; + } + + kbox_loader_transfer_to_guest(&launch->transfer); +} + +static int run_rewrite_launch(const struct 
kbox_image_args *args, + const struct kbox_sysnrs *sysnrs, + struct kbox_loader_launch *launch, + struct kbox_web_ctx *web_ctx) +{ + struct kbox_syscall_trap_ip_range ranges[KBOX_LOADER_MAX_MAPPINGS]; + const struct kbox_host_nrs *host_nrs = select_host_nrs(); + struct kbox_fd_table fd_table; + struct kbox_supervisor_ctx ctx; + struct kbox_syscall_trap_runtime trap_runtime; + struct kbox_rewrite_runtime rewrite_runtime; + size_t range_count = 0; + + if (!host_nrs || !launch) + return -1; +#if defined(__x86_64__) + if (args->verbose) { + fprintf( + stderr, + "kbox: rewrite launch on x86_64 currently falls back to trap\n"); + } + return run_trap_launch(args, sysnrs, launch, web_ctx); +#endif + if (collect_trap_exec_ranges(launch, ranges, KBOX_LOADER_MAX_MAPPINGS, + &range_count) < 0) { + fprintf( + stderr, + "kbox: rewrite launch failed: cannot collect guest exec ranges\n"); + return -1; + } + if (args->verbose) + dump_loader_launch(launch); + + init_launch_ctx(&ctx, &fd_table, args, sysnrs, host_nrs, web_ctx); + + if (kbox_rewrite_runtime_install(&rewrite_runtime, &ctx, launch) < 0) { + fprintf( + stderr, + "kbox: rewrite launch failed: cannot install rewrite runtime: %s\n", + strerror(errno)); + return -1; + } + + if (kbox_syscall_trap_runtime_install(&trap_runtime, &ctx) < 0) { + fprintf(stderr, + "kbox: rewrite launch failed: cannot install SIGSYS handler\n"); + kbox_rewrite_runtime_reset(&rewrite_runtime); + return -1; + } + + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) { + fprintf(stderr, "prctl(PR_SET_NO_NEW_PRIVS): %s\n", strerror(errno)); + kbox_syscall_trap_runtime_uninstall(&trap_runtime); + kbox_rewrite_runtime_reset(&rewrite_runtime); + return -1; + } + + drop_launch_caps(); + if (set_launch_rlimits() < 0) { + fprintf(stderr, "kbox: rewrite launch failed: RLIMIT_NOFILE too low\n"); + kbox_syscall_trap_runtime_uninstall(&trap_runtime); + kbox_rewrite_runtime_reset(&rewrite_runtime); + return -1; + } + + if 
(kbox_install_seccomp_rewrite_ranges(host_nrs, ranges, range_count) < + 0) { + fprintf( + stderr, + "kbox: rewrite launch failed: cannot install guest trap filter\n"); + kbox_syscall_trap_runtime_uninstall(&trap_runtime); + kbox_rewrite_runtime_reset(&rewrite_runtime); + return -1; + } + + kbox_loader_transfer_to_guest(&launch->transfer); +} + /* Public entry point. */ int kbox_run_image(const struct kbox_image_args *args) @@ -84,12 +730,14 @@ int kbox_run_image(const struct kbox_image_args *args) const char *work_dir; const char *command; const struct kbox_sysnrs *sysnrs; + enum kbox_syscall_mode probe_mode; long ret; struct kbox_bind_spec bind_specs[KBOX_MAX_BIND_MOUNTS]; int bind_count = 0; int i; uid_t override_uid = (uid_t) -1; gid_t override_gid = (gid_t) -1; + int rewrite_requested = 0; /* Resolve parameters with defaults. */ root_path = select_root_path(args); @@ -99,6 +747,27 @@ int kbox_run_image(const struct kbox_image_args *args) fs_type = args->fs_type ? args->fs_type : "ext4"; work_dir = args->work_dir ? args->work_dir : "/"; command = args->command ? args->command : "/bin/sh"; + probe_mode = args->syscall_mode; + rewrite_requested = args->syscall_mode == KBOX_SYSCALL_MODE_REWRITE; + + /* AUTO enables rewrite analysis for non-shell commands so the + * auto_prefers_userspace_fast_path() selection function can see the + * exec_report and make an informed decision. Shell commands always + * fall back to seccomp (fork coherence), so skip the analysis. + */ + if (args->syscall_mode == KBOX_SYSCALL_MODE_AUTO) { +#if KBOX_AUTO_ENABLE_USERSPACE_FAST_PATH + if (!is_shell_command(command)) + rewrite_requested = 1; +#else + /* On current x86_64 and aarch64 builds, AUTO is intentionally + * pinned to seccomp. Skip rewrite/trap analysis entirely and probe + * the supervisor path directly so startup does not do dead work or + * print duplicate probe messages. + */ + probe_mode = KBOX_SYSCALL_MODE_SECCOMP; +#endif + } /* Parse bind mount specs. 
*/ for (i = 0; i < args->bind_mount_count; i++) { @@ -221,8 +890,8 @@ int kbox_run_image(const struct kbox_image_args *args) } } - /* Probe host features. */ - if (kbox_probe_host_features() < 0) { + /* Probe host features. Rewrite mode skips seccomp-specific probes. */ + if (kbox_probe_host_features(probe_mode) < 0) { if (args->net) kbox_net_cleanup(); return -1; @@ -276,6 +945,9 @@ int kbox_run_image(const struct kbox_image_args *args) int exec_memfd; int interp_memfd = -1; int rc = -1; + struct kbox_loader_launch launch; + + memset(&launch, 0, sizeof(launch)); lkl_fd = kbox_lkl_openat(sysnrs, AT_FDCWD_LINUX, command, O_RDONLY, 0); if (lkl_fd < 0) { @@ -293,22 +965,118 @@ int kbox_run_image(const struct kbox_image_args *args) goto err_net; } - /* Check for PT_INTERP (dynamic binary). Read the first 4 KB of memfd; - * enough for the ELF header and program header table of any reasonable - * binary. - */ + /* Check for PT_INTERP (dynamic binary). */ { - unsigned char elf_buf[4096]; - ssize_t nr = pread(exec_memfd, elf_buf, sizeof(elf_buf), 0); + struct kbox_rewrite_report exec_report; + struct kbox_rewrite_trampoline_probe exec_probe; + const struct kbox_host_nrs *host_nrs = select_host_nrs(); + int scan_path_wrapper_candidates = + rewrite_requested || + (args->syscall_mode == KBOX_SYSCALL_MODE_AUTO && args->verbose); + int exec_report_ok = 0; + int exec_has_stat_wrapper_fast_candidates = 0; + int exec_has_open_wrapper_fast_candidates = 0; + size_t exec_stat_wrapper_candidate_count = 0; + size_t exec_open_wrapper_candidate_count = 0; + size_t exec_stat_wrapper_direct_count = 0; + size_t exec_open_wrapper_direct_count = 0; + size_t exec_open_wrapper_cancel_count = 0; + size_t exec_phase1_path_candidate_count = 0; + unsigned char *elf_buf = NULL; + size_t elf_buf_len = 0; +#if KBOX_AUTO_ENABLE_USERSPACE_FAST_PATH + struct kbox_rewrite_report interp_report_outer; + int interp_report_ok = 0; +#endif + int interp_has_stat_wrapper_fast_candidates = 0; + int 
interp_has_open_wrapper_fast_candidates = 0; + size_t interp_stat_wrapper_candidate_count = 0; + size_t interp_open_wrapper_candidate_count = 0; + size_t interp_stat_wrapper_direct_count = 0; + size_t interp_open_wrapper_direct_count = 0; + size_t interp_open_wrapper_cancel_count = 0; + size_t interp_phase1_path_candidate_count = 0; + + if (rewrite_requested && + kbox_rewrite_analyze_memfd(exec_memfd, &exec_report) == 0) { + exec_report_ok = 1; + + if (args->verbose) { + fprintf(stderr, + "kbox: syscall rewrite analysis: %s: arch=%s " + "exec-segments=%zu candidates=%zu\n", + command, kbox_rewrite_arch_name(exec_report.arch), + exec_report.exec_segment_count, + exec_report.candidate_count); + if (kbox_rewrite_probe_trampoline(exec_report.arch, + &exec_probe) == 0) { + fprintf(stderr, + "kbox: rewrite trampoline probe: %s: " + "feasible=%s reason=%s\n", + kbox_rewrite_arch_name(exec_probe.arch), + exec_probe.feasible ? "yes" : "no", + exec_probe.reason ? exec_probe.reason : "?"); + } + } + } + if (scan_path_wrapper_candidates) { + exec_has_stat_wrapper_fast_candidates = + memfd_has_stat_wrapper_fast_candidates(exec_memfd, + host_nrs); + exec_has_open_wrapper_fast_candidates = + memfd_has_open_wrapper_fast_candidates(exec_memfd, + host_nrs); + exec_stat_wrapper_candidate_count = + memfd_count_wrapper_family_candidates( + exec_memfd, host_nrs, KBOX_REWRITE_WRAPPER_FAMILY_STAT); + exec_stat_wrapper_direct_count = + memfd_count_wrapper_family_candidates_by_kind( + exec_memfd, host_nrs, KBOX_REWRITE_WRAPPER_FAMILY_STAT, + KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT); + exec_open_wrapper_candidate_count = + memfd_count_wrapper_family_candidates( + exec_memfd, host_nrs, KBOX_REWRITE_WRAPPER_FAMILY_OPEN); + exec_open_wrapper_direct_count = + memfd_count_wrapper_family_candidates_by_kind( + exec_memfd, host_nrs, KBOX_REWRITE_WRAPPER_FAMILY_OPEN, + KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT); + exec_open_wrapper_cancel_count = + memfd_count_wrapper_family_candidates_by_kind( + 
exec_memfd, host_nrs, KBOX_REWRITE_WRAPPER_FAMILY_OPEN, + KBOX_REWRITE_WRAPPER_CANDIDATE_SYSCALL_CANCEL); + exec_phase1_path_candidate_count = + memfd_count_phase1_path_candidates(exec_memfd, host_nrs); + maybe_log_wrapper_family_candidates( + command, exec_memfd, host_nrs, + KBOX_REWRITE_WRAPPER_FAMILY_STAT, "stat", args->verbose); + maybe_log_wrapper_family_candidates( + command, exec_memfd, host_nrs, + KBOX_REWRITE_WRAPPER_FAMILY_OPEN, "open", args->verbose); + maybe_log_phase1_path_candidates(command, exec_memfd, host_nrs, + args->verbose); + } - if (nr > 0) { + if (kbox_read_elf_header_window_fd(exec_memfd, &elf_buf, + &elf_buf_len) == 0) { char interp_path[256]; uint64_t pt_offset, pt_filesz; - int ilen = kbox_find_elf_interp_loc( - elf_buf, (size_t) nr, interp_path, sizeof(interp_path), + int ilen; + ilen = kbox_find_elf_interp_loc( + elf_buf, elf_buf_len, interp_path, sizeof(interp_path), &pt_offset, &pt_filesz); + munmap(elf_buf, elf_buf_len); + + if (ilen < 0) { + fprintf(stderr, + "kbox: malformed ELF: cannot parse " + "program headers for %s\n", + command); + close(exec_memfd); + goto err_net; + } if (ilen > 0) { + struct kbox_rewrite_report interp_report; /* Dynamic binary: extract the interpreter from LKL. 
*/ long interp_lkl_fd = kbox_lkl_openat( sysnrs, AT_FDCWD_LINUX, interp_path, O_RDONLY, 0); @@ -376,10 +1144,336 @@ int kbox_run_image(const struct kbox_image_args *args) "interpreter %s -> /proc/self/fd/%d\n", command, interp_path, interp_memfd); } + + if (rewrite_requested && + kbox_rewrite_analyze_memfd(interp_memfd, + &interp_report) == 0) { +#if KBOX_AUTO_ENABLE_USERSPACE_FAST_PATH + interp_report_outer = interp_report; + interp_report_ok = 1; +#endif + interp_has_stat_wrapper_fast_candidates = + memfd_has_stat_wrapper_fast_candidates(interp_memfd, + host_nrs); + interp_has_open_wrapper_fast_candidates = + memfd_has_open_wrapper_fast_candidates(interp_memfd, + host_nrs); + if (args->verbose) { + fprintf(stderr, + "kbox: syscall rewrite analysis: %s: " + "arch=%s exec-segments=%zu " + "candidates=%zu\n", + interp_path, + kbox_rewrite_arch_name(interp_report.arch), + interp_report.exec_segment_count, + interp_report.candidate_count); + } + } + if (scan_path_wrapper_candidates) { + interp_has_stat_wrapper_fast_candidates = + memfd_has_stat_wrapper_fast_candidates(interp_memfd, + host_nrs); + interp_has_open_wrapper_fast_candidates = + memfd_has_open_wrapper_fast_candidates(interp_memfd, + host_nrs); + interp_stat_wrapper_candidate_count = + memfd_count_wrapper_family_candidates( + interp_memfd, host_nrs, + KBOX_REWRITE_WRAPPER_FAMILY_STAT); + interp_stat_wrapper_direct_count = + memfd_count_wrapper_family_candidates_by_kind( + interp_memfd, host_nrs, + KBOX_REWRITE_WRAPPER_FAMILY_STAT, + KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT); + interp_open_wrapper_candidate_count = + memfd_count_wrapper_family_candidates( + interp_memfd, host_nrs, + KBOX_REWRITE_WRAPPER_FAMILY_OPEN); + interp_open_wrapper_direct_count = + memfd_count_wrapper_family_candidates_by_kind( + interp_memfd, host_nrs, + KBOX_REWRITE_WRAPPER_FAMILY_OPEN, + KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT); + interp_open_wrapper_cancel_count = + memfd_count_wrapper_family_candidates_by_kind( + interp_memfd, 
host_nrs, + KBOX_REWRITE_WRAPPER_FAMILY_OPEN, + KBOX_REWRITE_WRAPPER_CANDIDATE_SYSCALL_CANCEL); + interp_phase1_path_candidate_count = + memfd_count_phase1_path_candidates(interp_memfd, + host_nrs); + maybe_log_wrapper_family_candidates( + interp_path, interp_memfd, host_nrs, + KBOX_REWRITE_WRAPPER_FAMILY_STAT, "stat", + args->verbose); + maybe_log_wrapper_family_candidates( + interp_path, interp_memfd, host_nrs, + KBOX_REWRITE_WRAPPER_FAMILY_OPEN, "open", + args->verbose); + maybe_log_phase1_path_candidates( + interp_path, interp_memfd, host_nrs, args->verbose); + } + } + } + + if (args->verbose && (exec_has_stat_wrapper_fast_candidates || + exec_has_open_wrapper_fast_candidates || + interp_has_stat_wrapper_fast_candidates || + interp_has_open_wrapper_fast_candidates)) { + fprintf(stderr, + "kbox: path-wrapper fast-path candidates: " + "exec(stat=%s/%zu direct=%zu " + "open=%s/%zu direct=%zu cancel=%zu) " + "interp(stat=%s/%zu direct=%zu " + "open=%s/%zu direct=%zu cancel=%zu)\n", + exec_has_stat_wrapper_fast_candidates ? "yes" : "no", + exec_stat_wrapper_candidate_count, + exec_stat_wrapper_direct_count, + exec_has_open_wrapper_fast_candidates ? "yes" : "no", + exec_open_wrapper_candidate_count, + exec_open_wrapper_direct_count, + exec_open_wrapper_cancel_count, + interp_has_stat_wrapper_fast_candidates ? "yes" : "no", + interp_stat_wrapper_candidate_count, + interp_stat_wrapper_direct_count, + interp_has_open_wrapper_fast_candidates ? "yes" : "no", + interp_open_wrapper_candidate_count, + interp_open_wrapper_direct_count, + interp_open_wrapper_cancel_count); + fprintf(stderr, + "kbox: phase1 direct path-wrapper targets: " + "exec=%zu interp=%zu\n", + exec_phase1_path_candidate_count, + interp_phase1_path_candidate_count); + } + + /* Trap fast path: 3x faster than seccomp (3.5us vs 10.8us + * per syscall with FSGSBASE). + * + * Limitation: fork in trap mode duplicates the in-process + * LKL state, so child processes see their own filesystem + * copy. 
Shell scripts that fork+exec (e.g., sh -c 'mkdir + * /tmp/x && ls /tmp/x') lose cross-process filesystem + * coherence. AUTO uses trap only for direct commands + * (no shell wrapper). Shell invocations fall back to + * seccomp where the supervisor is a separate process and + * all children share one LKL instance. + */ + { + int use_trap = (args->syscall_mode == KBOX_SYSCALL_MODE_TRAP); + if (args->syscall_mode == KBOX_SYSCALL_MODE_AUTO) { + if (!is_shell_command(command)) { +#if !KBOX_AUTO_ENABLE_USERSPACE_FAST_PATH + use_trap = 0; +#else + int fork_sites = 0; + + /* Scan the main binary for fork/clone wrappers. + * Do NOT scan the interpreter: libc always + * contains fork wrappers regardless of whether + * the specific program uses them. Scanning it + * would reject every dynamic binary. + */ + if (exec_report_ok) { + const struct kbox_host_nrs *hnrs = + select_host_nrs(); + if (hnrs) + fork_sites = kbox_rewrite_has_fork_sites_memfd( + exec_memfd, hnrs) > 0; + } + use_trap = auto_prefers_userspace_fast_path( + exec_report_ok ? &exec_report : NULL, + interp_report_ok ? 
&interp_report_outer : NULL, + fork_sites); +#if defined(__aarch64__) + if (use_trap && + (exec_open_wrapper_cancel_count > 0 || + interp_open_wrapper_cancel_count > 0)) { + use_trap = 0; + if (args->verbose) { + fprintf(stderr, + "kbox: --syscall-mode=auto: " + "keeping seccomp because " + "cancel-style open wrappers are " + "still present\n"); + } + } +#endif +#endif + if (args->verbose && !use_trap) { + fprintf(stderr, + "kbox: --syscall-mode=auto: preferring " + "seccomp for this executable\n"); + } + } } + if (!use_trap) + goto skip_trap; + } + { + int prep_ok; + + maybe_apply_virtual_procinfo_fast_path(exec_memfd, command, + args->verbose); + maybe_apply_virtual_procinfo_fast_path( + interp_memfd, "PT_INTERP", args->verbose); + + prep_ok = prepare_userspace_launch(args, command, exec_memfd, + interp_memfd, override_uid, + override_gid, &launch) == 0; + if (!prep_ok) { + if (args->syscall_mode == KBOX_SYSCALL_MODE_TRAP) { + fprintf(stderr, + "kbox: --syscall-mode=trap launch preparation " + "failed.\n"); + kbox_loader_launch_reset(&launch); + if (interp_memfd >= 0) + close(interp_memfd); + close(exec_memfd); + goto err_net; + } + + if (args->verbose) { + fprintf( + stderr, + "kbox: --syscall-mode=auto: trap launch " + "preparation failed, falling back to seccomp\n"); + } + } else { + if (args->syscall_mode == KBOX_SYSCALL_MODE_AUTO) { + if (exec_report_ok && + kbox_rewrite_probe_trampoline(exec_report.arch, + &exec_probe) == 0 && + exec_probe.feasible) { + if (args->verbose) { + fprintf(stderr, + "kbox: --syscall-mode=auto: trying " + "rewrite fast path\n"); + } + /* run_rewrite_launch is noreturn on success. + * If it returns, the install failed before any + * irreversible process state changes (the first + * fallible step is kbox_rewrite_runtime_install, + * which cleans up on failure). Fall through to + * seccomp -- skipping trap because the loader + * layout was prepared for rewrite mode and may + * not be reusable as-is for a plain trap launch. 
+ */ + (void) run_rewrite_launch(args, sysnrs, &launch, + web_ctx); + if (args->verbose) { + fprintf(stderr, + "kbox: --syscall-mode=auto: rewrite " + "failed, falling back to seccomp\n"); + } + kbox_loader_launch_reset(&launch); + goto skip_trap; + } + if (args->verbose) { + fprintf(stderr, + "kbox: --syscall-mode=auto: selecting trap " + "fast path\n"); + } + } + + /* run_trap_launch is noreturn on success. Only returns + * on failure. For explicit --syscall-mode=trap, this is + * a hard error. For AUTO, we already handled rewrite + * fallback above; trap failure here also falls through + * to seccomp. + */ + (void) run_trap_launch(args, sysnrs, &launch, web_ctx); + + if (args->syscall_mode != KBOX_SYSCALL_MODE_AUTO) { + if (interp_memfd >= 0) + close(interp_memfd); + close(exec_memfd); + kbox_loader_launch_reset(&launch); + goto err_net; + } + if (args->verbose) { + fprintf(stderr, + "kbox: --syscall-mode=auto: trap failed, " + "falling back to seccomp\n"); + } + kbox_loader_launch_reset(&launch); + } + } + + skip_trap: + /* AUTO reaching here means the trap fast path was skipped + * (shell command) and seccomp will be used. Verify the + * supervisor features are available before proceeding. + */ + if (args->syscall_mode == KBOX_SYSCALL_MODE_AUTO && + probe_mode != KBOX_SYSCALL_MODE_SECCOMP && + kbox_probe_host_features(KBOX_SYSCALL_MODE_SECCOMP) < 0) { + close(exec_memfd); + if (interp_memfd >= 0) + close(interp_memfd); + goto err_net; + } + if (args->syscall_mode == KBOX_SYSCALL_MODE_TRAP) { + /* Unreachable: trap is handled above. 
*/ + close(exec_memfd); + if (interp_memfd >= 0) + close(interp_memfd); + goto err_net; + } + + if (args->syscall_mode == KBOX_SYSCALL_MODE_REWRITE) { + maybe_apply_virtual_procinfo_fast_path(exec_memfd, command, + args->verbose); + maybe_apply_virtual_procinfo_fast_path( + interp_memfd, "PT_INTERP", args->verbose); + int prep_ok = prepare_userspace_launch( + args, command, exec_memfd, interp_memfd, + override_uid, override_gid, &launch) == 0; + + if (!prep_ok) { + fprintf(stderr, + "kbox: --syscall-mode=rewrite launch preparation " + "failed.\n"); + } else { + if (!exec_report_ok) { + fprintf(stderr, + "kbox: --syscall-mode=rewrite executable " + "analysis failed.\n"); + } else if (kbox_rewrite_probe_trampoline( + exec_report.arch, &exec_probe) == 0 && + !exec_probe.feasible) { + fprintf(stderr, + "kbox: --syscall-mode=rewrite trampoline probe " + "failed for %s: %s\n", + kbox_rewrite_arch_name(exec_probe.arch), + exec_probe.reason ? exec_probe.reason : "?"); + } else { + if (interp_memfd >= 0) + close(interp_memfd); + close(exec_memfd); + rc = run_rewrite_launch(args, sysnrs, &launch, web_ctx); + kbox_loader_launch_reset(&launch); + goto err_net; + } + } + kbox_loader_launch_reset(&launch); + if (interp_memfd >= 0) + close(interp_memfd); + close(exec_memfd); + goto err_net; + } + + if (args->syscall_mode == KBOX_SYSCALL_MODE_AUTO && args->verbose) { + fprintf(stderr, + "kbox: --syscall-mode=auto: falling back to seccomp\n"); } } + maybe_apply_virtual_procinfo_fast_path(exec_memfd, command, + args->verbose); + maybe_apply_virtual_procinfo_fast_path(interp_memfd, "PT_INTERP", + args->verbose); + /* Fork, seccomp, exec, supervise. 
*/ rc = kbox_run_supervisor( sysnrs, command, args->extra_args, args->extra_argc, NULL, diff --git a/src/io-util.h b/src/io-util.h new file mode 100644 index 0000000..11c553f --- /dev/null +++ b/src/io-util.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef KBOX_IO_UTIL_H +#define KBOX_IO_UTIL_H + +#include <errno.h> +#include <sys/types.h> +#include <unistd.h> + +static inline ssize_t pread_full(int fd, + unsigned char *buf, + size_t size, + off_t off) +{ + size_t total = 0; + + while (total < size) { + ssize_t nr = pread(fd, buf + total, size - total, off + (off_t) total); + + if (nr < 0) { + if (errno == EINTR) + continue; + return -1; + } + if (nr == 0) + break; + total += (size_t) nr; + } + + return (ssize_t) total; +} + +#endif /* KBOX_IO_UTIL_H */ diff --git a/src/lkl-wrap.h b/src/lkl-wrap.h index 9903b06..7241d50 100644 --- a/src/lkl-wrap.h +++ b/src/lkl-wrap.h @@ -8,14 +8,16 @@ #include "syscall-nr.h" +struct lkl_dev_blk_ops; + struct lkl_disk { void *dev; int fd; - void *ops; + struct lkl_dev_blk_ops *ops; }; extern unsigned char lkl_host_ops; -extern void lkl_dev_blk_ops; +extern struct lkl_dev_blk_ops lkl_dev_blk_ops; int lkl_init(void *ops); int lkl_start_kernel(const char *fmt, ...); diff --git a/src/loader-entry.c b/src/loader-entry.c new file mode 100644 index 0000000..faf77f1 --- /dev/null +++ b/src/loader-entry.c @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: MIT */ + +#include <string.h> + +#include "loader-entry.h" + +static int machine_to_entry_arch(uint16_t machine, + enum kbox_loader_entry_arch *arch_out) +{ + if (!arch_out) + return -1; + + switch (machine) { + case 0x3e: + *arch_out = KBOX_LOADER_ENTRY_ARCH_X86_64; + return 0; + case 0xb7: + *arch_out = KBOX_LOADER_ENTRY_ARCH_AARCH64; + return 0; + default: + return -1; + } +} + +int kbox_loader_build_entry_state(const struct kbox_loader_layout *layout, + struct kbox_loader_entry_state *state) +{ + uint16_t machine; + + if (!layout || !state) + return -1; + + machine = layout->has_interp ? 
layout->interp_plan.machine + : layout->main_plan.machine; + memset(state, 0, sizeof(*state)); + if (machine_to_entry_arch(machine, &state->arch) < 0) + return -1; + state->pc = layout->initial_pc; + state->sp = layout->initial_sp; + return 0; +} diff --git a/src/loader-entry.h b/src/loader-entry.h new file mode 100644 index 0000000..6747d59 --- /dev/null +++ b/src/loader-entry.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef KBOX_LOADER_ENTRY_H +#define KBOX_LOADER_ENTRY_H + +#include <stdint.h> + +#include "loader-layout.h" + +enum kbox_loader_entry_arch { + KBOX_LOADER_ENTRY_ARCH_X86_64, + KBOX_LOADER_ENTRY_ARCH_AARCH64, +}; + +struct kbox_loader_entry_state { + enum kbox_loader_entry_arch arch; + uint64_t pc; + uint64_t sp; + uint64_t regs[6]; +}; + +int kbox_loader_build_entry_state(const struct kbox_loader_layout *layout, + struct kbox_loader_entry_state *state); + +#endif /* KBOX_LOADER_ENTRY_H */ diff --git a/src/loader-handoff.c b/src/loader-handoff.c new file mode 100644 index 0000000..5169a74 --- /dev/null +++ b/src/loader-handoff.c @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: MIT */ + +#include <string.h> + +#include "loader-handoff.h" + +static int region_matches_mapping(const struct kbox_loader_image *image, + const struct kbox_loader_mapping *mapping, + size_t index) +{ + const struct kbox_loader_image_region *region; + + if (!image || !mapping || index >= image->region_count) + return 0; + region = &image->regions[index]; + return region->addr == (void *) (uintptr_t) mapping->addr && + region->size == (size_t) mapping->size; +} + +static int find_entry_mapping(const struct kbox_loader_layout *layout, + uint64_t pc, + size_t *index_out) +{ + for (size_t i = 0; i < layout->mapping_count; i++) { + const struct kbox_loader_mapping *mapping = &layout->mappings[i]; + uint64_t end; + + if ((mapping->prot & PROT_EXEC) == 0 || mapping->size == 0) + continue; + if (__builtin_add_overflow(mapping->addr, mapping->size, &end)) + return -1; + if (pc >= mapping->addr 
&& pc < end) { + if (index_out) + *index_out = i; + return 0; + } + } + return -1; +} + +static int find_stack_mapping(const struct kbox_loader_layout *layout, + uint64_t sp, + size_t *index_out) +{ + for (size_t i = 0; i < layout->mapping_count; i++) { + const struct kbox_loader_mapping *mapping = &layout->mappings[i]; + uint64_t end; + + if (mapping->source != KBOX_LOADER_MAPPING_STACK || mapping->size == 0) + continue; + if (__builtin_add_overflow(mapping->addr, mapping->size, &end)) + return -1; + if (sp >= mapping->addr && sp < end) { + if (index_out) + *index_out = i; + return 0; + } + } + return -1; +} + +int kbox_loader_build_handoff(const struct kbox_loader_layout *layout, + const struct kbox_loader_image *image, + struct kbox_loader_handoff *handoff) +{ + struct kbox_loader_entry_state entry; + size_t entry_index; + size_t stack_index; + uint64_t entry_map_end; + uint64_t stack_map_end; + + if (!layout || !image || !handoff) + return -1; + if (layout->mapping_count == 0 || + image->region_count < layout->mapping_count) + return -1; + if (kbox_loader_build_entry_state(layout, &entry) < 0) + return -1; + if ((entry.sp & 0xfu) != 0) + return -1; + if (find_entry_mapping(layout, entry.pc, &entry_index) < 0) + return -1; + if (find_stack_mapping(layout, entry.sp, &stack_index) < 0) + return -1; + if (!region_matches_mapping(image, &layout->mappings[entry_index], + entry_index) || + !region_matches_mapping(image, &layout->mappings[stack_index], + stack_index)) { + return -1; + } + if (__builtin_add_overflow(layout->mappings[entry_index].addr, + layout->mappings[entry_index].size, + &entry_map_end) || + __builtin_add_overflow(layout->mappings[stack_index].addr, + layout->mappings[stack_index].size, + &stack_map_end)) { + return -1; + } + + memset(handoff, 0, sizeof(*handoff)); + handoff->entry = entry; + handoff->entry_mapping_index = entry_index; + handoff->stack_mapping_index = stack_index; + handoff->entry_map_start = layout->mappings[entry_index].addr; + 
handoff->entry_map_end = entry_map_end; + handoff->stack_map_start = layout->mappings[stack_index].addr; + handoff->stack_map_end = stack_map_end; + return 0; +} diff --git a/src/loader-handoff.h b/src/loader-handoff.h new file mode 100644 index 0000000..f31e4f8 --- /dev/null +++ b/src/loader-handoff.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef KBOX_LOADER_HANDOFF_H +#define KBOX_LOADER_HANDOFF_H + +#include +#include + +#include "loader-entry.h" +#include "loader-image.h" + +struct kbox_loader_handoff { + struct kbox_loader_entry_state entry; + uint64_t entry_map_start; + uint64_t entry_map_end; + uint64_t stack_map_start; + uint64_t stack_map_end; + size_t entry_mapping_index; + size_t stack_mapping_index; +}; + +int kbox_loader_build_handoff(const struct kbox_loader_layout *layout, + const struct kbox_loader_image *image, + struct kbox_loader_handoff *handoff); + +#endif /* KBOX_LOADER_HANDOFF_H */ diff --git a/src/loader-image.c b/src/loader-image.c new file mode 100644 index 0000000..157115e --- /dev/null +++ b/src/loader-image.c @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: MIT */ + +#include +#include +#include +#include + +#include "loader-image.h" + +static int apply_final_prot(void *addr, size_t size, int prot) +{ + if (size == 0) + return 0; + return mprotect(addr, size, prot) == 0 ? 
0 : -errno; +} + +/* mmap an anonymous RW region at exactly addr; MAP_FIXED_NOREPLACE is preferred so an existing mapping is never silently clobbered (falls back to MAP_FIXED on old headers, with the placement re-checked below). Returns 0, -EINVAL, -EEXIST, or -errno. */ +static int map_region_exact(uint64_t addr, uint64_t size, int flags, void **out) +{ + int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS; + void *mapped; + + if (!out || size == 0) + return -EINVAL; + if ((flags & MAP_STACK) != 0) + mmap_flags |= MAP_STACK; +#ifdef MAP_FIXED_NOREPLACE + mmap_flags |= MAP_FIXED_NOREPLACE; +#else + mmap_flags |= MAP_FIXED; +#endif + + mapped = mmap((void *) (uintptr_t) addr, (size_t) size, + PROT_READ | PROT_WRITE, mmap_flags, -1, 0); + if (mapped == MAP_FAILED) + return -errno; + if ((uintptr_t) mapped != (uintptr_t) addr) { + munmap(mapped, (size_t) size); + return -EEXIST; + } + *out = mapped; + return 0; +} + +/* Tear down every recorded region, newest first, and drop it from the table. */ +static void unmap_recorded_regions(struct kbox_loader_image *image) +{ + while (image && image->region_count > 0) { + struct kbox_loader_image_region *region = + &image->regions[image->region_count - 1]; + + if (region->addr && region->size > 0) + munmap(region->addr, region->size); + image->region_count--; + } +} + +/* Append one mapped region to the image bookkeeping table; fails when the table is full so cleanup can unmap exactly what was recorded. */ +static int record_region(struct kbox_loader_image *image, + void *addr, + size_t size) +{ + if (!image || image->region_count >= KBOX_LOADER_MAX_MAPPINGS) + return -1; + image->regions[image->region_count++] = (struct kbox_loader_image_region) { + .addr = addr, + .size = size, + }; + return 0; +} + +/* Copy the mapping's file-backed bytes from src into the mapped region, then zero the BSS tail described by zero_fill_start/zero_fill_size. All offsets and lengths are bounds-checked against src_len and mapped_size before any write. */ +static int copy_mapping_bytes(void *mapped, + size_t mapped_size, + const unsigned char *src, + size_t src_len, + const struct kbox_loader_mapping *mapping) +{ + if (!mapped || !mapping) + return -EINVAL; + if (mapping->file_size == 0) + return 0; + if (!src) + return -EINVAL; + if (mapping->file_offset > src_len || mapping->file_size > src_len || + mapping->file_offset + mapping->file_size > src_len) + return -EINVAL; + if (mapping->file_size > mapped_size) + return -EINVAL; + + memcpy(mapped, src + mapping->file_offset, (size_t) mapping->file_size); + if (mapping->zero_fill_size > 0) { + uintptr_t start = (uintptr_t) mapping->zero_fill_start; + uintptr_t base = (uintptr_t) mapping->addr; + + if (start
< base || start - base > mapped_size || + mapping->zero_fill_size > mapped_size - (start - base)) { + return -EINVAL; + } + memset((unsigned char *) mapped + (start - base), 0, + (size_t) mapping->zero_fill_size); + } + return 0; +} + +void kbox_loader_image_reset(struct kbox_loader_image *image) +{ + if (!image) + return; + unmap_recorded_regions(image); + memset(image, 0, sizeof(*image)); +} + +int kbox_loader_materialize_image(const struct kbox_loader_image_spec *spec, + struct kbox_loader_image *image) +{ + const struct kbox_loader_layout *layout; + + if (!spec || !image || !spec->layout || !spec->main_elf) + return -EINVAL; + + kbox_loader_image_reset(image); + layout = spec->layout; + int rc = -1; + + for (size_t i = 0; i < layout->mapping_count; i++) { + const struct kbox_loader_mapping *mapping = &layout->mappings[i]; + const unsigned char *src = NULL; + size_t src_len = 0; + void *mapped = NULL; + + if (mapping->size == 0) + continue; + + switch (mapping->source) { + case KBOX_LOADER_MAPPING_MAIN: + src = spec->main_elf; + src_len = spec->main_elf_len; + break; + case KBOX_LOADER_MAPPING_INTERP: + src = spec->interp_elf; + src_len = spec->interp_elf_len; + break; + case KBOX_LOADER_MAPPING_STACK: + src = layout->stack.data; + src_len = layout->stack.size; + break; + } + + rc = map_region_exact(mapping->addr, mapping->size, mapping->flags, + &mapped); + if (rc < 0) + goto fail; + if (record_region(image, mapped, (size_t) mapping->size) < 0) { + munmap(mapped, mapping->size); + rc = -ENOMEM; + goto fail; + } + + if (mapping->source == KBOX_LOADER_MAPPING_STACK) { + uintptr_t start = (uintptr_t) layout->initial_sp; + uintptr_t base = (uintptr_t) mapping->addr; + + if (!src || start < base || layout->stack.size > mapping->size || + start - base > mapping->size - layout->stack.size) { + rc = -EINVAL; + goto fail; + } + memcpy((unsigned char *) mapped + (start - base), src, + layout->stack.size); + } else { + rc = copy_mapping_bytes(mapped, (size_t) 
mapping->size, src, + src_len, mapping); + if (rc < 0) + goto fail; + } + + if (mapping->prot & PROT_EXEC) + __builtin___clear_cache((char *) mapped, + (char *) mapped + mapping->size); + rc = apply_final_prot(mapped, (size_t) mapping->size, mapping->prot); + if (rc < 0) + goto fail; + } + + return 0; + +fail: + kbox_loader_image_reset(image); + return rc; +} diff --git a/src/loader-image.h b/src/loader-image.h new file mode 100644 index 0000000..82277ff --- /dev/null +++ b/src/loader-image.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef KBOX_LOADER_IMAGE_H +#define KBOX_LOADER_IMAGE_H + +#include +#include + +#include "loader-layout.h" + +struct kbox_loader_image_region { + void *addr; + size_t size; +}; + +struct kbox_loader_image { + struct kbox_loader_image_region regions[KBOX_LOADER_MAX_MAPPINGS]; + size_t region_count; +}; + +struct kbox_loader_image_spec { + const struct kbox_loader_layout *layout; + const unsigned char *main_elf; + size_t main_elf_len; + const unsigned char *interp_elf; + size_t interp_elf_len; +}; + +void kbox_loader_image_reset(struct kbox_loader_image *image); +int kbox_loader_materialize_image(const struct kbox_loader_image_spec *spec, + struct kbox_loader_image *image); + +#endif /* KBOX_LOADER_IMAGE_H */ diff --git a/src/loader-launch.c b/src/loader-launch.c new file mode 100644 index 0000000..d6aa25d --- /dev/null +++ b/src/loader-launch.c @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: MIT */ + +#include +#include +#include +#include +#include +#include + +#include "loader-launch.h" + +static int read_fd_all(int fd, unsigned char **buf_out, size_t *len_out) +{ + struct stat st; + unsigned char *buf = NULL; + size_t len; + size_t total = 0; + + if (fd < 0 || !buf_out || !len_out) + return -1; + if (fstat(fd, &st) < 0 || st.st_size < 0) + return -1; + len = (size_t) st.st_size; + if (len == 0) + return -1; + /* Use mmap(MAP_ANONYMOUS) instead of malloc. 
In trap mode, + * read_fd_all may be called from a signal handler where glibc's + * malloc is not safe (the guest process may hold malloc locks). + * mmap is async-signal-safe and avoids the shared heap. + */ + buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, + -1, 0); + if (buf == MAP_FAILED) + return -1; + + while (total < len) { + ssize_t nr = pread(fd, buf + total, len - total, (off_t) total); + + if (nr < 0) { + if (errno == EINTR) + continue; + munmap(buf, len); + return -1; + } + if (nr == 0) { + munmap(buf, len); + return -1; + } + total += (size_t) nr; + } + + *buf_out = buf; + *len_out = len; + return 0; +} + +void kbox_loader_launch_reset(struct kbox_loader_launch *launch) +{ + if (!launch) + return; + kbox_loader_image_reset(&launch->image); + kbox_loader_layout_reset(&launch->layout); + if (launch->interp_elf && launch->interp_elf_len > 0) + munmap(launch->interp_elf, launch->interp_elf_len); + if (launch->main_elf && launch->main_elf_len > 0) + munmap(launch->main_elf, launch->main_elf_len); + memset(launch, 0, sizeof(*launch)); +} + +int kbox_loader_prepare_launch(const struct kbox_loader_launch_spec *spec, + struct kbox_loader_launch *launch) +{ + struct kbox_loader_layout_spec layout_spec; + struct kbox_loader_image_spec image_spec; + + if (!spec || !launch || spec->exec_fd < 0 || !spec->argv || spec->argc == 0) + return -1; + + kbox_loader_launch_reset(launch); + + if (read_fd_all(spec->exec_fd, &launch->main_elf, &launch->main_elf_len) < + 0) + goto fail; + if (spec->interp_fd >= 0 && + read_fd_all(spec->interp_fd, &launch->interp_elf, + &launch->interp_elf_len) < 0) + goto fail; + + memset(&layout_spec, 0, sizeof(layout_spec)); + layout_spec.main_elf = launch->main_elf; + layout_spec.main_elf_len = launch->main_elf_len; + layout_spec.interp_elf = launch->interp_elf; + layout_spec.interp_elf_len = launch->interp_elf_len; + layout_spec.argv = spec->argv; + layout_spec.argc = spec->argc; + layout_spec.envp = spec->envp; + 
layout_spec.envc = spec->envc; + layout_spec.execfn = spec->execfn; + layout_spec.random_bytes = spec->random_bytes; + layout_spec.extra_auxv = spec->extra_auxv; + layout_spec.extra_auxv_count = spec->extra_auxv_count; + layout_spec.page_size = spec->page_size; + layout_spec.stack_top = spec->stack_top; + layout_spec.stack_size = spec->stack_size; + layout_spec.main_load_bias = spec->main_load_bias; + layout_spec.interp_load_bias = spec->interp_load_bias; + layout_spec.uid = spec->uid; + layout_spec.euid = spec->euid; + layout_spec.gid = spec->gid; + layout_spec.egid = spec->egid; + layout_spec.secure = spec->secure; + + if (kbox_loader_build_layout(&layout_spec, &launch->layout) < 0) + goto fail; + + memset(&image_spec, 0, sizeof(image_spec)); + image_spec.layout = &launch->layout; + image_spec.main_elf = launch->main_elf; + image_spec.main_elf_len = launch->main_elf_len; + image_spec.interp_elf = launch->interp_elf; + image_spec.interp_elf_len = launch->interp_elf_len; + + if (kbox_loader_materialize_image(&image_spec, &launch->image) < 0) + goto fail; + if (kbox_loader_build_handoff(&launch->layout, &launch->image, + &launch->handoff) < 0) { + goto fail; + } + if (kbox_loader_prepare_transfer(&launch->handoff, &launch->transfer) < 0) + goto fail; + return 0; + +fail: + kbox_loader_launch_reset(launch); + return -1; +} + +int kbox_loader_collect_exec_ranges(const struct kbox_loader_launch *launch, + struct kbox_loader_exec_range *ranges, + size_t range_cap, + size_t *range_count) +{ + size_t count = 0; + + if (!launch || !ranges || !range_count) + return -1; + + for (size_t i = 0; i < launch->layout.mapping_count; i++) { + const struct kbox_loader_mapping *mapping = &launch->layout.mappings[i]; + uint64_t end; + + if (mapping->size == 0 || (mapping->prot & PROT_EXEC) == 0) + continue; + if (mapping->source != KBOX_LOADER_MAPPING_MAIN && + mapping->source != KBOX_LOADER_MAPPING_INTERP) { + continue; + } + if (__builtin_add_overflow(mapping->addr, mapping->size, 
&end)) + return -1; + if (count >= range_cap) + return -1; + + ranges[count].start = mapping->addr; + ranges[count].end = end; + count++; + } + + *range_count = count; + return count > 0 ? 0 : -1; +} diff --git a/src/loader-launch.h b/src/loader-launch.h new file mode 100644 index 0000000..5c4068f --- /dev/null +++ b/src/loader-launch.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef KBOX_LOADER_LAUNCH_H +#define KBOX_LOADER_LAUNCH_H + +#include +#include + +#include "loader-handoff.h" +#include "loader-image.h" +#include "loader-layout.h" +#include "loader-transfer.h" + +struct kbox_loader_exec_range { + uint64_t start; + uint64_t end; +}; + +struct kbox_loader_launch { + unsigned char *main_elf; + size_t main_elf_len; + unsigned char *interp_elf; + size_t interp_elf_len; + struct kbox_loader_layout layout; + struct kbox_loader_image image; + struct kbox_loader_handoff handoff; + struct kbox_loader_transfer_state transfer; +}; + +struct kbox_loader_launch_spec { + int exec_fd; + int interp_fd; + const char *const *argv; + size_t argc; + const char *const *envp; + size_t envc; + const char *execfn; + const unsigned char *random_bytes; + const struct kbox_loader_auxv_entry *extra_auxv; + size_t extra_auxv_count; + uint64_t page_size; + uint64_t stack_top; + uint64_t stack_size; + uint64_t main_load_bias; + uint64_t interp_load_bias; + uint32_t uid; + uint32_t euid; + uint32_t gid; + uint32_t egid; + int secure; +}; + +void kbox_loader_launch_reset(struct kbox_loader_launch *launch); +int kbox_loader_prepare_launch(const struct kbox_loader_launch_spec *spec, + struct kbox_loader_launch *launch); +int kbox_loader_collect_exec_ranges(const struct kbox_loader_launch *launch, + struct kbox_loader_exec_range *ranges, + size_t range_cap, + size_t *range_count); + +#endif /* KBOX_LOADER_LAUNCH_H */ diff --git a/src/loader-layout.c b/src/loader-layout.c new file mode 100644 index 0000000..9fe10a6 --- /dev/null +++ b/src/loader-layout.c @@ -0,0 +1,274 @@ +/* 
SPDX-License-Identifier: MIT */ + +#include + +#include "loader-layout.h" + +/* Round value up to the next multiple of align; align must be a non-zero power of two. Fails (-1) on bad align or on overflow of value + (align - 1). */ +static int align_up_u64(uint64_t value, uint64_t align, uint64_t *out) +{ + uint64_t addend; + + if (align == 0 || (align & (align - 1)) != 0) + return -1; + addend = align - 1; + if (__builtin_add_overflow(value, addend, out)) + return -1; + *out &= ~addend; + return 0; +} + +/* Translate ELF segment p_flags bits (PF_R=0x4, PF_W=0x2, PF_X=0x1) into the corresponding mmap PROT_* bits. */ +static int segment_prot(uint32_t flags) +{ + int prot = 0; + + if ((flags & 0x4u) != 0) + prot |= PROT_READ; + if ((flags & 0x2u) != 0) + prot |= PROT_WRITE; + if ((flags & 0x1u) != 0) + prot |= PROT_EXEC; + return prot; +} + +/* Stack protection: always RW, plus EXEC if either the main binary's or the interpreter's stack_flags carry the execute bit (0x1). */ +static int loader_stack_prot(const struct kbox_loader_layout *layout) +{ + uint32_t stack_flags = layout->main_plan.stack_flags; + + if (layout->has_interp) + stack_flags |= layout->interp_plan.stack_flags; + return PROT_READ | PROT_WRITE | + (((stack_flags & 0x1u) != 0) ? PROT_EXEC : 0); +} + +/* PIE plans honor the requested load bias; non-PIE plans get bias 0 (their segments already carry absolute addresses). */ +static uint64_t effective_load_bias(const struct kbox_elf_load_plan *plan, + uint64_t requested_bias) +{ + if (!plan) + return 0; + return plan->pie ?
requested_bias : 0; +} + +static int append_plan_mappings(struct kbox_loader_layout *layout, + const struct kbox_elf_load_plan *plan, + uint64_t page_size, + uint64_t load_bias, + enum kbox_loader_mapping_source source) +{ + for (size_t i = 0; i < plan->segment_count; i++) { + const struct kbox_elf_load_segment *seg = &plan->segments[i]; + uint64_t seg_end; + uint64_t file_end; + uint64_t file_map_end; + uint64_t bss_map_end; + uint64_t zero_fill_size = 0; + struct kbox_loader_mapping *mapping; + + if (__builtin_add_overflow(seg->vaddr, seg->mem_size, &seg_end)) + return -1; + if (align_up_u64(seg_end, seg->map_align, &bss_map_end) < 0) + return -1; + + if (seg->file_size == 0) { + uint64_t biased_addr; + + if (__builtin_add_overflow(load_bias, seg->map_start, &biased_addr)) + return -1; + if (layout->mapping_count >= KBOX_LOADER_MAX_MAPPINGS) + return -1; + mapping = &layout->mappings[layout->mapping_count++]; + *mapping = (struct kbox_loader_mapping) { + .addr = biased_addr, + .size = seg->map_size, + .file_offset = 0, + .file_size = 0, + .zero_fill_start = 0, + .zero_fill_size = 0, + .prot = segment_prot(seg->flags), + .flags = MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, + .source = source, + }; + continue; + } + + if (__builtin_add_overflow(seg->vaddr, seg->file_size, &file_end)) + return -1; + if (align_up_u64(file_end, seg->map_align, &file_map_end) < 0) + return -1; + if (file_map_end < seg->map_start || bss_map_end < file_map_end) + return -1; + if (seg->mem_size > seg->file_size && file_map_end > file_end) + zero_fill_size = file_map_end - file_end; + + { + uint64_t biased_addr; + uint64_t biased_zf; + uint64_t biased_bss; + + if (__builtin_add_overflow(load_bias, seg->map_start, &biased_addr)) + return -1; + if (zero_fill_size && + __builtin_add_overflow(load_bias, file_end, &biased_zf)) + return -1; + if (layout->mapping_count >= KBOX_LOADER_MAX_MAPPINGS) + return -1; + mapping = &layout->mappings[layout->mapping_count++]; + *mapping = (struct 
kbox_loader_mapping) { + .addr = biased_addr, + .size = file_map_end - seg->map_start, + .file_offset = seg->map_offset, + .file_size = + seg->file_size + (seg->file_offset - seg->map_offset), + .zero_fill_start = zero_fill_size ? biased_zf : 0, + .zero_fill_size = zero_fill_size, + .prot = segment_prot(seg->flags), + .flags = MAP_PRIVATE | MAP_FIXED, + .source = source, + }; + + if (bss_map_end > file_map_end) { + if (__builtin_add_overflow(load_bias, file_map_end, + &biased_bss)) + return -1; + if (layout->mapping_count >= KBOX_LOADER_MAX_MAPPINGS) + return -1; + mapping = &layout->mappings[layout->mapping_count++]; + *mapping = (struct kbox_loader_mapping) { + .addr = biased_bss, + .size = bss_map_end - file_map_end, + .file_offset = 0, + .file_size = 0, + .zero_fill_start = 0, + .zero_fill_size = 0, + .prot = segment_prot(seg->flags), + .flags = MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, + .source = source, + }; + } + } + } + + return 0; +} + +void kbox_loader_layout_reset(struct kbox_loader_layout *layout) +{ + if (!layout) + return; + kbox_loader_stack_image_reset(&layout->stack); + memset(layout, 0, sizeof(*layout)); +} + +int kbox_loader_build_layout(const struct kbox_loader_layout_spec *spec, + struct kbox_loader_layout *layout) +{ + struct kbox_loader_stack_spec stack_spec; + uint64_t main_load_bias; + uint64_t interp_load_bias = 0; + + if (!spec || !layout || !spec->main_elf || spec->main_elf_len == 0 || + !spec->argv || spec->argc == 0) + return -1; + + kbox_loader_layout_reset(layout); + + if (kbox_build_elf_load_plan(spec->main_elf, spec->main_elf_len, + spec->page_size, &layout->main_plan) < 0) + return -1; + + if (spec->interp_elf && spec->interp_elf_len > 0) { + if (kbox_build_elf_load_plan(spec->interp_elf, spec->interp_elf_len, + spec->page_size, + &layout->interp_plan) < 0) { + kbox_loader_layout_reset(layout); + return -1; + } + layout->has_interp = 1; + } + + main_load_bias = + effective_load_bias(&layout->main_plan, spec->main_load_bias); + if 
(layout->has_interp) { + interp_load_bias = + effective_load_bias(&layout->interp_plan, spec->interp_load_bias); + } + + memset(&stack_spec, 0, sizeof(stack_spec)); + stack_spec.argv = spec->argv; + stack_spec.argc = spec->argc; + stack_spec.envp = spec->envp; + stack_spec.envc = spec->envc; + stack_spec.execfn = spec->execfn; + stack_spec.random_bytes = spec->random_bytes; + stack_spec.extra_auxv = spec->extra_auxv; + stack_spec.extra_auxv_count = spec->extra_auxv_count; + stack_spec.main_plan = &layout->main_plan; + stack_spec.interp_plan = layout->has_interp ? &layout->interp_plan : NULL; + stack_spec.main_load_bias = main_load_bias; + stack_spec.interp_load_bias = interp_load_bias; + stack_spec.page_size = spec->page_size; + stack_spec.stack_top = spec->stack_top; + stack_spec.stack_size = + spec->stack_size ? spec->stack_size : KBOX_LOADER_DEFAULT_STACK_SIZE; + stack_spec.uid = spec->uid; + stack_spec.euid = spec->euid; + stack_spec.gid = spec->gid; + stack_spec.egid = spec->egid; + stack_spec.secure = spec->secure; + + if (kbox_loader_build_initial_stack(&stack_spec, &layout->stack) < 0) { + kbox_loader_layout_reset(layout); + return -1; + } + + layout->main_load_bias = main_load_bias; + layout->interp_load_bias = interp_load_bias; + layout->stack_top = stack_spec.stack_top; + layout->stack_size = stack_spec.stack_size; + layout->initial_sp = layout->stack.initial_sp; + + if (append_plan_mappings(layout, &layout->main_plan, spec->page_size, + main_load_bias, KBOX_LOADER_MAPPING_MAIN) < 0) { + kbox_loader_layout_reset(layout); + return -1; + } + if (layout->has_interp && + append_plan_mappings(layout, &layout->interp_plan, spec->page_size, + interp_load_bias, + KBOX_LOADER_MAPPING_INTERP) < 0) { + kbox_loader_layout_reset(layout); + return -1; + } + if (layout->mapping_count >= KBOX_LOADER_MAX_MAPPINGS) { + kbox_loader_layout_reset(layout); + return -1; + } + if (layout->stack_size > layout->stack_top) { + kbox_loader_layout_reset(layout); + return -1; + } + 
layout->mappings[layout->mapping_count++] = (struct kbox_loader_mapping) { + .addr = layout->stack_top - layout->stack_size, + .size = layout->stack_size, + .file_offset = 0, + .file_size = 0, + .zero_fill_start = 0, + .zero_fill_size = 0, + .prot = loader_stack_prot(layout), + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, + .source = KBOX_LOADER_MAPPING_STACK, + }; + + { + uint64_t pc; + uint64_t bias = layout->has_interp ? interp_load_bias : main_load_bias; + uint64_t entry = layout->has_interp ? layout->interp_plan.entry + : layout->main_plan.entry; + + if (__builtin_add_overflow(bias, entry, &pc)) { + kbox_loader_layout_reset(layout); + return -1; + } + layout->initial_pc = pc; + } + return 0; +} diff --git a/src/loader-layout.h b/src/loader-layout.h new file mode 100644 index 0000000..2b8cfdd --- /dev/null +++ b/src/loader-layout.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef KBOX_LOADER_LAYOUT_H +#define KBOX_LOADER_LAYOUT_H + +#include +#include +#include + +#include "loader-stack.h" + +#define KBOX_LOADER_DEFAULT_STACK_SIZE (8u * 1024u * 1024u) +#define KBOX_LOADER_MAX_MAPPINGS (3 * KBOX_ELF_MAX_LOAD_SEGMENTS + 1) + +enum kbox_loader_mapping_source { + KBOX_LOADER_MAPPING_MAIN, + KBOX_LOADER_MAPPING_INTERP, + KBOX_LOADER_MAPPING_STACK, +}; + +struct kbox_loader_mapping { + uint64_t addr; + uint64_t size; + uint64_t file_offset; + uint64_t file_size; + uint64_t zero_fill_start; + uint64_t zero_fill_size; + int prot; + int flags; + enum kbox_loader_mapping_source source; +}; + +struct kbox_loader_layout { + struct kbox_elf_load_plan main_plan; + struct kbox_elf_load_plan interp_plan; + struct kbox_loader_stack_image stack; + struct kbox_loader_mapping mappings[KBOX_LOADER_MAX_MAPPINGS]; + uint64_t main_load_bias; + uint64_t interp_load_bias; + uint64_t initial_pc; + uint64_t initial_sp; + uint64_t stack_top; + uint64_t stack_size; + size_t mapping_count; + int has_interp; +}; + +struct kbox_loader_layout_spec { + const unsigned char 
*main_elf; + size_t main_elf_len; + const unsigned char *interp_elf; + size_t interp_elf_len; + const char *const *argv; + size_t argc; + const char *const *envp; + size_t envc; + const char *execfn; + const unsigned char *random_bytes; + const struct kbox_loader_auxv_entry *extra_auxv; + size_t extra_auxv_count; + uint64_t page_size; + uint64_t stack_top; + uint64_t stack_size; + uint64_t main_load_bias; + uint64_t interp_load_bias; + uint32_t uid; + uint32_t euid; + uint32_t gid; + uint32_t egid; + int secure; +}; + +void kbox_loader_layout_reset(struct kbox_loader_layout *layout); +int kbox_loader_build_layout(const struct kbox_loader_layout_spec *spec, + struct kbox_loader_layout *layout); + +#endif /* KBOX_LOADER_LAYOUT_H */ diff --git a/src/loader-stack.c b/src/loader-stack.c new file mode 100644 index 0000000..ffce48a --- /dev/null +++ b/src/loader-stack.c @@ -0,0 +1,322 @@ +/* SPDX-License-Identifier: MIT */ + +#include +#include + +#include "loader-stack.h" + +/* Use mmap(MAP_ANONYMOUS) instead of malloc/calloc. In trap mode, + * the stack builder runs from a SIGSYS signal handler where the guest + * may hold glibc heap locks. mmap is async-signal-safe. + */ +static void *signal_safe_alloc(size_t size) +{ + void *p; + + if (size == 0) + return NULL; + p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, + -1, 0); + return p == MAP_FAILED ? NULL : p; +} + +static void *signal_safe_zalloc(size_t size) +{ + /* MAP_ANONYMOUS pages are zero-filled by the kernel. 
*/ + return signal_safe_alloc(size); +} + +/* munmap counterpart of signal_safe_alloc; size must match the original allocation. Safe on (NULL, 0). */ +static void signal_safe_free(void *p, size_t size) +{ + if (p && size > 0) + munmap(p, size); +} + +#define AT_NULL 0 +#define AT_PHDR 3 +#define AT_PHENT 4 +#define AT_PHNUM 5 +#define AT_PAGESZ 6 +#define AT_BASE 7 +#define AT_FLAGS 8 +#define AT_ENTRY 9 +#define AT_UID 11 +#define AT_EUID 12 +#define AT_GID 13 +#define AT_EGID 14 +#define AT_SECURE 23 +#define AT_RANDOM 25 +#define AT_EXECFN 31 + +/* Count of auxv entries unconditionally emitted below (PHDR..EXECFN); AT_BASE (interp only) and extra_auxv are added on top when sizing the table. */ +#define LOADER_STACK_AUXV_BASE 13 + +static int add_overflow_u64(uint64_t a, uint64_t b, uint64_t *out) +{ + return __builtin_add_overflow(a, b, out); +} + +/* align must be a power of two. */ +static uint64_t align_down_u64(uint64_t value, uint64_t align) +{ + return value & ~(align - 1); +} + +static int is_power_of_two_u64(uint64_t value) +{ + return value != 0 && (value & (value - 1)) == 0; +} + +/* Unaligned-safe 64-bit store via memcpy. */ +static void write_u64(unsigned char *p, uint64_t value) +{ + memcpy(p, &value, sizeof(value)); +} + +/* Push len bytes downward onto the simulated stack: cursor (a guest address) moves toward base, the bytes land at buf[cursor - base], and the new guest address of the blob is returned via addr_out. Fails if the blob does not fit between base and cursor. */ +static int place_blob(unsigned char *buf, + uint64_t base, + uint64_t *cursor, + const void *src, + size_t len, + uint64_t *addr_out) +{ + if ((uint64_t) len > *cursor - base) + return -1; + *cursor -= (uint64_t) len; + memcpy(buf + (*cursor - base), src, len); + *addr_out = *cursor; + return 0; +} + +/* Push a NUL-terminated string (terminator included). */ +static int place_cstr(unsigned char *buf, + uint64_t base, + uint64_t *cursor, + const char *s, + uint64_t *addr_out) +{ + return place_blob(buf, base, cursor, s, strlen(s) + 1, addr_out); +} + +/* Release the stack image buffer (capacity bytes) and clear all fields. */ +void kbox_loader_stack_image_reset(struct kbox_loader_stack_image *image) +{ + if (!image) + return; + signal_safe_free(image->data, image->capacity); + memset(image, 0, sizeof(*image)); +} + +/* Build the initial guest stack image in a host-side buffer: argv/envp strings, AT_RANDOM bytes, AT_EXECFN, the auxv, and the 16-byte-aligned argc/argv/envp/auxv pointer table. On success image->data holds the bytes to copy at image->initial_sp; returns -1 and resets image on any failure. */ +int kbox_loader_build_initial_stack(const struct kbox_loader_stack_spec *spec, + struct kbox_loader_stack_image *image) +{ + uint64_t stack_base; + uint64_t cursor; + uint64_t table_start; + uint64_t table_size; + uint64_t auxc; + uint64_t words; + uint64_t image_size; + uint64_t offset; + unsigned char *buf = NULL; + size_t buf_size; + uint64_t *argv_addrs = NULL; + size_t argv_addrs_size;
+ uint64_t *env_addrs = NULL; + size_t env_addrs_size; + struct kbox_loader_auxv_entry *auxv = NULL; + size_t auxv_size; + size_t auxi = 0; + + if (!spec || !image || !spec->main_plan || !spec->argv || spec->argc == 0) + return -1; + if (spec->envc > 0 && !spec->envp) + return -1; + kbox_loader_stack_image_reset(image); + if (!is_power_of_two_u64(spec->page_size) || spec->stack_size == 0) + return -1; + if (spec->stack_top < spec->stack_size) + return -1; + + stack_base = spec->stack_top - spec->stack_size; + cursor = spec->stack_top; + + buf_size = (size_t) spec->stack_size; + buf = signal_safe_zalloc(buf_size); + if (!buf) + return -1; + + if (__builtin_mul_overflow(spec->argc, sizeof(*argv_addrs), + &argv_addrs_size)) + goto fail; + if (__builtin_mul_overflow(spec->envc ? spec->envc : 1, sizeof(*env_addrs), + &env_addrs_size)) + goto fail; + { + size_t auxv_count = LOADER_STACK_AUXV_BASE + spec->extra_auxv_count + + (spec->interp_plan ? 1 : 0); + if (__builtin_mul_overflow(auxv_count, sizeof(*auxv), &auxv_size)) + goto fail; + } + argv_addrs = signal_safe_zalloc(argv_addrs_size); + env_addrs = signal_safe_zalloc(env_addrs_size); + auxv = signal_safe_zalloc(auxv_size); + if (!argv_addrs || !env_addrs || !auxv) + goto fail; + + if (place_blob(buf, stack_base, &cursor, + spec->random_bytes + ? spec->random_bytes + : (const unsigned char[KBOX_LOADER_RANDOM_SIZE]) {0}, + KBOX_LOADER_RANDOM_SIZE, &image->random_addr) < 0) + goto fail; + + if (place_cstr(buf, stack_base, &cursor, + spec->execfn ? 
spec->execfn : spec->argv[0], + &image->execfn_addr) < 0) + goto fail; + + for (size_t i = spec->envc; i > 0; i--) { + if (place_cstr(buf, stack_base, &cursor, spec->envp[i - 1], + &env_addrs[i - 1]) < 0) + goto fail; + } + + for (size_t i = spec->argc; i > 0; i--) { + if (place_cstr(buf, stack_base, &cursor, spec->argv[i - 1], + &argv_addrs[i - 1]) < 0) + goto fail; + } + + { + uint64_t phdr_addr; + + if (__builtin_add_overflow(spec->main_load_bias, + spec->main_plan->phdr_vaddr, &phdr_addr)) + goto fail; + auxv[auxi++] = (struct kbox_loader_auxv_entry) { + .key = AT_PHDR, + .value = phdr_addr, + }; + } + auxv[auxi++] = (struct kbox_loader_auxv_entry) { + .key = AT_PHENT, + .value = spec->main_plan->phentsize, + }; + auxv[auxi++] = (struct kbox_loader_auxv_entry) { + .key = AT_PHNUM, + .value = spec->main_plan->phnum, + }; + auxv[auxi++] = (struct kbox_loader_auxv_entry) { + .key = AT_PAGESZ, + .value = spec->page_size, + }; + if (spec->interp_plan) { + auxv[auxi++] = (struct kbox_loader_auxv_entry) { + .key = AT_BASE, + .value = spec->interp_load_bias, + }; + } + auxv[auxi++] = (struct kbox_loader_auxv_entry) { + .key = AT_FLAGS, + .value = 0, + }; + { + uint64_t entry_addr; + + if (__builtin_add_overflow(spec->main_load_bias, spec->main_plan->entry, + &entry_addr)) + goto fail; + auxv[auxi++] = (struct kbox_loader_auxv_entry) { + .key = AT_ENTRY, + .value = entry_addr, + }; + } + auxv[auxi++] = (struct kbox_loader_auxv_entry) { + .key = AT_UID, + .value = spec->uid, + }; + auxv[auxi++] = (struct kbox_loader_auxv_entry) { + .key = AT_EUID, + .value = spec->euid, + }; + auxv[auxi++] = (struct kbox_loader_auxv_entry) { + .key = AT_GID, + .value = spec->gid, + }; + auxv[auxi++] = (struct kbox_loader_auxv_entry) { + .key = AT_EGID, + .value = spec->egid, + }; + auxv[auxi++] = (struct kbox_loader_auxv_entry) { + .key = AT_SECURE, + .value = spec->secure ? 
1u : 0u, + }; + auxv[auxi++] = (struct kbox_loader_auxv_entry) { + .key = AT_RANDOM, + .value = image->random_addr, + }; + auxv[auxi++] = (struct kbox_loader_auxv_entry) { + .key = AT_EXECFN, + .value = image->execfn_addr, + }; + + for (size_t i = 0; i < spec->extra_auxv_count; i++) + auxv[auxi++] = spec->extra_auxv[i]; + + auxc = (uint64_t) auxi; + if (add_overflow_u64(1 + spec->argc + 1 + spec->envc + 1, 2 * (auxc + 1), + &words)) + goto fail; + if (__builtin_mul_overflow(words, (uint64_t) sizeof(uint64_t), &table_size)) + goto fail; + if (cursor < stack_base + table_size) + goto fail; + + table_start = align_down_u64(cursor - table_size, 16); + if (table_start < stack_base) + goto fail; + + image_size = spec->stack_top - table_start; + if (image_size > spec->stack_size) + goto fail; + + image->initial_sp = table_start; + image->size = (size_t) image_size; + + offset = 0; + write_u64(buf + (table_start - stack_base) + offset, spec->argc); + offset += sizeof(uint64_t); + for (size_t i = 0; i < spec->argc; i++, offset += sizeof(uint64_t)) + write_u64(buf + (table_start - stack_base) + offset, argv_addrs[i]); + write_u64(buf + (table_start - stack_base) + offset, 0); + offset += sizeof(uint64_t); + for (size_t i = 0; i < spec->envc; i++, offset += sizeof(uint64_t)) + write_u64(buf + (table_start - stack_base) + offset, env_addrs[i]); + write_u64(buf + (table_start - stack_base) + offset, 0); + offset += sizeof(uint64_t); + for (size_t i = 0; i < auxi; i++) { + write_u64(buf + (table_start - stack_base) + offset, auxv[i].key); + offset += sizeof(uint64_t); + write_u64(buf + (table_start - stack_base) + offset, auxv[i].value); + offset += sizeof(uint64_t); + } + write_u64(buf + (table_start - stack_base) + offset, AT_NULL); + offset += sizeof(uint64_t); + write_u64(buf + (table_start - stack_base) + offset, 0); + + image->capacity = image->size; + image->data = signal_safe_alloc(image->capacity); + if (!image->data) + goto fail; + memcpy(image->data, buf + 
(table_start - stack_base), image->size); + + signal_safe_free(buf, buf_size); + signal_safe_free(argv_addrs, argv_addrs_size); + signal_safe_free(env_addrs, env_addrs_size); + signal_safe_free(auxv, auxv_size); + return 0; + +fail: + signal_safe_free(buf, buf_size); + signal_safe_free(argv_addrs, argv_addrs_size); + signal_safe_free(env_addrs, env_addrs_size); + signal_safe_free(auxv, auxv_size); + kbox_loader_stack_image_reset(image); + return -1; +} diff --git a/src/loader-stack.h b/src/loader-stack.h new file mode 100644 index 0000000..379a220 --- /dev/null +++ b/src/loader-stack.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef KBOX_LOADER_STACK_H +#define KBOX_LOADER_STACK_H + +#include +#include + +#include "kbox/elf.h" + +#define KBOX_LOADER_RANDOM_SIZE 16 + +struct kbox_loader_auxv_entry { + uint64_t key; + uint64_t value; +}; + +struct kbox_loader_stack_spec { + const char *const *argv; + size_t argc; + const char *const *envp; + size_t envc; + const char *execfn; + const unsigned char *random_bytes; + const struct kbox_loader_auxv_entry *extra_auxv; + size_t extra_auxv_count; + const struct kbox_elf_load_plan *main_plan; + const struct kbox_elf_load_plan *interp_plan; + uint64_t main_load_bias; + uint64_t interp_load_bias; + uint64_t page_size; + uint64_t stack_top; + uint64_t stack_size; + uint32_t uid; + uint32_t euid; + uint32_t gid; + uint32_t egid; + int secure; +}; + +struct kbox_loader_stack_image { + unsigned char *data; + size_t size; + size_t capacity; + uint64_t initial_sp; + uint64_t random_addr; + uint64_t execfn_addr; +}; + +void kbox_loader_stack_image_reset(struct kbox_loader_stack_image *image); +int kbox_loader_build_initial_stack(const struct kbox_loader_stack_spec *spec, + struct kbox_loader_stack_image *image); + +#endif /* KBOX_LOADER_STACK_H */ diff --git a/src/loader-transfer.c b/src/loader-transfer.c new file mode 100644 index 0000000..afc880d --- /dev/null +++ b/src/loader-transfer.c @@ -0,0 +1,89 @@ +/* 
/* Validate a loader handoff and snapshot the register/stack state needed to
 * jump into the guest.
 *
 * Rejects a handoff whose entry PC or SP lies outside the mappings it claims,
 * or whose SP is not 16-byte aligned (both x86-64 and AArch64 ABIs require a
 * 16-byte-aligned stack at entry).
 *
 * Returns 0 on success (state fully populated), -1 on any validation failure.
 * On failure *state is left untouched.
 */
int kbox_loader_prepare_transfer(const struct kbox_loader_handoff *handoff,
                                 struct kbox_loader_transfer_state *state)
{
    if (!handoff || !state)
        return -1;
    /* Each mapping must be a non-empty, well-ordered [start, end) range. */
    if (handoff->entry_map_end <= handoff->entry_map_start ||
        handoff->stack_map_end <= handoff->stack_map_start) {
        return -1;
    }
    /* Entry point must land inside the executable mapping... */
    if (handoff->entry.pc < handoff->entry_map_start ||
        handoff->entry.pc >= handoff->entry_map_end) {
        return -1;
    }
    /* ...and the initial SP inside the stack mapping. */
    if (handoff->entry.sp < handoff->stack_map_start ||
        handoff->entry.sp >= handoff->stack_map_end) {
        return -1;
    }
    /* ABI requirement: 16-byte stack alignment at entry. */
    if ((handoff->entry.sp & 0xfu) != 0)
        return -1;

    memset(state, 0, sizeof(*state));
    state->arch = handoff->entry.arch;
    state->pc = handoff->entry.pc;
    state->sp = handoff->entry.sp;
    memcpy(state->regs, handoff->entry.regs, sizeof(state->regs));
    state->entry_map_start = handoff->entry_map_start;
    state->entry_map_end = handoff->entry_map_end;
    state->stack_map_start = handoff->stack_map_start;
    state->stack_map_end = handoff->stack_map_end;
    return 0;
}

/* Final, irreversible control transfer into the guest.
 *
 * Loads the six captured argument registers, installs the guest SP, and jumps
 * to the guest PC.  Never returns; any inconsistency (NULL state, or the
 * state's arch not matching the arch this binary was compiled for) traps.
 *
 * The six regs[] values are pinned to specific registers via register-asm
 * variables and listed as asm inputs so the compiler cannot clobber them
 * between the loads and the jump; only %0 (sp) and %1 (pc) appear in the asm
 * template itself.
 */
__attribute__((noreturn)) void kbox_loader_transfer_to_guest(
    const struct kbox_loader_transfer_state *state)
{
    if (!state)
        __builtin_trap();

#if defined(__x86_64__)
    if (state->arch != KBOX_LOADER_ENTRY_ARCH_X86_64)
        __builtin_trap();
    register uint64_t rdi __asm__("rdi") = state->regs[0];
    register uint64_t rsi __asm__("rsi") = state->regs[1];
    register uint64_t rdx __asm__("rdx") = state->regs[2];
    register uint64_t r10 __asm__("r10") = state->regs[3];
    register uint64_t r8 __asm__("r8") = state->regs[4];
    register uint64_t r9 __asm__("r9") = state->regs[5];
    uint64_t sp = state->sp;
    uint64_t pc = state->pc;

    /* Switch to the guest stack, then indirect-jump to the guest PC.  sp and
     * pc are "r" operands, so both live in registers when rsp is replaced. */
    __asm__ volatile(
        "mov %0, %%rsp\n\t"
        "jmp *%1\n\t"
        :
        : "r"(sp), "r"(pc), "r"(rdi), "r"(rsi), "r"(rdx), "r"(r10), "r"(r8),
          "r"(r9)
        : "memory");
#elif defined(__aarch64__)
    if (state->arch != KBOX_LOADER_ENTRY_ARCH_AARCH64)
        __builtin_trap();
    register uint64_t x0 __asm__("x0") = state->regs[0];
    register uint64_t x1 __asm__("x1") = state->regs[1];
    register uint64_t x2 __asm__("x2") = state->regs[2];
    register uint64_t x3 __asm__("x3") = state->regs[3];
    register uint64_t x4 __asm__("x4") = state->regs[4];
    register uint64_t x5 __asm__("x5") = state->regs[5];
    /* x16 (IP0) is the intra-procedure-call scratch register, so it is safe
     * to burn as the branch target. */
    register uint64_t x16 __asm__("x16") = state->pc;
    uint64_t sp = state->sp;

    __asm__ volatile(
        "mov sp, %0\n\t"
        "br x16\n\t"
        :
        : "r"(sp), "r"(x0), "r"(x1), "r"(x2), "r"(x3), "r"(x4), "r"(x5),
          "r"(x16)
        : "memory");
#else
    (void) state;
    __builtin_trap();
#endif

    __builtin_unreachable();
}
*/ uint8_t buf[MAX_PKT_SIZE]; size_t remaining = pkt_len; @@ -774,7 +771,7 @@ static void drain_tx_pipe(void) ssize_t n = read(tx_pipe[0], &pkt_len, sizeof(pkt_len)); if (n != sizeof(pkt_len)) break; - if (pkt_len == 0 || pkt_len > MAX_PKT_SIZE) + if (pkt_len == 0) break; size_t remaining = pkt_len; diff --git a/src/probe.c b/src/probe.c index 8215a99..00eab59 100644 --- a/src/probe.c +++ b/src/probe.c @@ -146,6 +146,54 @@ static int probe_seccomp_listener(void) return -1; } +static int probe_seccomp_filter_basic(void) +{ + pid_t pid = fork(); + if (pid < 0) { + fprintf(stderr, "probe: fork: %s\n", strerror(errno)); + return -1; + } + + if (pid == 0) { + struct kbox_sock_filter filter[2]; + struct kbox_sock_fprog prog; + long ret; + + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) + _exit(1); + + filter[0] = (struct kbox_sock_filter) { + KBOX_BPF_LD | KBOX_BPF_W | KBOX_BPF_ABS, 0, 0, 0}; + filter[1] = (struct kbox_sock_filter) {KBOX_BPF_RET | KBOX_BPF_K, 0, 0, + KBOX_SECCOMP_RET_ALLOW}; + prog.len = 2; + prog.filter = filter; + + ret = syscall(__NR_seccomp, KBOX_SECCOMP_SET_MODE_FILTER, 0, &prog); + if (ret < 0) + _exit(2); + _exit(0); + } + + { + int status = 0; + pid_t w; + + do { + w = waitpid(pid, &status, 0); + } while (w < 0 && errno == EINTR); + if (w < 0 || !WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fprintf(stderr, + "probe: FAIL -- seccomp(SET_MODE_FILTER) is not " + "supported.\n" + " This is required for all syscall modes.\n"); + return -1; + } + } + + return 0; +} + /* Check process_vm_readv works between parent and child. * * Ubuntu's AppArmor and Yama LSM (ptrace_scope >= 1) can restrict @@ -248,24 +296,72 @@ static int probe_yama_scope(void) /* Public entry point. 
*/ -int kbox_probe_host_features(void) +int kbox_collect_probe_result(enum kbox_syscall_mode mode, + struct kbox_probe_result *out) +{ + int need_supervisor = 0; + + if (!out) + return -1; + + memset(out, 0, sizeof(*out)); + + if (mode == KBOX_SYSCALL_MODE_SECCOMP || mode == KBOX_SYSCALL_MODE_AUTO) + need_supervisor = 1; + + out->no_new_privs_ok = probe_no_new_privs() == 0; + out->seccomp_filter_ok = probe_seccomp_filter_basic() == 0; + + if (need_supervisor) { + out->seccomp_listener_ok = probe_seccomp_listener() == 0; + out->process_vm_readv_ok = probe_process_vm_readv() == 0; + } + + return 0; +} + +int kbox_probe_host_features(enum kbox_syscall_mode mode) { int failures = 0; + int need_supervisor = 0; + struct kbox_probe_result result; fprintf(stderr, "kbox: probing host features...\n"); /* Advisory check first. */ probe_yama_scope(); - if (probe_no_new_privs() < 0) - failures++; + if (kbox_collect_probe_result(mode, &result) < 0) + return -1; - if (probe_seccomp_listener() < 0) - failures++; + if (mode == KBOX_SYSCALL_MODE_SECCOMP) + need_supervisor = 1; - if (probe_process_vm_readv() < 0) + if (!result.no_new_privs_ok) + failures++; + if (!result.seccomp_filter_ok) failures++; + if (need_supervisor) { + if (!result.seccomp_listener_ok) + failures++; + if (!result.process_vm_readv_ok) + failures++; + } + + /* AUTO probes supervisor features but only warns (doesn't fail). + * The launch path selects trap for direct binaries and seccomp + * for shells. If seccomp-unotify is unavailable, AUTO still + * works for direct binaries via the trap path. 
/* Extract the guest PID stashed in the opaque slot of a process_vm-backed
 * guest-mem handle. */
static inline pid_t guest_pid(const struct kbox_guest_mem *guest)
{
    return (pid_t) guest->opaque;
}

/* Lazily open /proc/self/mem once and cache the fd for the process lifetime.
 *
 * Thread-safe via an acquire load plus a compare-exchange: the CAS loser
 * closes its duplicate and adopts the winner's fd.  The fd is never closed.
 *
 * NOTE(review): the cached fd is bound to the address space of the process
 * that opened it; after a fork() the child would still be operating on the
 * parent's memory through it.  Assumes the rewrite runtime does not fork
 * without exec -- TODO confirm.
 *
 * Returns the fd (>= 0) or -1 with errno set by open(). */
static int ensure_self_mem_fd(void)
{
    static int self_mem_fd = -1;
    int fd = __atomic_load_n(&self_mem_fd, __ATOMIC_ACQUIRE);

    if (fd >= 0)
        return fd;

    fd = open("/proc/self/mem", O_RDWR | O_CLOEXEC);
    if (fd < 0)
        return -1;

    {
        int expected = -1;
        if (!__atomic_compare_exchange_n(&self_mem_fd, &expected, fd, 0,
                                         __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)) {
            close(fd);
            fd = expected;
        }
    }

    return fd;
}

/* Read len bytes from this process's own address space at remote_addr via
 * the cached /proc/self/mem fd.
 *
 * Returns 0 on success, -EFAULT for a NULL address/buffer, -EIO on short
 * read, or -errno from open()/pread(). */
static int self_mem_read(uint64_t remote_addr, void *out, size_t len)
{
    int fd;
    ssize_t ret;

    if (len == 0)
        return 0;
    if (remote_addr == 0 || !out)
        return -EFAULT;

    fd = ensure_self_mem_fd();
    if (fd < 0)
        return -errno;

    ret = pread(fd, out, len, (off_t) remote_addr);
    if (ret < 0)
        return -errno;
    if ((size_t) ret != len)
        return -EIO;
    return 0;
}

/* Read a NUL-terminated string from our own address space in bounded chunks.
 *
 * Reads at most max_len bytes into buf, 256 bytes at a time, stopping at the
 * first NUL.  A short pread() before the NUL is treated as -EFAULT (the next
 * page is unmapped) rather than silently truncating the string.
 *
 * Returns the string length (excluding NUL) on success; -EFAULT, -EINVAL,
 * -EIO, -ENAMETOOLONG, or -errno on failure.  buf is NUL-terminated only on
 * the ENAMETOOLONG path. */
static int self_mem_read_string(uint64_t remote_addr, char *buf, size_t max_len)
{
    int fd;
    size_t total = 0;

    enum {
        KBOX_STRING_READ_CHUNK = 256,
    };

    if (remote_addr == 0)
        return -EFAULT;
    if (max_len == 0)
        return -EINVAL;

    fd = ensure_self_mem_fd();
    if (fd < 0)
        return -errno;

    while (total < max_len) {
        ssize_t n;
        size_t i;
        size_t chunk = max_len - total;

        if (chunk > KBOX_STRING_READ_CHUNK)
            chunk = KBOX_STRING_READ_CHUNK;

        n = pread(fd, buf + total, chunk, (off_t) (remote_addr + total));
        if (n < 0)
            return -errno;
        if (n == 0)
            return -EIO;

        for (i = 0; i < (size_t) n; i++) {
            if (buf[total + i] == '\0')
                return (int) (total + i);
        }

        total += (size_t) n;

        /* Short read before NUL: the following page is not readable. */
        if ((size_t) n < chunk)
            return -EFAULT;
    }

    if (max_len > 0)
        buf[0] = '\0';
    return -ENAMETOOLONG;
}

/* "current process" guest-mem primitives: same contracts as the kbox_vm_*
 * family, but targeting our own address space. */

int kbox_current_read(uint64_t remote_addr, void *out, size_t len)
{
    return self_mem_read(remote_addr, out, len);
}

/* Writes honor page protections (process_vm_writev on self). */
int kbox_current_write(uint64_t remote_addr, const void *in, size_t len)
{
    return kbox_vm_write(getpid(), remote_addr, in, len);
}

/* Forced write: /proc/self/mem pwrite bypasses page protections (e.g. for
 * patching read-only code).  Opens a fresh write-only fd per call instead of
 * reusing the cached O_RDWR one. */
int kbox_current_write_force(uint64_t remote_addr, const void *in, size_t len)
{
    static const char proc_self_mem[] = "/proc/self/mem";
    int fd;
    ssize_t n;

    if (len == 0)
        return 0;
    if (remote_addr == 0 || !in)
        return -EFAULT;

    fd = open(proc_self_mem, O_WRONLY | O_CLOEXEC);
    if (fd < 0)
        return -errno;

    n = pwrite(fd, in, len, (off_t) remote_addr);
    if (n < 0) {
        int saved_errno = errno;
        close(fd);
        return -saved_errno;
    }
    close(fd);

    if ((size_t) n != len)
        return -EIO;
    return 0;
}

int kbox_current_read_string(uint64_t remote_addr, char *buf, size_t max_len)
{
    return self_mem_read_string(remote_addr, buf, max_len);
}

/* Copy a struct kbox_open_how from our own memory, enforcing that the
 * guest-supplied size matches the structure exactly (-EINVAL if smaller,
 * -E2BIG if larger). */
int kbox_current_read_open_how(uint64_t remote_addr,
                               uint64_t size,
                               struct kbox_open_how *out)
{
    uint64_t expected = (uint64_t) sizeof(struct kbox_open_how);

    if (remote_addr == 0)
        return -EFAULT;
    if (size < expected)
        return -EINVAL;
    if (size > expected)
        return -E2BIG;

    memset(out, 0, sizeof(*out));
    return kbox_current_read(remote_addr, out, sizeof(*out));
}
remote_addr; remote_iov.iov_len = len; - ret = process_vm_readv(pid, &local_iov, 1, &remote_iov, 1, 0); + ret = syscall(SYS_process_vm_readv, pid, &local_iov, 1, &remote_iov, 1, 0); if (ret < 0) return -errno; if ((size_t) ret != len) @@ -45,6 +207,8 @@ int kbox_vm_write(pid_t pid, uint64_t remote_addr, const void *in, size_t len) if (len == 0) return 0; + if (remote_addr == 0 || !in) + return -EFAULT; /* process_vm_writev takes a non-const iov_base, but we only read from the * local buffer. The cast is safe. @@ -54,7 +218,7 @@ int kbox_vm_write(pid_t pid, uint64_t remote_addr, const void *in, size_t len) remote_iov.iov_base = (void *) (uintptr_t) remote_addr; remote_iov.iov_len = len; - ret = process_vm_writev(pid, &local_iov, 1, &remote_iov, 1, 0); + ret = syscall(SYS_process_vm_writev, pid, &local_iov, 1, &remote_iov, 1, 0); if (ret < 0) return -errno; if ((size_t) ret != len) @@ -69,39 +233,50 @@ int kbox_vm_read_string(pid_t pid, { struct iovec local_iov; struct iovec remote_iov; - ssize_t n; - size_t i; + size_t total = 0; + + enum { + KBOX_STRING_READ_CHUNK = 256, + }; if (remote_addr == 0) return -EFAULT; if (max_len == 0) return -EINVAL; - local_iov.iov_base = buf; - local_iov.iov_len = max_len; - remote_iov.iov_base = (void *) (uintptr_t) remote_addr; - remote_iov.iov_len = max_len; + while (total < max_len) { + ssize_t n; + size_t i; + size_t chunk = max_len - total; + + if (chunk > KBOX_STRING_READ_CHUNK) + chunk = KBOX_STRING_READ_CHUNK; + + local_iov.iov_base = buf + total; + local_iov.iov_len = chunk; + remote_iov.iov_base = + (void *) (uintptr_t) (remote_addr + (uint64_t) total); + remote_iov.iov_len = chunk; + + n = syscall(SYS_process_vm_readv, pid, &local_iov, 1, &remote_iov, 1, + 0); + if (n <= 0) + return errno ? -errno : -EIO; + + for (i = 0; i < (size_t) n; i++) { + if (buf[total + i] == '\0') + return (int) (total + i); + } - n = process_vm_readv(pid, &local_iov, 1, &remote_iov, 1, 0); - if (n <= 0) - return errno ? 
-errno : -EIO; + total += (size_t) n; - /* Find the NUL terminator within the bytes we actually read. */ - for (i = 0; i < (size_t) n; i++) { - if (buf[i] == '\0') - return (int) i; + /* Short read before NUL means the next page isn't readable. */ + if ((size_t) n < chunk) + return -EFAULT; } - /* No NUL found in the read data. Two possible reasons: - * - Short read (page boundary): tracee memory is faulted. - * - Full read: path exceeds PATH_MAX. - * - * Either way, do not silently truncate: a truncated path could resolve to - * an unintended file. - */ - buf[0] = '\0'; - if ((size_t) n < max_len) - return -EFAULT; + if (max_len > 0) + buf[0] = '\0'; return -ENAMETOOLONG; } @@ -134,18 +309,173 @@ int kbox_vm_write_force(pid_t pid, if (len == 0) return 0; + if (remote_addr == 0 || !in) + return -EFAULT; snprintf(proc_path, sizeof(proc_path), "/proc/%d/mem", (int) pid); - fd = open(proc_path, O_WRONLY); + fd = open(proc_path, O_WRONLY | O_CLOEXEC); if (fd < 0) return -errno; n = pwrite(fd, in, len, (off_t) remote_addr); + if (n < 0) { + int saved_errno = errno; + close(fd); + return -saved_errno; + } close(fd); - - if (n < 0) - return -errno; if ((size_t) n != len) return -EIO; return 0; } + +static int process_vm_guest_read(const struct kbox_guest_mem *guest, + uint64_t remote_addr, + void *out, + size_t len) +{ + return kbox_vm_read(guest_pid(guest), remote_addr, out, len); +} + +static int process_vm_guest_write(const struct kbox_guest_mem *guest, + uint64_t remote_addr, + const void *in, + size_t len) +{ + return kbox_vm_write(guest_pid(guest), remote_addr, in, len); +} + +static int process_vm_guest_write_force(const struct kbox_guest_mem *guest, + uint64_t remote_addr, + const void *in, + size_t len) +{ + return kbox_vm_write_force(guest_pid(guest), remote_addr, in, len); +} + +static int process_vm_guest_read_string(const struct kbox_guest_mem *guest, + uint64_t remote_addr, + char *buf, + size_t max_len) +{ + return kbox_vm_read_string(guest_pid(guest), 
static int process_vm_guest_read_open_how(const struct kbox_guest_mem *guest,
                                          uint64_t remote_addr,
                                          uint64_t size,
                                          struct kbox_open_how *out)
{
    return kbox_vm_read_open_how(guest_pid(guest), remote_addr, size, out);
}

/* "current" backend: the guest shares our address space (trap/rewrite mode),
 * so every op forwards to the kbox_current_* self-memory primitives and the
 * handle itself carries no state. */

static int current_guest_read(const struct kbox_guest_mem *guest,
                              uint64_t remote_addr,
                              void *out,
                              size_t len)
{
    (void) guest;
    return kbox_current_read(remote_addr, out, len);
}

static int current_guest_write(const struct kbox_guest_mem *guest,
                               uint64_t remote_addr,
                               const void *in,
                               size_t len)
{
    (void) guest;
    return kbox_current_write(remote_addr, in, len);
}

static int current_guest_write_force(const struct kbox_guest_mem *guest,
                                     uint64_t remote_addr,
                                     const void *in,
                                     size_t len)
{
    (void) guest;
    return kbox_current_write_force(remote_addr, in, len);
}

static int current_guest_read_string(const struct kbox_guest_mem *guest,
                                     uint64_t remote_addr,
                                     char *buf,
                                     size_t max_len)
{
    (void) guest;
    return kbox_current_read_string(remote_addr, buf, max_len);
}

static int current_guest_read_open_how(const struct kbox_guest_mem *guest,
                                       uint64_t remote_addr,
                                       uint64_t size,
                                       struct kbox_open_how *out)
{
    (void) guest;
    return kbox_current_read_open_how(remote_addr, size, out);
}

/* Backend for guests in a separate process (supervisor mode): ops go through
 * process_vm_readv/writev and /proc/<pid>/mem, keyed by the PID stored in
 * guest->opaque. */
const struct kbox_guest_mem_ops kbox_process_vm_guest_mem_ops = {
    .read = process_vm_guest_read,
    .write = process_vm_guest_write,
    .write_force = process_vm_guest_write_force,
    .read_string = process_vm_guest_read_string,
    .read_open_how = process_vm_guest_read_open_how,
};

/* Backend for in-process guests (trap/rewrite mode). */
const struct kbox_guest_mem_ops kbox_current_guest_mem_ops = {
    .read = current_guest_read,
    .write = current_guest_write,
    .write_force = current_guest_write_force,
    .read_string = current_guest_read_string,
    .read_open_how = current_guest_read_open_how,
};

/* Dispatchers: validate the handle and its vtable slot, then forward.
 * A NULL handle, vtable, or slot yields -EINVAL; otherwise the backend's
 * return convention (0 / length / -errno) passes through unchanged. */

int kbox_guest_mem_read(const struct kbox_guest_mem *guest,
                        uint64_t remote_addr,
                        void *out,
                        size_t len)
{
    if (!guest || !guest->ops || !guest->ops->read)
        return -EINVAL;
    return guest->ops->read(guest, remote_addr, out, len);
}

int kbox_guest_mem_write(const struct kbox_guest_mem *guest,
                         uint64_t remote_addr,
                         const void *in,
                         size_t len)
{
    if (!guest || !guest->ops || !guest->ops->write)
        return -EINVAL;
    return guest->ops->write(guest, remote_addr, in, len);
}

int kbox_guest_mem_write_force(const struct kbox_guest_mem *guest,
                               uint64_t remote_addr,
                               const void *in,
                               size_t len)
{
    if (!guest || !guest->ops || !guest->ops->write_force)
        return -EINVAL;
    return guest->ops->write_force(guest, remote_addr, in, len);
}

int kbox_guest_mem_read_string(const struct kbox_guest_mem *guest,
                               uint64_t remote_addr,
                               char *buf,
                               size_t max_len)
{
    if (!guest || !guest->ops || !guest->ops->read_string)
        return -EINVAL;
    return guest->ops->read_string(guest, remote_addr, buf, max_len);
}

int kbox_guest_mem_read_open_how(const struct kbox_guest_mem *guest,
                                 uint64_t remote_addr,
                                 uint64_t size,
                                 struct kbox_open_how *out)
{
    if (!guest || !guest->ops || !guest->ops->read_open_how)
        return -EINVAL;
    return guest->ops->read_open_how(guest, remote_addr, size, out);
}
kbox_guest_mem { + const struct kbox_guest_mem_ops *ops; + uintptr_t opaque; +}; + +extern const struct kbox_guest_mem_ops kbox_process_vm_guest_mem_ops; +extern const struct kbox_guest_mem_ops kbox_current_guest_mem_ops; + int kbox_vm_read(pid_t pid, uint64_t remote_addr, void *out, size_t len); int kbox_vm_write(pid_t pid, uint64_t remote_addr, const void *in, size_t len); int kbox_vm_write_force(pid_t pid, @@ -23,5 +56,32 @@ int kbox_vm_read_open_how(pid_t pid, uint64_t remote_addr, uint64_t size, struct kbox_open_how *out); +int kbox_guest_mem_read(const struct kbox_guest_mem *guest, + uint64_t remote_addr, + void *out, + size_t len); +int kbox_guest_mem_write(const struct kbox_guest_mem *guest, + uint64_t remote_addr, + const void *in, + size_t len); +int kbox_guest_mem_write_force(const struct kbox_guest_mem *guest, + uint64_t remote_addr, + const void *in, + size_t len); +int kbox_guest_mem_read_string(const struct kbox_guest_mem *guest, + uint64_t remote_addr, + char *buf, + size_t max_len); +int kbox_guest_mem_read_open_how(const struct kbox_guest_mem *guest, + uint64_t remote_addr, + uint64_t size, + struct kbox_open_how *out); +int kbox_current_read(uint64_t remote_addr, void *out, size_t len); +int kbox_current_write(uint64_t remote_addr, const void *in, size_t len); +int kbox_current_write_force(uint64_t remote_addr, const void *in, size_t len); +int kbox_current_read_string(uint64_t remote_addr, char *buf, size_t max_len); +int kbox_current_read_open_how(uint64_t remote_addr, + uint64_t size, + struct kbox_open_how *out); #endif /* KBOX_PROCMEM_H */ diff --git a/src/rewrite.c b/src/rewrite.c new file mode 100644 index 0000000..dd5fe24 --- /dev/null +++ b/src/rewrite.c @@ -0,0 +1,4060 @@ +/* SPDX-License-Identifier: MIT */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kbox/elf.h" + +#include "io-util.h" +#include "kbox/x86-decode.h" +#include "rewrite.h" +#include "syscall-nr.h" 
+#include "syscall-trap.h" + +#define EM_X86_64 62 +#define EM_AARCH64 183 +#define AARCH64_B_OPCODE 0x14000000u +#define AARCH64_B_IMM26_MASK 0x03ffffffu +#define AARCH64_B_RANGE ((int64_t) 128 * 1024 * 1024) +#define AARCH64_LDR_LITERAL_OPCODE 0x58000000u +#define AARCH64_BR_OPCODE 0xd61f0000u +#define AARCH64_NOP_OPCODE 0xd503201fu +#define AARCH64_REWRITE_SLOT_SIZE 32u +#define X86_64_REWRITE_PAGE_ZERO_SIZE (64u * 1024u) +#define X86_64_REWRITE_WRAPPER_SLOT_SIZE 32u +#define X86_64_MOVABS_R11_OPCODE_LEN 10u +#define X86_64_JMP_R11_OPCODE_LEN 3u +#define X86_64_PAGE_ZERO_TAIL_LEN \ + (X86_64_MOVABS_R11_OPCODE_LEN + X86_64_JMP_R11_OPCODE_LEN) +#define X86_64_JMP_REL32_OPCODE_LEN 5u +#define X86_64_WRAPPER_SITE_LEN 8u +#define X86_64_TRAMPOLINE_SEARCH_STEP (64u * 1024u) +#define X86_64_TRAMPOLINE_SEARCH_LIMIT ((uint64_t) INT32_MAX - 4096u) +#define AARCH64_VENEER_SIZE 16u /* LDR x16, +8; BR x16; .quad target */ +#define AARCH64_VENEER_SEARCH_STEP (64u * 1024u) +#define AARCH64_VENEER_SEARCH_LIMIT ((uint64_t) 127 * 1024 * 1024) + +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0x100000 +#endif + +struct analyze_ctx { + enum kbox_rewrite_arch arch; + kbox_rewrite_site_cb cb; + kbox_rewrite_planned_site_cb planned_cb; + void *opaque; + size_t candidates; + size_t segments; +}; + +struct runtime_planned_site { + struct kbox_rewrite_planned_site planned; + uint64_t actual_site_addr; + uint64_t actual_trampoline_addr; + enum kbox_loader_mapping_source source; + enum kbox_rewrite_wrapper_candidate_kind wrapper_kind; + uint64_t wrapper_nr; +}; + +struct runtime_site_array { + struct runtime_planned_site *sites; + size_t count; + size_t cap; +}; + +struct runtime_collect_ctx { + struct runtime_site_array *array; + uint64_t load_bias; + enum kbox_loader_mapping_source source; +}; + +static struct kbox_rewrite_runtime *active_rewrite_runtime; + +static inline struct kbox_rewrite_runtime *load_active_rewrite_runtime(void) +{ + return 
__atomic_load_n(&active_rewrite_runtime, __ATOMIC_ACQUIRE); +} + +static inline void store_active_rewrite_runtime( + struct kbox_rewrite_runtime *runtime) +{ + __atomic_store_n(&active_rewrite_runtime, runtime, __ATOMIC_RELEASE); +} + +static void write_le32(unsigned char out[4], uint32_t value); +static int rewrite_is_wrapper_site(const struct kbox_rewrite_origin_map *map, + uint64_t origin_addr); +static uint32_t wrapper_family_mask_for_nr(const struct kbox_host_nrs *host_nrs, + uint64_t nr); +static int planned_site_matches_wrapper_candidate( + const struct kbox_rewrite_planned_site *planned, + const struct kbox_rewrite_wrapper_candidate *candidate); + +#if defined(__aarch64__) +extern char kbox_syscall_rewrite_aarch64_entry[]; +extern char kbox_syscall_rewrite_aarch64_cancel_entry[]; +extern int64_t kbox_syscall_rewrite_aarch64_dispatch(uint64_t origin_addr, + uint64_t nr, + uint64_t a0, + uint64_t a1, + uint64_t a2, + uint64_t a3, + uint64_t a4, + uint64_t a5); + +__asm__( + ".text\n" + ".globl kbox_syscall_rewrite_aarch64_entry\n" + ".type kbox_syscall_rewrite_aarch64_entry,%function\n" + "kbox_syscall_rewrite_aarch64_entry:\n" + "sub sp, sp, #144\n" + "stp x1, x2, [sp, #0]\n" + "stp x3, x4, [sp, #16]\n" + "stp x5, x6, [sp, #32]\n" + "stp x7, x8, [sp, #48]\n" + "stp x9, x10, [sp, #64]\n" + "stp x11, x12, [sp, #80]\n" + "stp x13, x14, [sp, #96]\n" + "stp x15, x18, [sp, #112]\n" + "stp x19, x30, [sp, #128]\n" + "mov x19, x17\n" + "mov x2, x0\n" + "ldr x3, [sp, #0]\n" + "ldr x4, [sp, #8]\n" + "ldr x5, [sp, #16]\n" + "ldr x6, [sp, #24]\n" + "ldr x7, [sp, #32]\n" + "ldr x1, [sp, #56]\n" + "mov x0, x19\n" + "bl kbox_syscall_rewrite_aarch64_dispatch\n" + "add x16, x19, #4\n" + "ldp x1, x2, [sp, #0]\n" + "ldp x3, x4, [sp, #16]\n" + "ldp x5, x6, [sp, #32]\n" + "ldp x7, x8, [sp, #48]\n" + "ldp x9, x10, [sp, #64]\n" + "ldp x11, x12, [sp, #80]\n" + "ldp x13, x14, [sp, #96]\n" + "ldp x15, x18, [sp, #112]\n" + "ldp x19, x30, [sp, #128]\n" + "add sp, sp, #144\n" + "br 
x16\n" + ".size kbox_syscall_rewrite_aarch64_entry, " + ".-kbox_syscall_rewrite_aarch64_entry\n"); + +__asm__( + ".text\n" + ".globl kbox_syscall_rewrite_aarch64_cancel_entry\n" + ".type kbox_syscall_rewrite_aarch64_cancel_entry,%function\n" + "kbox_syscall_rewrite_aarch64_cancel_entry:\n" + "sub sp, sp, #144\n" + "stp x1, x2, [sp, #0]\n" + "stp x3, x4, [sp, #16]\n" + "stp x5, x6, [sp, #32]\n" + "stp x7, x8, [sp, #48]\n" + "stp x9, x10, [sp, #64]\n" + "stp x11, x12, [sp, #80]\n" + "stp x13, x14, [sp, #96]\n" + "stp x15, x18, [sp, #112]\n" + "stp x19, x30, [sp, #128]\n" + "mov x19, x17\n" + "mov x2, x0\n" + "ldr x3, [sp, #0]\n" + "ldr x4, [sp, #8]\n" + "ldr x5, [sp, #16]\n" + "ldr x6, [sp, #24]\n" + "ldr x7, [sp, #32]\n" + "ldr x1, [sp, #40]\n" + "mov x0, x19\n" + "bl kbox_syscall_rewrite_aarch64_dispatch\n" + "add x16, x19, #4\n" + "ldp x1, x2, [sp, #0]\n" + "ldp x3, x4, [sp, #16]\n" + "ldp x5, x6, [sp, #32]\n" + "ldp x7, x8, [sp, #48]\n" + "ldp x9, x10, [sp, #64]\n" + "ldp x11, x12, [sp, #80]\n" + "ldp x13, x14, [sp, #96]\n" + "ldp x15, x18, [sp, #112]\n" + "ldp x19, x30, [sp, #128]\n" + "add sp, sp, #144\n" + "br x16\n" + ".size kbox_syscall_rewrite_aarch64_cancel_entry, " + ".-kbox_syscall_rewrite_aarch64_cancel_entry\n"); +#endif + +#if defined(__x86_64__) +extern char kbox_syscall_rewrite_x86_64_entry[]; +extern char kbox_syscall_rewrite_x86_64_wrapper_entry[]; +extern int64_t kbox_syscall_rewrite_x86_64_dispatch(uint64_t origin_addr, + uint64_t nr, + const uint64_t *args); + +__asm__( + ".text\n" + ".globl kbox_syscall_rewrite_x86_64_entry\n" + ".type kbox_syscall_rewrite_x86_64_entry,@function\n" + "kbox_syscall_rewrite_x86_64_entry:\n" + "mov (%rsp), %r11\n" + "sub $56, %rsp\n" + "mov %rdi, 8(%rsp)\n" + "mov %rsi, 16(%rsp)\n" + "mov %rdx, 24(%rsp)\n" + "mov %r10, 32(%rsp)\n" + "mov %r8, 40(%rsp)\n" + "mov %r9, 48(%rsp)\n" + "mov %r11, %rdi\n" + "mov %rax, %rsi\n" + "lea 8(%rsp), %rdx\n" + "call kbox_syscall_rewrite_x86_64_dispatch\n" + "mov 8(%rsp), %rdi\n" 
#ifndef X86_64_WRAPPER_SITE_LEN
#define X86_64_WRAPPER_SITE_LEN 8u
#endif

/* Recognize the canonical 8-byte syscall wrapper pattern at `offset` inside a
 * mapped segment image:
 *
 *     b8 imm32    mov eax, <nr>
 *     0f 05       syscall        (or 0f 34, sysenter)
 *     c3          ret
 *
 * Returns 1 when the pattern matches, 0 otherwise (including NULL input or a
 * site that would run past file_size).
 *
 * Fix: the previous bounds check `offset + X86_64_WRAPPER_SITE_LEN >
 * file_size` could wrap for an `offset` near SIZE_MAX and wrongly accept the
 * site, leading to an out-of-bounds read.  The subtraction form below cannot
 * overflow. */
static int x86_64_is_wrapper_site(const unsigned char *segment_bytes,
                                  size_t file_size,
                                  size_t offset)
{
    if (!segment_bytes || file_size < X86_64_WRAPPER_SITE_LEN ||
        offset > file_size - X86_64_WRAPPER_SITE_LEN)
        return 0;
    return segment_bytes[offset] == 0xb8 && segment_bytes[offset + 5] == 0x0f &&
           (segment_bytes[offset + 6] == 0x05 ||
            segment_bytes[offset + 6] == 0x34) &&
           segment_bytes[offset + 7] == 0xc3;
}

/* Decode the imm32 operand of the wrapper's leading `mov eax, imm32`
 * (little-endian bytes 1..4 of the 8-byte site) -- i.e. the syscall number
 * the wrapper invokes.  Caller must have validated the site with
 * x86_64_is_wrapper_site() first. */
static uint32_t x86_64_wrapper_syscall_nr(const unsigned char original[8])
{
    return (uint32_t) original[1] | ((uint32_t) original[2] << 8) |
           ((uint32_t) original[3] << 16) | ((uint32_t) original[4] << 24);
}
/* Decode `movz x<reg>, #imm16` / `movz w<reg>, #imm16` (shift 0).
 *
 * The mask 0xffe0001f keeps the opcode bits plus the Rd field, so the
 * comparison accepts exactly the 64-bit (0xd2800000) and 32-bit (0x52800000)
 * MOVZ forms targeting the requested register.
 *
 * On match, stores the 16-bit immediate (insn bits 5..20) in *imm_out and
 * returns 0; returns -1 for a NULL out-pointer, an out-of-range register, or
 * any other instruction. */
static int aarch64_movz_reg_imm16(uint32_t insn,
                                  unsigned reg,
                                  uint32_t *imm_out)
{
    uint32_t opcode_and_rd;

    if (!imm_out || reg > 31)
        return -1;

    opcode_and_rd = insn & 0xffe0001fu;
    if (opcode_and_rd != (0xd2800000u | reg) &&
        opcode_and_rd != (0x52800000u | reg)) {
        return -1;
    }

    *imm_out = (insn >> 5) & 0xffffu;
    return 0;
}
(insn >> 5) & 0xffffu; + return 0; + } + + /* movz w8, #imm16 */ + if ((insn & 0xffe0001fu) == 0x52800008u) { + *nr_out = (insn >> 5) & 0xffffu; + return 0; + } + + return -1; +} + +static int encode_aarch64_virtual_procinfo_patch( + const struct kbox_rewrite_site *site, + const unsigned char *image, + size_t image_len, + struct kbox_rewrite_patch *patch) +{ + uint32_t nr; + uint32_t value; + uint32_t next_insn; + size_t site_off; + uint32_t movz_x0; + + if (!site || !image || !patch || site->width != 4) + return -1; + if (site->original[0] != 0x01 || site->original[1] != 0x00 || + site->original[2] != 0x00 || site->original[3] != 0xd4) { + return -1; + } + + site_off = (size_t) site->file_offset; + if (site_off + 8 > image_len) + return -1; + + next_insn = (uint32_t) image[site_off + 4] | + ((uint32_t) image[site_off + 5] << 8) | + ((uint32_t) image[site_off + 6] << 16) | + ((uint32_t) image[site_off + 7] << 24); + if (next_insn != 0xd65f03c0u) + return -1; + + if (aarch64_prev_insn_syscall_nr(image, image_len, site_off, &nr) < 0) + return -1; + + if (nr == (uint32_t) HOST_NRS_AARCH64.getpid || + nr == (uint32_t) HOST_NRS_AARCH64.gettid) { + value = 1; + } else if (nr == (uint32_t) HOST_NRS_AARCH64.getppid) { + value = 0; + } else { + return -1; + } + + movz_x0 = 0xd2800000u | ((value & 0xffffu) << 5); + patch->width = 4; + write_le32(patch->bytes, movz_x0); + return 0; +} + +static int encode_virtual_procinfo_patch(const struct kbox_rewrite_site *site, + const unsigned char *image, + size_t image_len, + enum kbox_rewrite_arch arch, + struct kbox_rewrite_patch *patch) +{ + switch (arch) { + case KBOX_REWRITE_ARCH_X86_64: + return encode_x86_64_virtual_procinfo_patch(site, patch); + case KBOX_REWRITE_ARCH_AARCH64: + return encode_aarch64_virtual_procinfo_patch(site, image, image_len, + patch); + default: + return -1; + } +} + +static int rewrite_read_fd_all(int fd, unsigned char **buf_out, size_t *len_out) +{ + struct stat st; + unsigned char *buf; + size_t len; + 
size_t total = 0;

    if (fd < 0 || !buf_out || !len_out)
        return -1;
    if (fstat(fd, &st) < 0 || st.st_size < 0)
        return -1;

    len = (size_t) st.st_size;
    if (len == 0)
        return -1;

    /* Anonymous mapping rather than malloc: the caller unmaps it, and the
     * size comes from fstat so it can be large. */
    buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
               -1, 0);
    if (buf == MAP_FAILED)
        return -1;

    /* Full pread loop: retry on EINTR, fail on short file (EOF before
     * st_size bytes means the file changed underneath us). */
    while (total < len) {
        ssize_t nr = pread(fd, buf + total, len - total, (off_t) total);

        if (nr < 0) {
            if (errno == EINTR)
                continue;
            munmap(buf, len);
            return -1;
        }
        if (nr == 0) {
            munmap(buf, len);
            return -1;
        }
        total += (size_t) nr;
    }

    *buf_out = buf;
    *len_out = len;
    return 0;
}

/* Round value up to a power-of-two alignment.  Returns 0 on a non-power-of-
 * two alignment or on overflow — callers must treat 0 as failure (see
 * kbox_rewrite_init_trampoline_layout).
 */
static uint64_t align_up_u64_or_zero(uint64_t value, uint64_t align)
{
    uint64_t mask;

    if (align == 0 || (align & (align - 1)) != 0)
        return 0;
    mask = align - 1;
    if (value > UINT64_MAX - mask)
        return 0;
    return (value + mask) & ~mask;
}

/* Compute the return address ("origin") a trampoline must come back to for
 * a given site.  For a bare 2-byte syscall the origin is the next
 * instruction (vaddr+2); for 8-byte wrapper sites, aarch64 SVC sites, and
 * already-patched aarch64 B/BL sites it is the site address itself.
 * Returns 0 on success; -1 with errno=EINVAL for unrecognized sites.
 */
static int rewrite_origin_addr(const struct kbox_rewrite_site *site,
                               uint64_t *origin_out)
{
    uint64_t origin;
    uint32_t insn;

    if (!site || !origin_out)
        return -1;

    /* Bare syscall/sysenter: resume after the 2-byte instruction. */
    if (site->width == 2 && site->original[0] == 0x0f &&
        (site->original[1] == 0x05 || site->original[1] == 0x34)) {
        if (__builtin_add_overflow(site->vaddr, 2u, &origin))
            return -1;
        *origin_out = origin;
        return 0;
    }

    /* 8-byte x86-64 wrapper site (B8 imm32 0F 05|34 C3). */
    if (site->width == X86_64_WRAPPER_SITE_LEN && site->original[0] == 0xb8 &&
        site->original[5] == 0x0f &&
        (site->original[6] == 0x05 || site->original[6] == 0x34) &&
        site->original[7] == 0xc3) {
        *origin_out = site->vaddr;
        return 0;
    }

    /* aarch64 SVC #0 (01 00 00 D4 little-endian). */
    if (site->width == 4 && site->original[0] == 0x01 &&
        site->original[1] == 0x00 && site->original[2] == 0x00 &&
        site->original[3] == 0xd4) {
        *origin_out = site->vaddr;
        return 0;
    }

    /* aarch64 B (0x14......) or BL (0x94......) — a site that has already
     * been patched to branch into a trampoline. */
    if (site->width == 4) {
        insn = (uint32_t) site->original[0] |
               ((uint32_t) site->original[1] << 8) |
               ((uint32_t) site->original[2] << 16) |
               ((uint32_t) site->original[3] << 24);
        if ((insn & 0xfc000000u) == 0x14000000u ||
            (insn & 0xfc000000u) ==
0x94000000u) { + *origin_out = site->vaddr; + return 0; + } + } + + errno = EINVAL; + return -1; +} + +/* Site classification for caller-aware rewrite dispatch. + * + * x86-64 wrapper pattern: the 8-byte sequence "mov $NR, %eax; syscall; ret" + * (B8 xx xx xx xx 0F 05 C3) is always WRAPPER: the function's only purpose + * is to execute the syscall and return the result. This is the musl __syscall0 + * pattern and covers the vast majority of libc syscall sites. + * + * For 2-byte syscall sites (bare 0F 05), we look at the byte immediately + * after the syscall: if it is C3 (ret) or 0F 1F (NOP), the site is inside a + * leaf wrapper. Otherwise (conditional jump, further computation, another + * syscall) the site is COMPLEX. + * + * aarch64 wrapper pattern: SVC #0 followed by RET (D65F03C0) within the next + * 2 instructions (8 bytes). If the ret is immediate or separated only by a + * MOV (return value adjustment), it's WRAPPER. + */ +enum kbox_rewrite_site_class kbox_rewrite_classify_x86_64_site( + const unsigned char *segment_bytes, + size_t segment_size, + size_t site_offset, + unsigned char site_width) +{ + if (!segment_bytes) + return KBOX_REWRITE_SITE_UNKNOWN; + + /* 8-byte wrapper: mov $NR, %eax; syscall; ret → always WRAPPER. */ + if (site_width == X86_64_WRAPPER_SITE_LEN) { + if (site_offset + X86_64_WRAPPER_SITE_LEN <= segment_size && + segment_bytes[site_offset] == 0xb8 && + segment_bytes[site_offset + 5] == 0x0f && + (segment_bytes[site_offset + 6] == 0x05 || + segment_bytes[site_offset + 6] == 0x34) && + segment_bytes[site_offset + 7] == 0xc3) { + return KBOX_REWRITE_SITE_WRAPPER; + } + return KBOX_REWRITE_SITE_UNKNOWN; + } + + /* 2-byte syscall (0F 05): check what follows. */ + if (site_width == 2) { + size_t after = site_offset + 2; + if (after >= segment_size) + return KBOX_REWRITE_SITE_UNKNOWN; + + /* Immediate ret after syscall → wrapper. 
*/ + if (segment_bytes[after] == 0xc3) + return KBOX_REWRITE_SITE_WRAPPER; + + /* "cmp $0xfffffffffffff001, %rax; jae " is the musl + * __syscall_ret pattern: syscall; cmp; jae; ret. Still a wrapper + * (the error path does not feed the result into another syscall). + * Pattern: 48 3d 01 f0 ff ff (6 bytes for CMP rax, -4095) + */ + if (after + 6 <= segment_size && segment_bytes[after] == 0x48 && + segment_bytes[after + 1] == 0x3d && + segment_bytes[after + 2] == 0x01 && + segment_bytes[after + 3] == 0xf0 && + segment_bytes[after + 4] == 0xff && + segment_bytes[after + 5] == 0xff) { + return KBOX_REWRITE_SITE_WRAPPER; + } + + /* NOP after syscall (alignment padding) then check next. */ + if (segment_bytes[after] == 0x90) { + if (after + 1 < segment_size && segment_bytes[after + 1] == 0xc3) + return KBOX_REWRITE_SITE_WRAPPER; + } + + /* Anything else: inside a complex function. */ + return KBOX_REWRITE_SITE_COMPLEX; + } + + return KBOX_REWRITE_SITE_UNKNOWN; +} + +enum kbox_rewrite_site_class kbox_rewrite_classify_aarch64_site( + const unsigned char *segment_bytes, + size_t segment_size, + size_t site_offset) +{ + uint32_t next_insn; + + if (!segment_bytes) + return KBOX_REWRITE_SITE_UNKNOWN; + + /* SVC #0 is 4 bytes. Check the instruction(s) that follow. */ + if (site_offset + 8 > segment_size) + return KBOX_REWRITE_SITE_UNKNOWN; + + /* Read the next instruction (little-endian). */ + next_insn = (uint32_t) segment_bytes[site_offset + 4] | + ((uint32_t) segment_bytes[site_offset + 5] << 8) | + ((uint32_t) segment_bytes[site_offset + 6] << 16) | + ((uint32_t) segment_bytes[site_offset + 7] << 24); + + /* RET (D65F03C0) immediately after SVC → wrapper. */ + if (next_insn == 0xd65f03c0u) + return KBOX_REWRITE_SITE_WRAPPER; + + /* CMN x0, #0xFFF (musl __syscall_ret error check): + * Encoding: B1 00 3C 1F (CMN x0, #0xf, LSL #12) or + * 31 00 10 01 (CMN x0, #4, common variant) + * If followed by B.HI/B.CS → wrapper pattern. 
+ * Be conservative: check for a common NEG/MOV + RET within 2 insns. + */ + if (site_offset + 12 <= segment_size) { + uint32_t insn2 = (uint32_t) segment_bytes[site_offset + 8] | + ((uint32_t) segment_bytes[site_offset + 9] << 8) | + ((uint32_t) segment_bytes[site_offset + 10] << 16) | + ((uint32_t) segment_bytes[site_offset + 11] << 24); + uint32_t insn3 = 0; + uint32_t insn4 = 0; + + /* NEG x0, x0 (CB0003E0) followed by RET: error path wrapper. */ + if (next_insn == 0xCB0003E0u && insn2 == 0xd65f03c0u) + return KBOX_REWRITE_SITE_WRAPPER; + + /* Second instruction is RET: wrapper (with one intermediate insn). */ + if (insn2 == 0xd65f03c0u) + return KBOX_REWRITE_SITE_WRAPPER; + + /* CMN x0/w0, #imm followed by conditional branch: error-check + * wrapper. + * CMN x0 encoding: 0xB100xxxx where bits [21:10] are imm12. + * CMN w0 encoding: 0x3100xxxx with the same immediate layout. + * Then B.cond: 0x54xxxxxx. + */ + if (((next_insn & 0xFF000000u) == 0xB1000000u || + (next_insn & 0xFF000000u) == 0x31000000u) && + (insn2 & 0xFF000000u) == 0x54000000u) { + return KBOX_REWRITE_SITE_WRAPPER; + } + + if (site_offset + 20 <= segment_size) { + insn3 = (uint32_t) segment_bytes[site_offset + 12] | + ((uint32_t) segment_bytes[site_offset + 13] << 8) | + ((uint32_t) segment_bytes[site_offset + 14] << 16) | + ((uint32_t) segment_bytes[site_offset + 15] << 24); + insn4 = (uint32_t) segment_bytes[site_offset + 16] | + ((uint32_t) segment_bytes[site_offset + 17] << 8) | + ((uint32_t) segment_bytes[site_offset + 18] << 16) | + ((uint32_t) segment_bytes[site_offset + 19] << 24); + + /* musl __internal_syscall_cancel epilogue: + * svc #0 + * ldr x19, [sp, #imm] + * ldp x29, x30, [sp], #imm + * autiasp + * ret + */ + if (next_insn == 0xF9400BF3u && insn2 == 0xA8C27BFDu && + insn3 == 0xD50323BFu && insn4 == 0xD65F03C0u) { + return KBOX_REWRITE_SITE_WRAPPER; + } + } + } + + /* Anything else: complex function. 
 */
    return KBOX_REWRITE_SITE_COMPLEX;
}

/* Look up the classification recorded for origin_addr in the origin map.
 * Returns 0 and sets *out on success, -1 when the address is unknown.
 */
int kbox_rewrite_origin_map_find_class(
    const struct kbox_rewrite_origin_map *map,
    uint64_t origin_addr,
    enum kbox_rewrite_site_class *out)
{
    struct kbox_rewrite_origin_entry entry;

    if (!map || !out)
        return -1;
    if (!kbox_rewrite_origin_map_find(map, origin_addr, &entry))
        return -1;
    *out = entry.site_class;
    return 0;
}

/* Decide whether the rewrite fast path may handle a process-info syscall.
 *
 * WRAPPER sites can safely return virtualized PID values directly without
 * going through the full dispatch machinery: the result goes straight to
 * the caller and is not consumed internally by signal/thread helpers.
 *
 * Returns nonzero when nr is one of the zero-argument process-info
 * syscalls AND the site at origin_addr is a WRAPPER site; 0 when the
 * syscall must go through full dispatch.
 *
 * kbox virtualizes: getpid=1, gettid=1, getppid=0.  These must match the
 * values in kbox_dispatch_request() to maintain PID model consistency.
 */
int kbox_rewrite_is_site_fast_eligible(
    const struct kbox_rewrite_origin_map *map,
    uint64_t origin_addr,
    const struct kbox_host_nrs *host_nrs,
    uint64_t nr)
{
    if (!map || !host_nrs)
        return 0;

    /* Only zero-argument process-info syscalls are eligible. */
    if ((int) nr != host_nrs->getpid && (int) nr != host_nrs->getppid &&
        (int) nr != host_nrs->gettid) {
        return 0;
    }

    return rewrite_is_wrapper_site(map, origin_addr);
}

/* Return the virtualized value for a fast-path-eligible process-info syscall.
 * Must match the values produced by kbox_dispatch_request() for the same
 * syscall numbers (getpid=1, gettid=1, getppid=0).
 */
static int64_t rewrite_fast_procinfo_value(const struct kbox_host_nrs *host_nrs,
                                           uint64_t nr)
{
    if ((int) nr == host_nrs->getpid || (int) nr == host_nrs->gettid)
        return 1;
    if ((int) nr == host_nrs->getppid)
        return 0;
    return -ENOSYS; /* Should not be reached; caller checked eligibility. */
}

/* Compute the trampoline-area layout for one executable segment: slots are
 * placed immediately after the segment end, 16-byte aligned, with a
 * per-architecture slot size.  Returns 0 on success, -1 on overflow or an
 * unsupported architecture.
 */
int kbox_rewrite_init_trampoline_layout(
    enum kbox_rewrite_arch arch,
    const struct kbox_elf_exec_segment *seg,
    struct kbox_rewrite_trampoline_layout *layout)
{
    uint64_t seg_end;

    if (!seg || !layout)
        return -1;

    memset(layout, 0, sizeof(*layout));
    layout->arch = arch;

    switch (arch) {
    case KBOX_REWRITE_ARCH_X86_64:
        if (__builtin_add_overflow(seg->vaddr, seg->mem_size, &seg_end))
            return -1;
        /* align_up_u64_or_zero() returns 0 on overflow; 0 is only a
         * legitimate base when the segment itself ends at 0. */
        layout->base_addr = align_up_u64_or_zero(seg_end, 16);
        if (layout->base_addr == 0 && seg_end != 0)
            return -1;
        layout->slot_size = X86_64_REWRITE_WRAPPER_SLOT_SIZE;
        return 0;
    case KBOX_REWRITE_ARCH_AARCH64:
        if (__builtin_add_overflow(seg->vaddr, seg->mem_size, &seg_end))
            return -1;
        layout->base_addr = align_up_u64_or_zero(seg_end, 16);
        if (layout->base_addr == 0 && seg_end != 0)
            return -1;
        layout->slot_size = AARCH64_REWRITE_SLOT_SIZE;
        return 0;
    default:
        return -1;
    }
}

/* Plan one rewrite site: derive its trampoline slot address from the layout
 * (base + slot_index * slot_size, overflow-checked) and encode the in-place
 * patch.  Returns 0 on success, -1 on bad arguments or overflow.
 */
int kbox_rewrite_plan_site(const struct kbox_rewrite_site *site,
                           const struct kbox_rewrite_trampoline_layout *layout,
                           size_t slot_index,
                           struct kbox_rewrite_planned_site *planned)
{
    uint64_t trampoline_addr;
    uint64_t slot_offset;

    if (!site || !layout || !planned || site->segment_mem_size == 0)
        return -1;

    memset(planned, 0, sizeof(*planned));
    planned->site = *site;

    if (layout->arch == KBOX_REWRITE_ARCH_X86_64 ||
        layout->arch == KBOX_REWRITE_ARCH_AARCH64) {
        if (layout->slot_size == 0 ||
            __builtin_mul_overflow((uint64_t) slot_index, layout->slot_size,
                                   &slot_offset) ||
            __builtin_add_overflow(layout->base_addr, slot_offset,
                                   &trampoline_addr))
            return -1;
    } else {
        return -1;
    }

    if (kbox_rewrite_encode_patch(site, trampoline_addr, &planned->patch) < 0) {
        /* For aarch64 SVC sites where the B offset exceeds ±128MB, mark
         * the patch as deferred (width=0) rather than failing. The runtime
         * install path will allocate a veneer to bridge the gap.
+ */ + if (site->width == 4 && site->original[0] == 0x01 && + site->original[1] == 0x00 && site->original[2] == 0x00 && + site->original[3] == 0xd4) { + memset(&planned->patch, 0, sizeof(planned->patch)); + } else { + return -1; + } + } + planned->trampoline_addr = trampoline_addr; + return 0; +} + +static int analyze_segment(const struct kbox_elf_exec_segment *seg, + const unsigned char *segment_bytes, + void *opaque) +{ + struct analyze_ctx *ctx = opaque; + struct kbox_rewrite_trampoline_layout layout; + struct kbox_rewrite_site site; + struct kbox_rewrite_planned_site planned; + size_t slot_index = 0; + ctx->segments++; + + if (ctx->planned_cb && + kbox_rewrite_init_trampoline_layout(ctx->arch, seg, &layout) < 0) + return -1; + + if (ctx->arch == KBOX_REWRITE_ARCH_X86_64) { + if (seg->file_size < 2) + return 0; + /* Walk instruction boundaries using the length decoder. + * Only match syscall/sysenter at true instruction starts, + * never inside immediates/displacements of longer encodings. + */ + for (size_t i = 0; i < seg->file_size;) { + int insn_len = + kbox_x86_insn_length(segment_bytes + i, seg->file_size - i); + if (insn_len <= 0) { + /* Unknown instruction; skip one byte and resync. + * This is safe: we may miss a syscall in truly + * unknown code, but we won't corrupt anything. + */ + i++; + continue; + } + + /* Check 8-byte wrapper pattern: B8 imm32 0F05/0F34 C3. + * The length decoder returns 5 for the MOV instruction, so we + * must peek ahead at the next 3 bytes to detect the full + * 8-byte wrapper sequence (3 instructions: MOV + syscall + RET). 
+ */ + if (insn_len == 5 && + x86_64_is_wrapper_site(segment_bytes, seg->file_size, i)) { + memset(&site, 0, sizeof(site)); + site.file_offset = seg->file_offset + i; + site.vaddr = seg->vaddr + i; + site.segment_vaddr = seg->vaddr; + site.segment_mem_size = seg->mem_size; + site.width = X86_64_WRAPPER_SITE_LEN; + memcpy(site.original, segment_bytes + i, site.width); + site.site_class = kbox_rewrite_classify_x86_64_site( + segment_bytes, seg->file_size, i, X86_64_WRAPPER_SITE_LEN); + if (ctx->cb && ctx->cb(&site, ctx->opaque) < 0) + return -1; + if (ctx->planned_cb) { + if (kbox_rewrite_plan_site(&site, &layout, slot_index, + &planned) < 0) + return -1; + if (ctx->planned_cb(&planned, ctx->opaque) < 0) + return -1; + slot_index++; + } + ctx->candidates++; + i += X86_64_WRAPPER_SITE_LEN; + continue; + } + + /* Check 2-byte syscall (0F 05) or sysenter (0F 34) at + * a true instruction boundary. */ + if (insn_len == 2 && segment_bytes[i] == 0x0f && + (segment_bytes[i + 1] == 0x05 || + segment_bytes[i + 1] == 0x34)) { + memset(&site, 0, sizeof(site)); + site.file_offset = seg->file_offset + i; + site.vaddr = seg->vaddr + i; + site.segment_vaddr = seg->vaddr; + site.segment_mem_size = seg->mem_size; + site.width = 2; + site.original[0] = segment_bytes[i]; + site.original[1] = segment_bytes[i + 1]; + site.site_class = kbox_rewrite_classify_x86_64_site( + segment_bytes, seg->file_size, i, 2); + if (ctx->cb && ctx->cb(&site, ctx->opaque) < 0) + return -1; + if (ctx->planned_cb) { + if (kbox_rewrite_plan_site(&site, &layout, slot_index, + &planned) < 0) + return -1; + if (ctx->planned_cb(&planned, ctx->opaque) < 0) + return -1; + slot_index++; + } + ctx->candidates++; + } + + i += (size_t) insn_len; + } + return 0; + } + + if (ctx->arch == KBOX_REWRITE_ARCH_AARCH64) { + for (size_t i = 0; i + 3 < seg->file_size; i += 4) { + if (segment_bytes[i] != 0x01 || segment_bytes[i + 1] != 0x00 || + segment_bytes[i + 2] != 0x00 || segment_bytes[i + 3] != 0xd4) + continue; + 
memset(&site, 0, sizeof(site));
            site.file_offset = seg->file_offset + i;
            site.vaddr = seg->vaddr + i;
            site.segment_vaddr = seg->vaddr;
            site.segment_mem_size = seg->mem_size;
            site.width = 4;
            memcpy(site.original, segment_bytes + i, 4);
            site.site_class = kbox_rewrite_classify_aarch64_site(
                segment_bytes, seg->file_size, i);
            if (ctx->cb && ctx->cb(&site, ctx->opaque) < 0)
                return -1;
            if (ctx->planned_cb) {
                if (kbox_rewrite_plan_site(&site, &layout, slot_index,
                                           &planned) < 0)
                    return -1;
                if (ctx->planned_cb(&planned, ctx->opaque) < 0)
                    return -1;
                slot_index++;
            }
            ctx->candidates++;
        }
        return 0;
    }

    /* Unsupported rewrite architecture. */
    return -1;
}

/* Human-readable name of a syscall interception mode (CLI/diagnostics). */
const char *kbox_syscall_mode_name(enum kbox_syscall_mode mode)
{
    switch (mode) {
    case KBOX_SYSCALL_MODE_SECCOMP:
        return "seccomp";
    case KBOX_SYSCALL_MODE_TRAP:
        return "trap";
    case KBOX_SYSCALL_MODE_REWRITE:
        return "rewrite";
    case KBOX_SYSCALL_MODE_AUTO:
        return "auto";
    }
    /* No default case above, so -Wswitch flags newly added enum values;
     * out-of-range input falls through to here. */
    return "unknown";
}

/* Parse a --syscall-mode option value.  Returns 0 and sets *out on
 * success, -1 on NULL arguments or an unrecognized mode string.
 */
int kbox_parse_syscall_mode(const char *value, enum kbox_syscall_mode *out)
{
    if (!value || !out)
        return -1;

    if (strcmp(value, "seccomp") == 0)
        *out = KBOX_SYSCALL_MODE_SECCOMP;
    else if (strcmp(value, "trap") == 0)
        *out = KBOX_SYSCALL_MODE_TRAP;
    else if (strcmp(value, "rewrite") == 0)
        *out = KBOX_SYSCALL_MODE_REWRITE;
    else if (strcmp(value, "auto") == 0)
        *out = KBOX_SYSCALL_MODE_AUTO;
    else
        return -1;

    return 0;
}

/* Human-readable name of a rewrite target architecture. */
const char *kbox_rewrite_arch_name(enum kbox_rewrite_arch arch)
{
    switch (arch) {
    case KBOX_REWRITE_ARCH_X86_64:
        return "x86_64";
    case KBOX_REWRITE_ARCH_AARCH64:
        return "aarch64";
    default:
        return "unknown";
    }
}

/* Store a 32-bit value little-endian, independent of host byte order. */
static void write_le32(unsigned char out[4], uint32_t value)
{
    out[0] = (unsigned char) (value & 0xff);
    out[1] = (unsigned char) ((value >> 8) & 0xff);
    out[2] = (unsigned char) ((value >> 16) & 0xff);
    out[3] = (unsigned char) ((value >> 24) & 0xff);
}

/* Store a 64-bit value little-endian, independent of host byte order. */
static void write_le64(unsigned char out[8], uint64_t value)
{
    for
(int i = 0; i < 8; i++) + out[i] = (unsigned char) ((value >> (i * 8)) & 0xff); +} + +int kbox_rewrite_encode_patch(const struct kbox_rewrite_site *site, + uint64_t trampoline_addr, + struct kbox_rewrite_patch *patch) +{ + int64_t delta; + int64_t imm26; + uint32_t insn; + + if (!site || !patch) + return -1; + + memset(patch, 0, sizeof(*patch)); + + if (site->width == 2 && site->original[0] == 0x0f && + (site->original[1] == 0x05 || site->original[1] == 0x34)) { + patch->width = 2; + patch->bytes[0] = 0xff; + patch->bytes[1] = 0xd0; + return 0; + } + + if (site->width == X86_64_WRAPPER_SITE_LEN && site->original[0] == 0xb8 && + site->original[5] == 0x0f && + (site->original[6] == 0x05 || site->original[6] == 0x34) && + site->original[7] == 0xc3) { + int64_t rel32 = (int64_t) trampoline_addr - + (int64_t) (site->vaddr + X86_64_JMP_REL32_OPCODE_LEN); + + if (rel32 < INT32_MIN || rel32 > INT32_MAX) + return -1; + patch->width = X86_64_WRAPPER_SITE_LEN; + patch->bytes[0] = 0xe9; + write_le32(&patch->bytes[1], (uint32_t) (int32_t) rel32); + patch->bytes[5] = 0x90; + patch->bytes[6] = 0x90; + patch->bytes[7] = 0x90; + return 0; + } + + if (site->width == 4) { + if ((site->vaddr & 3u) != 0 || (trampoline_addr & 3u) != 0) + return -1; + delta = (int64_t) trampoline_addr - (int64_t) site->vaddr; + if (delta <= -AARCH64_B_RANGE || delta >= AARCH64_B_RANGE) + return -1; + if ((delta & 3) != 0) + return -1; + imm26 = delta >> 2; + if (imm26 < -(1 << 25) || imm26 > ((1 << 25) - 1)) + return -1; + insn = AARCH64_B_OPCODE | ((uint32_t) imm26 & AARCH64_B_IMM26_MASK); + patch->width = 4; + write_le32(patch->bytes, insn); + return 0; + } + + return -1; +} + +int kbox_rewrite_encode_x86_64_page_zero_trampoline(unsigned char *buf, + size_t buf_len, + uint64_t entry_addr) +{ + size_t tail_off; + + if (!buf || buf_len <= X86_64_PAGE_ZERO_TAIL_LEN) + return -1; + + memset(buf, 0x90, buf_len); + tail_off = buf_len - X86_64_PAGE_ZERO_TAIL_LEN; + buf[tail_off + 0] = 0x49; + buf[tail_off + 
1] = 0xbb; + write_le64(buf + tail_off + 2, entry_addr); + buf[tail_off + 10] = 0x41; + buf[tail_off + 11] = 0xff; + buf[tail_off + 12] = 0xe3; + return 0; +} + +int kbox_rewrite_probe_x86_64_page_zero( + uint64_t mmap_min_addr, + struct kbox_rewrite_trampoline_probe *probe) +{ + if (!probe) + return -1; + + memset(probe, 0, sizeof(*probe)); + probe->arch = KBOX_REWRITE_ARCH_X86_64; + probe->trampoline_addr = 0; + if (mmap_min_addr == 0) { + probe->feasible = 1; + probe->reason = "page-zero trampoline available"; + } else { + probe->feasible = 0; + probe->reason = "vm.mmap_min_addr must be 0 for x86_64 rewrite"; + } + return 0; +} + +static int write_x86_64_wrapper_trampoline(uint64_t trampoline_addr, + uint64_t origin_addr, + uint32_t nr) +{ +#if defined(__x86_64__) + unsigned char slot[X86_64_REWRITE_WRAPPER_SLOT_SIZE]; + + memset(slot, 0x90, sizeof(slot)); + slot[0] = 0xb8; + write_le32(&slot[1], nr); + slot[5] = 0x49; + slot[6] = 0xbb; + write_le64(&slot[7], origin_addr); + slot[15] = 0x49; + slot[16] = 0xba; + write_le64( + &slot[17], + (uint64_t) (uintptr_t) kbox_syscall_rewrite_x86_64_wrapper_entry); + slot[25] = 0x41; + slot[26] = 0xff; + slot[27] = 0xe2; + memcpy((void *) (uintptr_t) trampoline_addr, slot, sizeof(slot)); + return 0; +#else + (void) trampoline_addr; + (void) origin_addr; + (void) nr; + return -1; +#endif +} + +int kbox_rewrite_probe_trampoline(enum kbox_rewrite_arch arch, + struct kbox_rewrite_trampoline_probe *probe) +{ + if (!probe) + return -1; + + memset(probe, 0, sizeof(*probe)); + probe->arch = arch; + + switch (arch) { + case KBOX_REWRITE_ARCH_X86_64: + probe->feasible = 1; + probe->reason = "x86_64 uses wrapper trampolines on stock kernels"; + probe->trampoline_addr = 0; + return 0; + case KBOX_REWRITE_ARCH_AARCH64: + probe->feasible = 1; + probe->reason = "aarch64 uses relative branch trampolines"; + probe->trampoline_addr = 0; + return 0; + default: + probe->reason = "unsupported rewrite architecture"; + return -1; + } +} +int 
kbox_rewrite_analyze_elf(const unsigned char *buf,
                         size_t buf_len,
                         struct kbox_rewrite_report *report)
{
    /* Analysis only: visit with no callbacks, just fill the report. */
    return kbox_rewrite_visit_elf_sites(buf, buf_len, NULL, NULL, report);
}

/* Visit every planned rewrite site of an in-memory ELF image.
 * cb (may be NULL) is invoked once per planned site; report is always
 * zeroed and, on success, filled with the architecture and the
 * segment/candidate counts.  Returns 0 on success, -1 on bad arguments,
 * non-ELF input, or an unsupported machine (only EM_X86_64/EM_AARCH64).
 */
int kbox_rewrite_visit_elf_planned_sites(const unsigned char *buf,
                                         size_t buf_len,
                                         kbox_rewrite_planned_site_cb cb,
                                         void *opaque,
                                         struct kbox_rewrite_report *report)
{
    uint16_t machine = 0;
    struct analyze_ctx ctx;
    int rc;

    if (!buf || !report)
        return -1;

    memset(report, 0, sizeof(*report));

    if (kbox_elf_machine(buf, buf_len, &machine) < 0)
        return -1;

    memset(&ctx, 0, sizeof(ctx));
    ctx.cb = NULL;
    ctx.planned_cb = cb;
    ctx.opaque = opaque;
    if (machine == EM_X86_64)
        ctx.arch = KBOX_REWRITE_ARCH_X86_64;
    else if (machine == EM_AARCH64)
        ctx.arch = KBOX_REWRITE_ARCH_AARCH64;
    else
        return -1;

    rc = kbox_visit_elf_exec_segments(buf, buf_len, analyze_segment, &ctx);
    if (rc < 0)
        return -1;

    report->arch = ctx.arch;
    report->exec_segment_count = ctx.segments;
    report->candidate_count = ctx.candidates;
    return 0;
}

/* Adapts the site-level callback API onto the planned-site visitor. */
struct site_cb_adapter_ctx {
    kbox_rewrite_site_cb cb; /* user callback receiving plain sites */
    void *opaque;            /* forwarded user pointer */
};

/* Growable array of planned sites (capacity doubles, starts at 8). */
struct planned_site_array {
    struct kbox_rewrite_planned_site *sites;
    size_t count;
    size_t cap;
};

/* Growable array of raw sites (capacity doubles, starts at 16). */
struct site_array {
    struct kbox_rewrite_site *sites;
    size_t count;
    size_t cap;
};

/* Planned-site callback that forwards only the embedded plain site. */
static int site_cb_adapter(const struct kbox_rewrite_planned_site *planned,
                           void *opaque)
{
    struct site_cb_adapter_ctx *ctx = opaque;

    return ctx->cb(&planned->site, ctx->opaque);
}

/* Append one planned site to a planned_site_array, growing as needed.
 * Multiplication overflow in the allocation size is rejected, and the old
 * pointer is preserved on realloc failure.
 */
static int collect_planned_sites_array_cb(
    const struct kbox_rewrite_planned_site *planned,
    void *opaque)
{
    struct planned_site_array *array = opaque;
    struct kbox_rewrite_planned_site *sites;
    size_t new_cap;

    if (!array || !planned)
        return -1;
    if (array->count == array->cap) {
        size_t alloc_size;

        new_cap = array->cap ? array->cap * 2 : 8;
        if (new_cap < array->cap ||
            __builtin_mul_overflow(new_cap, sizeof(*sites), &alloc_size))
            return -1;
        sites = realloc(array->sites, alloc_size);
        if (!sites)
            return -1;
        array->sites = sites;
        array->cap = new_cap;
    }
    array->sites[array->count++] = *planned;
    return 0;
}

/* Release a site_array and reset it to the empty state. */
static void free_site_array(struct site_array *array)
{
    if (!array)
        return;
    free(array->sites);
    array->sites = NULL;
    array->count = 0;
    array->cap = 0;
}

/* Append one site to a site_array, growing as needed (same overflow and
 * realloc-failure discipline as collect_planned_sites_array_cb).
 */
static int collect_sites_array_cb(const struct kbox_rewrite_site *site,
                                  void *opaque)
{
    struct site_array *array = opaque;
    struct kbox_rewrite_site *new_sites;
    size_t new_cap;

    if (!array || !site)
        return -1;
    if (array->count == array->cap) {
        size_t alloc_size;

        new_cap = array->cap ? array->cap * 2 : 16;
        if (new_cap < array->cap ||
            __builtin_mul_overflow(new_cap, sizeof(*new_sites), &alloc_size))
            return -1;
        new_sites = realloc(array->sites, alloc_size);
        if (!new_sites)
            return -1;
        array->sites = new_sites;
        array->cap = new_cap;
    }

    array->sites[array->count++] = *site;
    return 0;
}

/* Release a planned_site_array and reset it to the empty state. */
static void free_planned_site_array(struct planned_site_array *array)
{
    if (!array)
        return;
    free(array->sites);
    array->sites = NULL;
    array->count = 0;
    array->cap = 0;
}

/* Site-level variant of the ELF visitor: wraps cb in the adapter and
 * delegates to the planned-site visitor.  A NULL cb degenerates to pure
 * analysis (report only).
 */
int kbox_rewrite_visit_elf_sites(const unsigned char *buf,
                                 size_t buf_len,
                                 kbox_rewrite_site_cb cb,
                                 void *opaque,
                                 struct kbox_rewrite_report *report)
{
    struct site_cb_adapter_ctx adapter;

    if (!cb)
        return kbox_rewrite_visit_elf_planned_sites(buf, buf_len, NULL, NULL,
                                                    report);

    adapter.cb = cb;
    adapter.opaque = opaque;
    return kbox_rewrite_visit_elf_planned_sites(buf, buf_len, site_cb_adapter,
                                                &adapter, report);
}

/* Maximum bytes to read for rewrite analysis. The analysis needs the ELF
 * header, program header table, and all PT_LOAD|PF_X segment contents.
+ * 16 MB covers any realistic executable/interpreter without risking OOM on + * multi-gigabyte guest binaries. Binaries larger than this cap are rejected + * gracefully (returns -1 with EFBIG). + */ +#define REWRITE_ANALYZE_MAX (16u * 1024 * 1024) +#define REWRITE_PHDR_MAX (256u * 1024) + +static ssize_t pwrite_full(int fd, + const unsigned char *buf, + size_t size, + off_t off) +{ + size_t total = 0; + + while (total < size) { + ssize_t nr = pwrite(fd, buf + total, size - total, off + (off_t) total); + + if (nr < 0) { + if (errno == EINTR) + continue; + return -1; + } + if (nr == 0) + break; + total += (size_t) nr; + } + + return (ssize_t) total; +} + +struct memfd_visit_ctx { + int fd; + struct analyze_ctx *analyze; + uint64_t total_bytes; +}; + +struct memfd_exec_segment_visit_ctx { + int fd; + uint64_t total_bytes; + int (*cb)(const struct kbox_elf_exec_segment *seg, + const unsigned char *segment_bytes, + void *opaque); + void *opaque; +}; + +static int read_segment_from_memfd(const struct kbox_elf_exec_segment *seg, + void *opaque) +{ + struct memfd_visit_ctx *ctx = opaque; + unsigned char *buf; + ssize_t nr; + int rc; + + if (seg->file_size == 0) + return 0; + if (__builtin_add_overflow(ctx->total_bytes, seg->file_size, + &ctx->total_bytes)) + return -1; + if (ctx->total_bytes > REWRITE_ANALYZE_MAX) { + errno = EFBIG; + return -1; + } + + buf = malloc((size_t) seg->file_size); + if (!buf) + return -1; + + nr = pread_full(ctx->fd, buf, (size_t) seg->file_size, + (off_t) seg->file_offset); + if (nr < 0 || (uint64_t) nr != seg->file_size) { + free(buf); + if (nr >= 0) + errno = EIO; + return -1; + } + + rc = analyze_segment(seg, buf, ctx->analyze); + free(buf); + return rc; +} + +static int read_segment_from_memfd_cb(const struct kbox_elf_exec_segment *seg, + void *opaque) +{ + struct memfd_exec_segment_visit_ctx *ctx = opaque; + unsigned char *buf; + ssize_t nr; + int rc; + + if (seg->file_size == 0) + return 0; + if (__builtin_add_overflow(ctx->total_bytes, 
seg->file_size, + &ctx->total_bytes)) { + return -1; + } + if (ctx->total_bytes > REWRITE_ANALYZE_MAX) { + errno = EFBIG; + return -1; + } + + buf = malloc((size_t) seg->file_size); + if (!buf) + return -1; + + nr = pread_full(ctx->fd, buf, (size_t) seg->file_size, + (off_t) seg->file_offset); + if (nr < 0 || (uint64_t) nr != seg->file_size) { + free(buf); + if (nr >= 0) + errno = EIO; + return -1; + } + + rc = ctx->cb(seg, buf, ctx->opaque); + free(buf); + return rc; +} + +static int visit_memfd_exec_segments( + int fd, + int (*cb)(const struct kbox_elf_exec_segment *seg, + const unsigned char *segment_bytes, + void *opaque), + void *opaque) +{ + off_t end; + unsigned char elf_hdr[64]; + unsigned char *buf; + uint64_t phoff; + uint16_t phentsize; + uint16_t phnum; + uint64_t ph_end; + ssize_t nr; + size_t size; + int rc; + struct memfd_exec_segment_visit_ctx visit; + + if (fd < 0 || !cb) + return -1; + + end = lseek(fd, 0, SEEK_END); + if (end <= 0) + return -1; + + nr = pread_full(fd, elf_hdr, sizeof(elf_hdr), 0); + if (nr < (ssize_t) sizeof(elf_hdr)) { + if (nr >= 0) + errno = EIO; + return -1; + } + + phoff = ((uint64_t) elf_hdr[32]) | ((uint64_t) elf_hdr[33] << 8) | + ((uint64_t) elf_hdr[34] << 16) | ((uint64_t) elf_hdr[35] << 24) | + ((uint64_t) elf_hdr[36] << 32) | ((uint64_t) elf_hdr[37] << 40) | + ((uint64_t) elf_hdr[38] << 48) | ((uint64_t) elf_hdr[39] << 56); + phentsize = (uint16_t) (elf_hdr[54] | ((uint16_t) elf_hdr[55] << 8)); + phnum = (uint16_t) (elf_hdr[56] | ((uint16_t) elf_hdr[57] << 8)); + if (__builtin_add_overflow(phoff, (uint64_t) phentsize * phnum, &ph_end)) + return -1; + if (ph_end > (uint64_t) end || ph_end > REWRITE_PHDR_MAX) { + errno = EFBIG; + return -1; + } + + size = (size_t) ph_end; + buf = malloc(size); + if (!buf) + return -1; + + nr = pread_full(fd, buf, size, 0); + if (nr < 0 || (size_t) nr != size) { + free(buf); + if (nr >= 0) + errno = EIO; + return -1; + } + + memset(&visit, 0, sizeof(visit)); + visit.fd = fd; + visit.cb = 
cb; + visit.opaque = opaque; + rc = kbox_visit_elf_exec_segment_headers( + buf, size, read_segment_from_memfd_cb, &visit); + free(buf); + return rc; +} + +int kbox_rewrite_analyze_memfd(int fd, struct kbox_rewrite_report *report) +{ + return kbox_rewrite_visit_memfd_sites(fd, NULL, NULL, report); +} + +int kbox_rewrite_visit_memfd_sites(int fd, + kbox_rewrite_site_cb cb, + void *opaque, + struct kbox_rewrite_report *report) +{ + struct site_cb_adapter_ctx adapter; + + if (!cb) + return kbox_rewrite_visit_memfd_planned_sites(fd, NULL, NULL, report); + + adapter.cb = cb; + adapter.opaque = opaque; + return kbox_rewrite_visit_memfd_planned_sites(fd, site_cb_adapter, &adapter, + report); +} + +int kbox_rewrite_visit_memfd_planned_sites(int fd, + kbox_rewrite_planned_site_cb cb, + void *opaque, + struct kbox_rewrite_report *report) +{ + off_t end; + unsigned char elf_hdr[64]; + unsigned char *buf; + uint64_t phoff; + uint16_t phentsize; + uint16_t phnum; + uint64_t ph_end; + ssize_t nr; + size_t size; + int rc; + uint16_t machine = 0; + struct analyze_ctx analyze; + struct memfd_visit_ctx visit; + + if (fd < 0 || !report) + return -1; + + end = lseek(fd, 0, SEEK_END); + if (end <= 0) + return -1; + + nr = pread_full(fd, elf_hdr, sizeof(elf_hdr), 0); + if (nr < (ssize_t) sizeof(elf_hdr)) { + if (nr >= 0) + errno = EIO; + return -1; + } + + if (kbox_elf_machine(elf_hdr, sizeof(elf_hdr), &machine) < 0) + return -1; + + phoff = ((uint64_t) elf_hdr[32]) | ((uint64_t) elf_hdr[33] << 8) | + ((uint64_t) elf_hdr[34] << 16) | ((uint64_t) elf_hdr[35] << 24) | + ((uint64_t) elf_hdr[36] << 32) | ((uint64_t) elf_hdr[37] << 40) | + ((uint64_t) elf_hdr[38] << 48) | ((uint64_t) elf_hdr[39] << 56); + phentsize = (uint16_t) (elf_hdr[54] | ((uint16_t) elf_hdr[55] << 8)); + phnum = (uint16_t) (elf_hdr[56] | ((uint16_t) elf_hdr[57] << 8)); + if (__builtin_add_overflow(phoff, (uint64_t) phentsize * phnum, &ph_end)) + return -1; + if (ph_end > (uint64_t) end || ph_end > REWRITE_PHDR_MAX) { + 
errno = EFBIG; + return -1; + } + + size = (size_t) ph_end; + buf = malloc(size); + if (!buf) + return -1; + + nr = pread_full(fd, buf, size, 0); + if (nr < 0 || (size_t) nr != size) { + free(buf); + if (nr >= 0) + errno = EIO; + return -1; + } + + memset(&analyze, 0, sizeof(analyze)); + analyze.cb = NULL; + analyze.planned_cb = cb; + analyze.opaque = opaque; + if (machine == EM_X86_64) + analyze.arch = KBOX_REWRITE_ARCH_X86_64; + else if (machine == EM_AARCH64) + analyze.arch = KBOX_REWRITE_ARCH_AARCH64; + else { + free(buf); + return -1; + } + + memset(&visit, 0, sizeof(visit)); + visit.fd = fd; + visit.analyze = &analyze; + rc = kbox_visit_elf_exec_segment_headers(buf, size, read_segment_from_memfd, + &visit); + free(buf); + + if (rc < 0) + return -1; + + memset(report, 0, sizeof(*report)); + report->arch = analyze.arch; + report->exec_segment_count = analyze.segments; + report->candidate_count = analyze.candidates; + return 0; +} + +int kbox_rewrite_apply_elf(unsigned char *buf, + size_t buf_len, + size_t *applied_count, + struct kbox_rewrite_report *report) +{ + struct planned_site_array array; + struct kbox_rewrite_report local_report; + size_t applied = 0; + int rc; + + if (!buf) + return -1; + + memset(&array, 0, sizeof(array)); + rc = kbox_rewrite_visit_elf_planned_sites( + buf, buf_len, collect_planned_sites_array_cb, &array, + report ? 
report : &local_report); + if (rc < 0) { + free_planned_site_array(&array); + return -1; + } + + for (size_t i = 0; i < array.count; i++) { + const struct kbox_rewrite_planned_site *planned = &array.sites[i]; + size_t off = (size_t) planned->site.file_offset; + size_t width = planned->patch.width; + + if (width == 0 || off > buf_len || width > buf_len - off) { + free_planned_site_array(&array); + return -1; + } + if (memcmp(buf + off, planned->site.original, width) != 0) { + free_planned_site_array(&array); + errno = EIO; + return -1; + } + memcpy(buf + off, planned->patch.bytes, width); + applied++; + } + + free_planned_site_array(&array); + if (applied_count) + *applied_count = applied; + return 0; +} + +int kbox_rewrite_apply_memfd(int fd, + size_t *applied_count, + struct kbox_rewrite_report *report) +{ + struct planned_site_array array; + struct kbox_rewrite_report local_report; + size_t applied = 0; + int rc; + + if (fd < 0) + return -1; + + memset(&array, 0, sizeof(array)); + rc = kbox_rewrite_visit_memfd_planned_sites( + fd, collect_planned_sites_array_cb, &array, + report ? 
report : &local_report); + if (rc < 0) { + free_planned_site_array(&array); + return -1; + } + + for (size_t i = 0; i < array.count; i++) { + const struct kbox_rewrite_planned_site *planned = &array.sites[i]; + unsigned char current[KBOX_REWRITE_MAX_PATCH_BYTES]; + size_t width = planned->patch.width; + off_t off = (off_t) planned->site.file_offset; + ssize_t nr; + + if (width == 0 || width > sizeof(current)) { + free_planned_site_array(&array); + return -1; + } + nr = pread_full(fd, current, width, off); + if (nr < 0 || (size_t) nr != width) { + free_planned_site_array(&array); + if (nr >= 0) + errno = EIO; + return -1; + } + if (memcmp(current, planned->site.original, width) != 0) { + free_planned_site_array(&array); + errno = EIO; + return -1; + } + nr = pwrite_full(fd, planned->patch.bytes, width, off); + if (nr < 0 || (size_t) nr != width) { + free_planned_site_array(&array); + if (nr >= 0) + errno = EIO; + return -1; + } + applied++; + } + + free_planned_site_array(&array); + if (applied_count) + *applied_count = applied; + return 0; +} + +int kbox_rewrite_apply_virtual_procinfo_elf(unsigned char *buf, + size_t buf_len, + size_t *applied_count, + struct kbox_rewrite_report *report) +{ + struct site_array array; + struct kbox_rewrite_report local_report; + size_t applied = 0; + int rc; + + if (!buf) + return -1; + + memset(&array, 0, sizeof(array)); + rc = kbox_rewrite_visit_elf_sites(buf, buf_len, collect_sites_array_cb, + &array, report ? report : &local_report); + if (rc < 0) { + free_site_array(&array); + return -1; + } + + for (size_t i = 0; i < array.count; i++) { + const struct kbox_rewrite_site *site = &array.sites[i]; + struct kbox_rewrite_patch patch; + size_t off = (size_t) site->file_offset; + size_t width; + + if (encode_virtual_procinfo_patch( + site, buf, buf_len, report ? 
report->arch : local_report.arch, + &patch) < 0) + continue; + + width = patch.width; + if (width == 0 || off > buf_len || width > buf_len - off) { + free_site_array(&array); + return -1; + } + if (memcmp(buf + off, site->original, width) != 0) { + free_site_array(&array); + errno = EIO; + return -1; + } + memcpy(buf + off, patch.bytes, width); + applied++; + } + + free_site_array(&array); + if (applied_count) + *applied_count = applied; + return 0; +} + +int kbox_rewrite_apply_virtual_procinfo_memfd( + int fd, + size_t *applied_count, + struct kbox_rewrite_report *report) +{ + struct site_array array; + struct kbox_rewrite_report local_report; + enum kbox_rewrite_arch arch; + unsigned char *image = NULL; + size_t image_len = 0; + size_t applied = 0; + int rc; + + if (fd < 0) + return -1; + + memset(&array, 0, sizeof(array)); + rc = kbox_rewrite_visit_memfd_sites(fd, collect_sites_array_cb, &array, + report ? report : &local_report); + if (rc < 0) { + free_site_array(&array); + return -1; + } + + arch = report ? 
report->arch : local_report.arch; + if (arch == KBOX_REWRITE_ARCH_AARCH64) { + if (rewrite_read_fd_all(fd, &image, &image_len) < 0) { + free_site_array(&array); + return -1; + } + } + + for (size_t i = 0; i < array.count; i++) { + const struct kbox_rewrite_site *site = &array.sites[i]; + struct kbox_rewrite_patch patch; + unsigned char current[KBOX_REWRITE_MAX_PATCH_BYTES]; + size_t width; + off_t off = (off_t) site->file_offset; + ssize_t nr; + + if (site->file_offset > (uint64_t) SIZE_MAX) { + if (image) + munmap(image, image_len); + free_site_array(&array); + errno = EOVERFLOW; + return -1; + } + + if (encode_virtual_procinfo_patch(site, image, image_len, arch, + &patch) < 0) { + continue; + } + + width = patch.width; + nr = pread_full(fd, current, width, off); + if (nr < 0 || (size_t) nr != width) { + if (image) + munmap(image, image_len); + free_site_array(&array); + if (nr >= 0) + errno = EIO; + return -1; + } + if (memcmp(current, site->original, width) != 0) { + if (image) + munmap(image, image_len); + free_site_array(&array); + errno = EIO; + return -1; + } + nr = pwrite_full(fd, patch.bytes, width, off); + if (nr < 0 || (size_t) nr != width) { + if (image) + munmap(image, image_len); + free_site_array(&array); + if (nr >= 0) + errno = EIO; + return -1; + } + applied++; + } + + if (image) + munmap(image, image_len); + free_site_array(&array); + if (applied_count) + *applied_count = applied; + return 0; +} + +void kbox_rewrite_origin_map_init(struct kbox_rewrite_origin_map *map, + enum kbox_rewrite_arch arch) +{ + if (!map) + return; + memset(map, 0, sizeof(*map)); + map->arch = arch; +} + +void kbox_rewrite_origin_map_reset(struct kbox_rewrite_origin_map *map) +{ + if (!map) + return; + free(map->entries); + map->entries = NULL; + map->count = 0; + map->cap = 0; +} + +int kbox_rewrite_origin_map_add_site_source( + struct kbox_rewrite_origin_map *map, + const struct kbox_rewrite_site *site, + enum kbox_loader_mapping_source source) +{ + uint64_t origin; + 
size_t lo, hi, mid; + struct kbox_rewrite_origin_entry *entries; + size_t new_cap; + + if (!map || !site) + return -1; + if (rewrite_origin_addr(site, &origin) < 0) { + if (errno == 0) + errno = EINVAL; + return -1; + } + + lo = 0; + hi = map->count; + while (lo < hi) { + mid = lo + (hi - lo) / 2; + if (map->entries[mid].origin < origin) + lo = mid + 1; + else + hi = mid; + } + if (lo < map->count && map->entries[lo].origin == origin) + return 0; + + if (map->count == map->cap) { + size_t alloc_size; + + new_cap = map->cap ? map->cap * 2 : 8; + if (new_cap < map->cap || + __builtin_mul_overflow(new_cap, sizeof(*entries), &alloc_size)) + return -1; + entries = realloc(map->entries, alloc_size); + if (!entries) + return -1; + map->entries = entries; + map->cap = new_cap; + } + + if (lo < map->count) { + memmove(&map->entries[lo + 1], &map->entries[lo], + (map->count - lo) * sizeof(*map->entries)); + } + map->entries[lo].origin = origin; + map->entries[lo].source = source; + map->entries[lo].site_class = KBOX_REWRITE_SITE_UNKNOWN; + map->count++; + return 0; +} + +int kbox_rewrite_origin_map_add_classified( + struct kbox_rewrite_origin_map *map, + const struct kbox_rewrite_site *site, + enum kbox_loader_mapping_source source, + enum kbox_rewrite_site_class site_class) +{ + int rc = kbox_rewrite_origin_map_add_site_source(map, site, source); + if (rc == 0 && site_class != KBOX_REWRITE_SITE_UNKNOWN) { + /* The entry was just inserted or already existed. Find it and + * update the classification (add_site_source returns 0 for both). 
+ */ + uint64_t origin; + if (rewrite_origin_addr(site, &origin) == 0) { + struct kbox_rewrite_origin_entry *entry; + size_t lo = 0, hi = map->count; + while (lo < hi) { + size_t mid = lo + (hi - lo) / 2; + if (map->entries[mid].origin < origin) + lo = mid + 1; + else + hi = mid; + } + if (lo < map->count && map->entries[lo].origin == origin) { + entry = &map->entries[lo]; + entry->site_class = site_class; + } + } + } + return rc; +} + +int kbox_rewrite_origin_map_find(const struct kbox_rewrite_origin_map *map, + uint64_t origin_addr, + struct kbox_rewrite_origin_entry *out) +{ + size_t lo = 0; + size_t hi; + + if (!map || !map->entries) + return 0; + + hi = map->count; + while (lo < hi) { + size_t mid = lo + (hi - lo) / 2; + uint64_t value = map->entries[mid].origin; + + if (value == origin_addr) { + if (out) + *out = map->entries[mid]; + return 1; + } + if (value < origin_addr) + lo = mid + 1; + else + hi = mid; + } + return 0; +} + +int kbox_rewrite_origin_map_contains(const struct kbox_rewrite_origin_map *map, + uint64_t origin_addr) +{ + return kbox_rewrite_origin_map_find(map, origin_addr, NULL); +} + +static int origin_map_collect_site_cb(const struct kbox_rewrite_site *site, + void *opaque) +{ + return kbox_rewrite_origin_map_add_site(opaque, site); +} + +int kbox_rewrite_origin_map_build_elf(struct kbox_rewrite_origin_map *map, + const unsigned char *buf, + size_t buf_len, + struct kbox_rewrite_report *report) +{ + struct kbox_rewrite_report local_report; + + if (!map || !buf) + return -1; + + kbox_rewrite_origin_map_reset(map); + return kbox_rewrite_visit_elf_sites(buf, buf_len, + origin_map_collect_site_cb, map, + report ? 
report : &local_report); +} + +int kbox_rewrite_origin_map_build_memfd(struct kbox_rewrite_origin_map *map, + int fd, + struct kbox_rewrite_report *report) +{ + struct kbox_rewrite_report local_report; + + if (!map || fd < 0) + return -1; + + kbox_rewrite_origin_map_reset(map); + return kbox_rewrite_visit_memfd_sites(fd, origin_map_collect_site_cb, map, + report ? report : &local_report); +} + +static int runtime_site_array_append(struct runtime_site_array *array, + const struct runtime_planned_site *site) +{ + struct runtime_planned_site *sites; + size_t new_cap; + + if (!array || !site) + return -1; + if (array->count == array->cap) { + size_t alloc_size; + + new_cap = array->cap ? array->cap * 2 : 8; + if (new_cap < array->cap || + __builtin_mul_overflow(new_cap, sizeof(*sites), &alloc_size)) + return -1; + sites = realloc(array->sites, alloc_size); + if (!sites) + return -1; + array->sites = sites; + array->cap = new_cap; + } + array->sites[array->count++] = *site; + return 0; +} + +static void runtime_site_array_reset(struct runtime_site_array *array) +{ + if (!array) + return; + free(array->sites); + array->sites = NULL; + array->count = 0; + array->cap = 0; +} + +static int runtime_collect_planned_cb( + const struct kbox_rewrite_planned_site *planned, + void *opaque) +{ + struct runtime_collect_ctx *ctx = opaque; + struct runtime_planned_site site; + + if (!ctx || !ctx->array || !planned) + return -1; + + memset(&site, 0, sizeof(site)); + site.planned = *planned; + site.actual_site_addr = ctx->load_bias + planned->site.vaddr; + site.actual_trampoline_addr = ctx->load_bias + planned->trampoline_addr; + site.source = ctx->source; + site.wrapper_kind = KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT; + site.wrapper_nr = UINT64_MAX; + return runtime_site_array_append(ctx->array, &site); +} + +static const struct kbox_loader_mapping *find_exec_mapping( + const struct kbox_loader_launch *launch, + enum kbox_loader_mapping_source source, + uint64_t addr) +{ + size_t i; + + 
if (!launch) + return NULL; + + for (i = 0; i < launch->layout.mapping_count; i++) { + const struct kbox_loader_mapping *mapping = &launch->layout.mappings[i]; + uint64_t end; + + if (mapping->source != source || (mapping->prot & PROT_EXEC) == 0 || + mapping->size == 0) { + continue; + } + if (__builtin_add_overflow(mapping->addr, mapping->size, &end)) + continue; + if (addr >= mapping->addr && addr < end) + return mapping; + } + + return NULL; +} + +#if defined(__aarch64__) +static int encode_aarch64_ldr_literal(uint32_t *out, + unsigned rt, + int32_t offset_bytes) +{ + int32_t imm19; + + if (!out || rt > 31 || (offset_bytes & 3) != 0) + return -1; + imm19 = offset_bytes >> 2; + if (imm19 < -(1 << 18) || imm19 > ((1 << 18) - 1)) + return -1; + *out = AARCH64_LDR_LITERAL_OPCODE | (((uint32_t) imm19 & 0x7ffffu) << 5) | + (rt & 31u); + return 0; +} + +static uint32_t encode_aarch64_br(unsigned rn) +{ + return AARCH64_BR_OPCODE | ((rn & 31u) << 5); +} +#endif /* __aarch64__ */ + +static int write_aarch64_trampoline( + uint64_t trampoline_addr, + uint64_t origin_addr, + enum kbox_rewrite_wrapper_candidate_kind wrapper_kind) +{ +#if defined(__aarch64__) + unsigned char slot[AARCH64_REWRITE_SLOT_SIZE]; + uint32_t insn; + uint64_t entry_addr; + + memset(slot, 0, sizeof(slot)); + if (encode_aarch64_ldr_literal(&insn, 17, 16) < 0) + return -1; + write_le32(slot + 0, insn); + if (encode_aarch64_ldr_literal(&insn, 16, 20) < 0) + return -1; + write_le32(slot + 4, insn); + write_le32(slot + 8, encode_aarch64_br(16)); + write_le32(slot + 12, AARCH64_NOP_OPCODE); + write_le64(slot + 16, origin_addr); + entry_addr = + (uint64_t) (uintptr_t) (wrapper_kind == + KBOX_REWRITE_WRAPPER_CANDIDATE_SYSCALL_CANCEL + ? 
kbox_syscall_rewrite_aarch64_cancel_entry + : kbox_syscall_rewrite_aarch64_entry); + write_le64(slot + 24, entry_addr); + memcpy((void *) (uintptr_t) trampoline_addr, slot, sizeof(slot)); + return 0; +#else + (void) trampoline_addr; + (void) origin_addr; + (void) wrapper_kind; + return -1; +#endif +} + +/* Allocate a veneer page near @near_addr (within ±128MB) for aarch64 long + * branches. The veneer is a small code page containing indirect branch stubs: + * LDR x16, [PC+8] + * BR x16 + * .quad + * + * Each veneer entry is 16 bytes (AARCH64_VENEER_SIZE). The veneer page can + * hold page_size / 16 entries. + * + * Returns 0 on success with the veneer page base in *veneer_base_out. + */ +static int alloc_aarch64_veneer_page(struct kbox_rewrite_runtime *runtime, + uint64_t near_addr, + uint64_t *veneer_base_out) +{ +#if defined(__aarch64__) + uint64_t page_size; + uint64_t search_lo, search_hi, addr; + void *region; + + if (!runtime || !veneer_base_out) + return -1; + if (runtime->trampoline_region_count >= KBOX_LOADER_MAX_MAPPINGS) + return -1; + + page_size = (uint64_t) sysconf(_SC_PAGESIZE); + if (page_size == 0 || (page_size & (page_size - 1)) != 0) + return -1; + + /* Search within B range of near_addr. */ + search_lo = near_addr > AARCH64_VENEER_SEARCH_LIMIT + ? near_addr - AARCH64_VENEER_SEARCH_LIMIT + : page_size; + search_lo = (search_lo + page_size - 1) & ~(page_size - 1); + if (__builtin_add_overflow(near_addr, AARCH64_VENEER_SEARCH_LIMIT, + &search_hi)) + search_hi = UINT64_MAX - page_size; + + /* Search upward first (likely to succeed, past the mapping). 
*/ + for (addr = (near_addr + page_size) & ~(page_size - 1); addr <= search_hi; + addr += AARCH64_VENEER_SEARCH_STEP) { + region = mmap((void *) (uintptr_t) addr, (size_t) page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0); + if (region != MAP_FAILED) { + runtime->trampoline_regions[runtime->trampoline_region_count] + .mapping = region; + runtime->trampoline_regions[runtime->trampoline_region_count].size = + (size_t) page_size; + runtime->trampoline_region_count++; + *veneer_base_out = (uint64_t) (uintptr_t) region; + return 0; + } + if (errno != EEXIST && errno != ENOMEM) + break; + } + + /* Search downward. */ + for (addr = search_lo; addr < near_addr; + addr += AARCH64_VENEER_SEARCH_STEP) { + region = mmap((void *) (uintptr_t) addr, (size_t) page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0); + if (region != MAP_FAILED) { + runtime->trampoline_regions[runtime->trampoline_region_count] + .mapping = region; + runtime->trampoline_regions[runtime->trampoline_region_count].size = + (size_t) page_size; + runtime->trampoline_region_count++; + *veneer_base_out = (uint64_t) (uintptr_t) region; + return 0; + } + if (errno != EEXIST && errno != ENOMEM) + break; + } + + errno = ENOSPC; + return -1; +#else + (void) runtime; + (void) near_addr; + (void) veneer_base_out; + errno = ENOTSUP; + return -1; +#endif +} + +/* Write an aarch64 veneer entry at @veneer_addr that branches to + * @trampoline_addr (full 64-bit indirect branch). Returns 0 on success. 
+ */ +static int write_aarch64_veneer(uint64_t veneer_addr, uint64_t trampoline_addr) +{ +#if defined(__aarch64__) + unsigned char slot[AARCH64_VENEER_SIZE]; + uint32_t ldr_insn; + + /* LDR x16, [PC+8]: loads the 64-bit value 8 bytes ahead */ + if (encode_aarch64_ldr_literal(&ldr_insn, 16, 8) < 0) + return -1; + write_le32(slot + 0, ldr_insn); + write_le32(slot + 4, encode_aarch64_br(16)); + write_le64(slot + 8, trampoline_addr); + memcpy((void *) (uintptr_t) veneer_addr, slot, sizeof(slot)); + return 0; +#else + (void) veneer_addr; + (void) trampoline_addr; + return -1; +#endif +} + +/* Encode an aarch64 B instruction targeting @veneer_addr from @site_vaddr. + * Returns 0 on success with patch bytes in @patch. + */ +static int encode_aarch64_b_to_veneer(uint64_t site_vaddr, + uint64_t veneer_addr, + struct kbox_rewrite_patch *patch) +{ + int64_t delta; + int32_t imm26; + uint32_t insn; + + if (!patch) + return -1; + if ((site_vaddr & 3u) != 0 || (veneer_addr & 3u) != 0) + return -1; + delta = (int64_t) veneer_addr - (int64_t) site_vaddr; + if (delta <= -AARCH64_B_RANGE || delta >= AARCH64_B_RANGE) + return -1; + if ((delta & 3) != 0) + return -1; + imm26 = (int32_t) (delta >> 2); + if (imm26 < -(1 << 25) || imm26 > ((1 << 25) - 1)) + return -1; + insn = AARCH64_B_OPCODE | ((uint32_t) imm26 & AARCH64_B_IMM26_MASK); + patch->width = 4; + write_le32(patch->bytes, insn); + return 0; +} + +static int64_t rewrite_dispatch_result(struct kbox_rewrite_runtime *runtime, + struct kbox_dispatch *dispatch, + uint64_t nr, + uint64_t a0, + uint64_t a1, + uint64_t a2, + uint64_t a3, + uint64_t a4, + uint64_t a5) +{ + const struct kbox_host_nrs *h = NULL; + + if (!dispatch) + return -ENOSYS; + if (runtime && runtime->ctx) + h = runtime->ctx->host_nrs; + + if (dispatch->kind == KBOX_DISPATCH_CONTINUE) { + if (h && (nr == (uint64_t) h->exit || nr == (uint64_t) h->exit_group)) + return kbox_syscall_trap_host_exit_group_now((int) a0); + if (h && nr == (uint64_t) h->execve) + return 
kbox_syscall_trap_host_execve_now( + (const char *) (uintptr_t) a0, (char *const *) (uintptr_t) a1, + (char *const *) (uintptr_t) a2); + if (h && nr == (uint64_t) h->execveat) + return kbox_syscall_trap_host_execveat_now( + (int) a0, (const char *) (uintptr_t) a1, + (char *const *) (uintptr_t) a2, (char *const *) (uintptr_t) a3, + (int) a4); + if (h && nr == (uint64_t) h->clone) + return kbox_syscall_trap_host_clone_now(a0, a1, a2, a3, a4); + if (h && nr == (uint64_t) h->clone3) + return kbox_syscall_trap_host_clone3_now( + (const void *) (uintptr_t) a0, (size_t) a1); +#if defined(__x86_64__) + if (h && nr == (uint64_t) h->fork) + return kbox_syscall_trap_host_fork_now(); + if (h && nr == (uint64_t) h->vfork) + return kbox_syscall_trap_host_vfork_now(); +#endif + return kbox_syscall_trap_host_syscall6((long) nr, a0, a1, a2, a3, a4, + a5); + } + if (dispatch->error != 0) + return -(int64_t) dispatch->error; + return dispatch->val; +} + +static int rewrite_is_wrapper_site(const struct kbox_rewrite_origin_map *map, + uint64_t origin_addr) +{ + enum kbox_rewrite_site_class site_class; + + if (!map) + return 0; + if (kbox_rewrite_origin_map_find_class(map, origin_addr, &site_class) < 0) + return 0; + return site_class == KBOX_REWRITE_SITE_WRAPPER; +} + +static int rewrite_dispatch_request(struct kbox_rewrite_runtime *runtime, + const struct kbox_syscall_request *req, + struct kbox_dispatch *dispatch) +{ + if (!runtime || !runtime->ctx || !req || !dispatch) + return -1; + +#ifndef KBOX_UNIT_TEST + kbox_dispatch_prepare_request_ctx(runtime->ctx, req); + + if (req->source == KBOX_SYSCALL_SOURCE_REWRITE && + rewrite_is_wrapper_site(&runtime->origin_map, + req->instruction_pointer) && + kbox_dispatch_try_rewrite_wrapper_fast_path(runtime->ctx, req, + dispatch)) { + return 0; + } +#endif + + if (kbox_syscall_trap_active_dispatch(req, dispatch) == 0) + return 0; + + *dispatch = kbox_dispatch_request(runtime->ctx, req); + return 0; +} + +static int 
rewrite_runtime_should_patch_site( + const struct kbox_rewrite_runtime *runtime, + const struct runtime_planned_site *site) +{ + const struct kbox_host_nrs *host_nrs; + uint64_t nr; + uint32_t family_mask; + + if (!runtime || !runtime->ctx || !site) + return 0; + if (runtime->arch != KBOX_REWRITE_ARCH_AARCH64) + return 1; + if (site->planned.site.site_class != KBOX_REWRITE_SITE_WRAPPER) + return 0; + + host_nrs = runtime->ctx->host_nrs; + if (!host_nrs) + return 0; + nr = site->wrapper_nr; + if (nr == UINT64_MAX && kbox_rewrite_wrapper_syscall_nr( + &site->planned.site, runtime->arch, &nr) < 0) { + return 0; + } + + if (nr == (uint64_t) host_nrs->getpid || + nr == (uint64_t) host_nrs->getppid || + nr == (uint64_t) host_nrs->gettid) { + return 1; + } + + family_mask = wrapper_family_mask_for_nr(host_nrs, nr); + if (family_mask & + (KBOX_REWRITE_WRAPPER_FAMILY_STAT | KBOX_REWRITE_WRAPPER_FAMILY_OPEN)) { + return 1; + } + + return 0; +} + +/* Check whether an ELF binary contains fork-family syscall sites. + * + * Scans 8-byte wrapper sites (mov $NR, %eax; syscall; ret) for + * fork/clone/vfork/clone3 syscall numbers. Also checks aarch64 + * SVC sites preceded by MOV x8, #NR for the same. + * + * Returns 1 if any fork-family site found, 0 if none, -1 on error. 
+ */ +struct fork_scan_ctx { + const struct kbox_host_nrs *host_nrs; + int found; +}; + +struct wrapper_nr_scan_ctx { + enum kbox_rewrite_arch arch; + const uint64_t *nrs; + size_t nr_count; + int found; +}; + +struct wrapper_family_scan_ctx { + enum kbox_rewrite_arch arch; + const struct kbox_host_nrs *host_nrs; + uint32_t mask; +}; + +struct wrapper_candidate_scan_ctx { + enum kbox_rewrite_arch arch; + const struct kbox_host_nrs *host_nrs; + uint32_t family_mask; + kbox_rewrite_wrapper_candidate_cb cb; + void *opaque; +}; + +struct wrapper_candidate_collect_ctx { + struct kbox_rewrite_wrapper_candidate *out; + size_t out_cap; + size_t count; + int filter_enabled; + enum kbox_rewrite_wrapper_candidate_kind kind; +}; + +static int wrapper_nr_in_allowlist(const struct wrapper_nr_scan_ctx *ctx, + uint64_t nr) +{ + size_t i; + + if (!ctx || !ctx->nrs) + return 0; + for (i = 0; i < ctx->nr_count; i++) { + if (ctx->nrs[i] == nr) + return 1; + } + return 0; +} + +static int wrapper_nr_scan_segment(const struct kbox_elf_exec_segment *seg, + const unsigned char *segment_bytes, + void *opaque) +{ + struct wrapper_nr_scan_ctx *ctx = opaque; + + (void) seg; + if (!ctx || !segment_bytes || ctx->found) + return 0; + + if (ctx->arch == KBOX_REWRITE_ARCH_X86_64) { + if (seg->file_size < X86_64_WRAPPER_SITE_LEN) + return 0; + for (size_t i = 0; i < seg->file_size;) { + int insn_len = + kbox_x86_insn_length(segment_bytes + i, seg->file_size - i); + if (insn_len <= 0) { + i++; + continue; + } + if (insn_len == 5 && + x86_64_is_wrapper_site(segment_bytes, seg->file_size, i)) { + uint64_t nr = + (uint64_t) x86_64_wrapper_syscall_nr(segment_bytes + i); + + if (wrapper_nr_in_allowlist(ctx, nr)) { + ctx->found = 1; + return 0; + } + i += X86_64_WRAPPER_SITE_LEN; + continue; + } + i += (size_t) insn_len; + } + return 0; + } + + if (ctx->arch == KBOX_REWRITE_ARCH_AARCH64) { + for (size_t i = 0; i + 3 < seg->file_size; i += 4) { + uint32_t insn; + uint32_t nr = UINT32_MAX; + size_t j; + + 
insn = (uint32_t) segment_bytes[i] | + ((uint32_t) segment_bytes[i + 1] << 8) | + ((uint32_t) segment_bytes[i + 2] << 16) | + ((uint32_t) segment_bytes[i + 3] << 24); + if (aarch64_movz_reg_imm16(insn, 6, &nr) < 0) + continue; + + /* Detect the static-musl __syscall_cancel caller pattern: + * mov x6,#nr ... b/bl __syscall_cancel + * This is a selector signal only for now, so a conservative + * short-range scan is sufficient. + */ + for (j = i + 4; j + 3 < seg->file_size && j <= i + 32; j += 4) { + uint32_t next = (uint32_t) segment_bytes[j] | + ((uint32_t) segment_bytes[j + 1] << 8) | + ((uint32_t) segment_bytes[j + 2] << 16) | + ((uint32_t) segment_bytes[j + 3] << 24); + + if ((next & 0xfc000000u) == 0x14000000u || + (next & 0xfc000000u) == 0x94000000u) { + if (wrapper_nr_in_allowlist(ctx, nr)) { + ctx->found = 1; + return 0; + } + break; + } + + if (next == 0xd4000001u) + break; + } + } + + for (size_t i = 4; i + 3 < seg->file_size; i += 4) { + struct kbox_rewrite_site site; + uint64_t nr = 0; + + if (segment_bytes[i] != 0x01 || segment_bytes[i + 1] != 0x00 || + segment_bytes[i + 2] != 0x00 || segment_bytes[i + 3] != 0xd4) { + continue; + } + if (kbox_rewrite_classify_aarch64_site(segment_bytes, + seg->file_size, i) != + KBOX_REWRITE_SITE_WRAPPER) { + continue; + } + + memset(&site, 0, sizeof(site)); + site.width = 4; + site.site_class = KBOX_REWRITE_SITE_WRAPPER; + memcpy(site.original, segment_bytes + i - 4, 4); + if (kbox_rewrite_wrapper_syscall_nr(&site, ctx->arch, &nr) == 0 && + wrapper_nr_in_allowlist(ctx, nr)) { + ctx->found = 1; + return 0; + } + } + return 0; + } + + return 0; +} + +static uint32_t wrapper_family_mask_for_nr(const struct kbox_host_nrs *host_nrs, + uint64_t nr) +{ + uint32_t mask = 0; + + if (!host_nrs) + return 0; + if ((int) nr == host_nrs->getpid || (int) nr == host_nrs->getppid || + (int) nr == host_nrs->gettid) { + mask |= KBOX_REWRITE_WRAPPER_FAMILY_PROCINFO; + } + if ((int) nr == host_nrs->newfstatat || (int) nr == host_nrs->fstat 
|| + (int) nr == host_nrs->stat || (int) nr == host_nrs->lstat) { + mask |= KBOX_REWRITE_WRAPPER_FAMILY_STAT; + } + if ((int) nr == host_nrs->openat || (int) nr == host_nrs->openat2 || + (int) nr == host_nrs->open) { + mask |= KBOX_REWRITE_WRAPPER_FAMILY_OPEN; + } + return mask; +} + +static int emit_wrapper_candidate(struct wrapper_candidate_scan_ctx *ctx, + enum kbox_rewrite_wrapper_candidate_kind kind, + uint64_t file_offset, + uint64_t vaddr, + uint64_t nr) +{ + struct kbox_rewrite_wrapper_candidate candidate; + uint32_t mask; + + if (!ctx || !ctx->host_nrs || !ctx->cb) + return -1; + + mask = wrapper_family_mask_for_nr(ctx->host_nrs, nr); + if ((mask & ctx->family_mask) == 0) + return 0; + + memset(&candidate, 0, sizeof(candidate)); + candidate.arch = ctx->arch; + candidate.kind = kind; + candidate.file_offset = file_offset; + candidate.vaddr = vaddr; + candidate.nr = nr; + candidate.family_mask = mask; + return ctx->cb(&candidate, ctx->opaque); +} + +static int wrapper_family_scan_segment(const struct kbox_elf_exec_segment *seg, + const unsigned char *segment_bytes, + void *opaque) +{ + struct wrapper_family_scan_ctx *ctx = opaque; + + (void) seg; + if (!ctx || !segment_bytes || !ctx->host_nrs) + return 0; + + if (ctx->arch == KBOX_REWRITE_ARCH_X86_64) { + if (seg->file_size < X86_64_WRAPPER_SITE_LEN) + return 0; + for (size_t i = 0; i < seg->file_size;) { + int insn_len = + kbox_x86_insn_length(segment_bytes + i, seg->file_size - i); + if (insn_len <= 0) { + i++; + continue; + } + if (insn_len == 5 && + x86_64_is_wrapper_site(segment_bytes, seg->file_size, i)) { + uint64_t nr = + (uint64_t) x86_64_wrapper_syscall_nr(segment_bytes + i); + + ctx->mask |= wrapper_family_mask_for_nr(ctx->host_nrs, nr); + i += X86_64_WRAPPER_SITE_LEN; + continue; + } + i += (size_t) insn_len; + } + return 0; + } + + if (ctx->arch == KBOX_REWRITE_ARCH_AARCH64) { + for (size_t i = 0; i + 3 < seg->file_size; i += 4) { + uint32_t insn; + uint32_t nr = UINT32_MAX; + size_t j; + + insn 
= (uint32_t) segment_bytes[i] | + ((uint32_t) segment_bytes[i + 1] << 8) | + ((uint32_t) segment_bytes[i + 2] << 16) | + ((uint32_t) segment_bytes[i + 3] << 24); + if (aarch64_movz_reg_imm16(insn, 6, &nr) < 0) + continue; + + for (j = i + 4; j + 3 < seg->file_size && j <= i + 32; j += 4) { + uint32_t next = (uint32_t) segment_bytes[j] | + ((uint32_t) segment_bytes[j + 1] << 8) | + ((uint32_t) segment_bytes[j + 2] << 16) | + ((uint32_t) segment_bytes[j + 3] << 24); + + if ((next & 0xfc000000u) == 0x14000000u || + (next & 0xfc000000u) == 0x94000000u) { + ctx->mask |= wrapper_family_mask_for_nr(ctx->host_nrs, nr); + break; + } + + if (next == 0xd4000001u) + break; + } + } + + for (size_t i = 4; i + 3 < seg->file_size; i += 4) { + struct kbox_rewrite_site site; + uint64_t nr = 0; + + if (segment_bytes[i] != 0x01 || segment_bytes[i + 1] != 0x00 || + segment_bytes[i + 2] != 0x00 || segment_bytes[i + 3] != 0xd4) { + continue; + } + if (kbox_rewrite_classify_aarch64_site(segment_bytes, + seg->file_size, i) != + KBOX_REWRITE_SITE_WRAPPER) { + continue; + } + + memset(&site, 0, sizeof(site)); + site.width = 4; + site.site_class = KBOX_REWRITE_SITE_WRAPPER; + memcpy(site.original, segment_bytes + i - 4, 4); + if (kbox_rewrite_wrapper_syscall_nr(&site, ctx->arch, &nr) == 0) + ctx->mask |= wrapper_family_mask_for_nr(ctx->host_nrs, nr); + } + } + + return 0; +} + +static int wrapper_candidate_scan_segment( + const struct kbox_elf_exec_segment *seg, + const unsigned char *segment_bytes, + void *opaque) +{ + struct wrapper_candidate_scan_ctx *ctx = opaque; + + (void) seg; + if (!ctx || !segment_bytes || !ctx->host_nrs || !ctx->cb) + return 0; + + if (ctx->arch == KBOX_REWRITE_ARCH_X86_64) { + if (seg->file_size < X86_64_WRAPPER_SITE_LEN) + return 0; + for (size_t i = 0; i < seg->file_size;) { + int insn_len = + kbox_x86_insn_length(segment_bytes + i, seg->file_size - i); + if (insn_len <= 0) { + i++; + continue; + } + if (insn_len == 5 && + x86_64_is_wrapper_site(segment_bytes, 
seg->file_size, i)) { + uint64_t nr = + (uint64_t) x86_64_wrapper_syscall_nr(segment_bytes + i); + int rc = emit_wrapper_candidate( + ctx, KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT, + seg->file_offset + i, seg->vaddr + i, nr); + if (rc != 0) + return rc; + i += X86_64_WRAPPER_SITE_LEN; + continue; + } + i += (size_t) insn_len; + } + return 0; + } + + if (ctx->arch == KBOX_REWRITE_ARCH_AARCH64) { + for (size_t i = 0; i + 3 < seg->file_size; i += 4) { + uint32_t insn; + uint32_t nr = UINT32_MAX; + size_t j; + + insn = (uint32_t) segment_bytes[i] | + ((uint32_t) segment_bytes[i + 1] << 8) | + ((uint32_t) segment_bytes[i + 2] << 16) | + ((uint32_t) segment_bytes[i + 3] << 24); + if (aarch64_movz_reg_imm16(insn, 6, &nr) < 0) + continue; + + for (j = i + 4; j + 3 < seg->file_size && j <= i + 32; j += 4) { + uint32_t next = (uint32_t) segment_bytes[j] | + ((uint32_t) segment_bytes[j + 1] << 8) | + ((uint32_t) segment_bytes[j + 2] << 16) | + ((uint32_t) segment_bytes[j + 3] << 24); + + if ((next & 0xfc000000u) == 0x14000000u || + (next & 0xfc000000u) == 0x94000000u) { + int rc = emit_wrapper_candidate( + ctx, KBOX_REWRITE_WRAPPER_CANDIDATE_SYSCALL_CANCEL, + seg->file_offset + j, seg->vaddr + j, nr); + if (rc != 0) + return rc; + break; + } + + if (next == 0xd4000001u) + break; + } + } + + for (size_t i = 4; i + 3 < seg->file_size; i += 4) { + struct kbox_rewrite_site site; + uint64_t nr = 0; + + if (segment_bytes[i] != 0x01 || segment_bytes[i + 1] != 0x00 || + segment_bytes[i + 2] != 0x00 || segment_bytes[i + 3] != 0xd4) { + continue; + } + if (kbox_rewrite_classify_aarch64_site(segment_bytes, + seg->file_size, i) != + KBOX_REWRITE_SITE_WRAPPER) { + continue; + } + + memset(&site, 0, sizeof(site)); + site.width = 4; + site.site_class = KBOX_REWRITE_SITE_WRAPPER; + memcpy(site.original, segment_bytes + i - 4, 4); + if (kbox_rewrite_wrapper_syscall_nr(&site, ctx->arch, &nr) == 0) { + int rc = emit_wrapper_candidate( + ctx, KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT, + 
seg->file_offset + i, seg->vaddr + i, nr); + if (rc != 0) + return rc; + } + } + } + + return 0; +} + +static int fork_scan_cb(const struct kbox_rewrite_site *site, void *opaque) +{ + struct fork_scan_ctx *ctx = opaque; + const struct kbox_host_nrs *h = ctx->host_nrs; + int nr; + + if (ctx->found) + return 0; /* Already found one, just skip the rest. */ + + /* x86_64 8-byte wrapper: extract syscall number from MOV imm32. */ + if (site->width == X86_64_WRAPPER_SITE_LEN && site->original[0] == 0xb8) { + nr = (int) x86_64_wrapper_syscall_nr(site->original); + if (nr == h->clone || nr == h->fork || nr == h->vfork || + nr == h->clone3) { + ctx->found = 1; + } + return 0; + } + + /* aarch64 SVC: can't determine syscall number from the SVC site alone + * (it's in x8, set by a prior MOV). We conservatively do NOT flag + * aarch64 sites here; the caller uses additional heuristics. + */ + + return 0; +} + +int kbox_rewrite_has_fork_sites(const unsigned char *buf, + size_t buf_len, + const struct kbox_host_nrs *host_nrs) +{ + struct fork_scan_ctx ctx; + struct kbox_rewrite_report report; + + if (!buf || !host_nrs) + return -1; + ctx.host_nrs = host_nrs; + ctx.found = 0; + if (kbox_rewrite_visit_elf_sites(buf, buf_len, fork_scan_cb, &ctx, + &report) < 0) + return -1; + return ctx.found; +} + +int kbox_rewrite_has_fork_sites_memfd(int fd, + const struct kbox_host_nrs *host_nrs) +{ + struct fork_scan_ctx ctx; + struct kbox_rewrite_report report; + + if (fd < 0 || !host_nrs) + return -1; + ctx.host_nrs = host_nrs; + ctx.found = 0; + if (kbox_rewrite_visit_memfd_sites(fd, fork_scan_cb, &ctx, &report) < 0) + return -1; + return ctx.found; +} + +int kbox_rewrite_has_wrapper_syscalls(const unsigned char *buf, + size_t buf_len, + enum kbox_rewrite_arch arch, + const uint64_t *nrs, + size_t nr_count) +{ + struct wrapper_nr_scan_ctx ctx; + struct kbox_rewrite_report report; + + if (!buf || !nrs || nr_count == 0) + return -1; + ctx.arch = arch; + ctx.nrs = nrs; + ctx.nr_count = nr_count; 
+ ctx.found = 0; + memset(&report, 0, sizeof(report)); + if ((arch != KBOX_REWRITE_ARCH_X86_64 && + arch != KBOX_REWRITE_ARCH_AARCH64) || + kbox_visit_elf_exec_segments(buf, buf_len, wrapper_nr_scan_segment, + &ctx) < 0) { + return -1; + } + return ctx.found; +} + +int kbox_rewrite_has_wrapper_syscalls_memfd(int fd, + const uint64_t *nrs, + size_t nr_count) +{ + struct wrapper_nr_scan_ctx ctx; + struct kbox_rewrite_report report; + + if (fd < 0 || !nrs || nr_count == 0) + return -1; + if (kbox_rewrite_analyze_memfd(fd, &report) < 0) + return -1; + + ctx.arch = report.arch; + ctx.nrs = nrs; + ctx.nr_count = nr_count; + ctx.found = 0; + if (visit_memfd_exec_segments(fd, wrapper_nr_scan_segment, &ctx) < 0) + return -1; + return ctx.found; +} + +int kbox_rewrite_wrapper_family_mask_memfd(int fd, + const struct kbox_host_nrs *host_nrs, + uint32_t *out_mask) +{ + struct kbox_rewrite_report report; + struct wrapper_family_scan_ctx ctx; + + if (fd < 0 || !host_nrs || !out_mask) + return -1; + if (kbox_rewrite_analyze_memfd(fd, &report) < 0) + return -1; + + ctx.arch = report.arch; + ctx.host_nrs = host_nrs; + ctx.mask = 0; + if (visit_memfd_exec_segments(fd, wrapper_family_scan_segment, &ctx) < 0) + return -1; + *out_mask = ctx.mask; + return 0; +} + +int kbox_rewrite_visit_memfd_wrapper_candidates( + int fd, + const struct kbox_host_nrs *host_nrs, + uint32_t family_mask, + kbox_rewrite_wrapper_candidate_cb cb, + void *opaque) +{ + struct kbox_rewrite_report report; + struct wrapper_candidate_scan_ctx ctx; + int rc; + + if (fd < 0 || !host_nrs || family_mask == 0 || !cb) + return -1; + if (kbox_rewrite_analyze_memfd(fd, &report) < 0) + return -1; + + ctx.arch = report.arch; + ctx.host_nrs = host_nrs; + ctx.family_mask = family_mask; + ctx.cb = cb; + ctx.opaque = opaque; + rc = visit_memfd_exec_segments(fd, wrapper_candidate_scan_segment, &ctx); + return rc < 0 ? 
-1 : 0; +} + +static int collect_wrapper_candidate_cb( + const struct kbox_rewrite_wrapper_candidate *candidate, + void *opaque) +{ + struct wrapper_candidate_collect_ctx *ctx = opaque; + + if (!candidate || !ctx) + return -1; + if (ctx->filter_enabled && candidate->kind != ctx->kind) + return 0; + if (ctx->count < ctx->out_cap && ctx->out) + ctx->out[ctx->count] = *candidate; + ctx->count++; + return 0; +} + +int kbox_rewrite_collect_memfd_wrapper_candidates( + int fd, + const struct kbox_host_nrs *host_nrs, + uint32_t family_mask, + struct kbox_rewrite_wrapper_candidate *out, + size_t out_cap, + size_t *out_count) +{ + struct wrapper_candidate_collect_ctx ctx; + + memset(&ctx, 0, sizeof(ctx)); + ctx.out = out; + ctx.out_cap = out_cap; + if (kbox_rewrite_visit_memfd_wrapper_candidates( + fd, host_nrs, family_mask, collect_wrapper_candidate_cb, &ctx) < 0) + return -1; + if (out_count) + *out_count = ctx.count; + return 0; +} + +int kbox_rewrite_collect_memfd_wrapper_candidates_by_kind( + int fd, + const struct kbox_host_nrs *host_nrs, + uint32_t family_mask, + enum kbox_rewrite_wrapper_candidate_kind kind, + struct kbox_rewrite_wrapper_candidate *out, + size_t out_cap, + size_t *out_count) +{ + struct wrapper_candidate_collect_ctx ctx; + + memset(&ctx, 0, sizeof(ctx)); + ctx.out = out; + ctx.out_cap = out_cap; + ctx.filter_enabled = 1; + ctx.kind = kind; + if (kbox_rewrite_visit_memfd_wrapper_candidates( + fd, host_nrs, family_mask, collect_wrapper_candidate_cb, &ctx) < 0) + return -1; + if (out_count) + *out_count = ctx.count; + return 0; +} + +static int collect_elf_wrapper_candidates_by_kind( + const unsigned char *buf, + size_t buf_len, + const struct kbox_host_nrs *host_nrs, + uint32_t family_mask, + enum kbox_rewrite_wrapper_candidate_kind kind, + struct kbox_rewrite_wrapper_candidate *out, + size_t out_cap, + size_t *out_count) +{ + struct kbox_rewrite_report report; + struct wrapper_candidate_scan_ctx scan; + struct wrapper_candidate_collect_ctx collect; + 
+ if (!buf || !host_nrs || family_mask == 0) + return -1; + if (kbox_rewrite_analyze_elf(buf, buf_len, &report) < 0) + return -1; + + memset(&scan, 0, sizeof(scan)); + memset(&collect, 0, sizeof(collect)); + scan.arch = report.arch; + scan.host_nrs = host_nrs; + scan.family_mask = family_mask; + scan.cb = collect_wrapper_candidate_cb; + scan.opaque = &collect; + collect.out = out; + collect.out_cap = out_cap; + collect.filter_enabled = 1; + collect.kind = kind; + if (kbox_visit_elf_exec_segments( + buf, buf_len, wrapper_candidate_scan_segment, &scan) < 0) { + return -1; + } + if (out_count) + *out_count = collect.count; + return 0; +} + +static int annotate_launch_wrapper_sites_kind( + struct runtime_site_array *array, + const unsigned char *elf, + size_t elf_len, + enum kbox_loader_mapping_source source, + const struct kbox_host_nrs *host_nrs, + enum kbox_rewrite_wrapper_candidate_kind kind) +{ + struct kbox_rewrite_wrapper_candidate *candidates = NULL; + size_t candidate_count = 0; + int rc = -1; + + if (!array || !elf || !host_nrs) + return -1; + if (collect_elf_wrapper_candidates_by_kind( + elf, elf_len, host_nrs, + KBOX_REWRITE_WRAPPER_FAMILY_PROCINFO | + KBOX_REWRITE_WRAPPER_FAMILY_STAT | + KBOX_REWRITE_WRAPPER_FAMILY_OPEN, + kind, NULL, 0, &candidate_count) < 0) { + return -1; + } + if (candidate_count == 0) + return 0; + + candidates = calloc(candidate_count, sizeof(*candidates)); + if (!candidates) + return -1; + if (collect_elf_wrapper_candidates_by_kind( + elf, elf_len, host_nrs, + KBOX_REWRITE_WRAPPER_FAMILY_PROCINFO | + KBOX_REWRITE_WRAPPER_FAMILY_STAT | + KBOX_REWRITE_WRAPPER_FAMILY_OPEN, + kind, candidates, candidate_count, &candidate_count) < 0) { + goto out; + } + + for (size_t i = 0; i < array->count; i++) { + struct runtime_planned_site *site = &array->sites[i]; + + if (site->source != source) { + continue; + } + for (size_t j = 0; j < candidate_count; j++) { + if (planned_site_matches_wrapper_candidate(&site->planned, + &candidates[j])) { + 
site->wrapper_kind = kind; + site->wrapper_nr = candidates[j].nr; + break; + } + } + } + + rc = 0; +out: + free(candidates); + return rc; +} + +static int annotate_launch_wrapper_sites(struct runtime_site_array *array, + const unsigned char *elf, + size_t elf_len, + enum kbox_loader_mapping_source source, + const struct kbox_host_nrs *host_nrs) +{ + if (annotate_launch_wrapper_sites_kind( + array, elf, elf_len, source, host_nrs, + KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT) < 0) { + return -1; + } + if (annotate_launch_wrapper_sites_kind( + array, elf, elf_len, source, host_nrs, + KBOX_REWRITE_WRAPPER_CANDIDATE_SYSCALL_CANCEL) < 0) { + return -1; + } + return 0; +} + +int kbox_rewrite_collect_memfd_phase1_path_candidates( + int fd, + const struct kbox_host_nrs *host_nrs, + struct kbox_rewrite_wrapper_candidate *out, + size_t out_cap, + size_t *out_count) +{ + return kbox_rewrite_collect_memfd_wrapper_candidates_by_kind( + fd, host_nrs, + KBOX_REWRITE_WRAPPER_FAMILY_STAT | KBOX_REWRITE_WRAPPER_FAMILY_OPEN, + KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT, out, out_cap, out_count); +} + +static int planned_site_matches_wrapper_candidate( + const struct kbox_rewrite_planned_site *planned, + const struct kbox_rewrite_wrapper_candidate *candidate) +{ + if (!planned || !candidate) + return 0; + return planned->site.file_offset == candidate->file_offset && + planned->site.vaddr == candidate->vaddr; +} + +int kbox_rewrite_apply_memfd_phase1_path_candidates( + int fd, + const struct kbox_host_nrs *host_nrs, + size_t *applied_count, + struct kbox_rewrite_report *report) +{ + struct planned_site_array array; + struct kbox_rewrite_report local_report; + struct kbox_rewrite_wrapper_candidate *candidates = NULL; + size_t candidate_count = 0; + size_t applied = 0; + int rc; + + if (fd < 0 || !host_nrs) + return -1; + + memset(&array, 0, sizeof(array)); + rc = kbox_rewrite_collect_memfd_phase1_path_candidates(fd, host_nrs, NULL, + 0, &candidate_count); + if (rc < 0) + return -1; + if 
(candidate_count == 0) { + if (applied_count) + *applied_count = 0; + return 0; + } + + candidates = calloc(candidate_count, sizeof(*candidates)); + if (!candidates) + return -1; + rc = kbox_rewrite_collect_memfd_phase1_path_candidates( + fd, host_nrs, candidates, candidate_count, &candidate_count); + if (rc < 0) { + free(candidates); + return -1; + } + + rc = kbox_rewrite_visit_memfd_planned_sites( + fd, collect_planned_sites_array_cb, &array, + report ? report : &local_report); + if (rc < 0) { + free(candidates); + free_planned_site_array(&array); + return -1; + } + + for (size_t i = 0; i < array.count; i++) { + const struct kbox_rewrite_planned_site *planned = &array.sites[i]; + unsigned char current[KBOX_REWRITE_MAX_PATCH_BYTES]; + size_t width = planned->patch.width; + off_t off = (off_t) planned->site.file_offset; + ssize_t nr; + int matched = 0; + + for (size_t j = 0; j < candidate_count; j++) { + if (planned_site_matches_wrapper_candidate(planned, + &candidates[j])) { + matched = 1; + break; + } + } + if (!matched) + continue; + + if (width == 0 || width > sizeof(current)) { + free(candidates); + free_planned_site_array(&array); + return -1; + } + nr = pread_full(fd, current, width, off); + if (nr < 0 || (size_t) nr != width) { + free(candidates); + free_planned_site_array(&array); + if (nr >= 0) + errno = EIO; + return -1; + } + if (memcmp(current, planned->site.original, width) != 0) { + free(candidates); + free_planned_site_array(&array); + errno = EIO; + return -1; + } + nr = pwrite_full(fd, planned->patch.bytes, width, off); + if (nr < 0 || (size_t) nr != width) { + free(candidates); + free_planned_site_array(&array); + if (nr >= 0) + errno = EIO; + return -1; + } + applied++; + } + + free(candidates); + free_planned_site_array(&array); + if (applied_count) + *applied_count = applied; + return 0; +} + +int kbox_rewrite_is_fast_host_syscall0(const struct kbox_host_nrs *host_nrs, + uint64_t nr) +{ + if (!host_nrs) + return 0; + return nr == (uint64_t) 
host_nrs->getpid || + nr == (uint64_t) host_nrs->getppid || + nr == (uint64_t) host_nrs->gettid; +} + +int kbox_rewrite_wrapper_syscall_nr(const struct kbox_rewrite_site *site, + enum kbox_rewrite_arch arch, + uint64_t *out_nr) +{ + uint32_t nr = UINT32_MAX; + + if (!site || !out_nr || site->site_class != KBOX_REWRITE_SITE_WRAPPER) + return -1; + + switch (arch) { + case KBOX_REWRITE_ARCH_X86_64: + if (site->width == X86_64_WRAPPER_SITE_LEN && + site->original[0] == 0xb8 && site->original[5] == 0x0f && + (site->original[6] == 0x05 || site->original[6] == 0x34) && + site->original[7] == 0xc3) { + nr = x86_64_wrapper_syscall_nr(site->original); + } + break; + case KBOX_REWRITE_ARCH_AARCH64: + nr = aarch64_wrapper_syscall_nr(site); + break; + default: + break; + } + + if (nr == UINT32_MAX) + return -1; + *out_nr = nr; + return 0; +} + +#if defined(__aarch64__) +int64_t kbox_syscall_rewrite_aarch64_dispatch(uint64_t origin_addr, + uint64_t nr, + uint64_t a0, + uint64_t a1, + uint64_t a2, + uint64_t a3, + uint64_t a4, + uint64_t a5) +{ + struct kbox_syscall_request req; + struct kbox_syscall_regs regs; + struct kbox_guest_mem guest_mem; + struct kbox_dispatch dispatch; + struct kbox_rewrite_runtime *runtime = load_active_rewrite_runtime(); + + if (!runtime || !runtime->ctx || + !kbox_rewrite_origin_map_contains(&runtime->origin_map, origin_addr)) { + _exit(250); + } + + /* Site-aware fast path: return virtualized process-info values for + * WRAPPER sites (getpid=1, gettid=1, getppid=0). COMPLEX sites + * (e.g., raise() -> gettid -> tgkill) must use full dispatch. 
+ */ + if (kbox_rewrite_is_site_fast_eligible(&runtime->origin_map, origin_addr, + runtime->ctx->host_nrs, nr)) { + return rewrite_fast_procinfo_value(runtime->ctx->host_nrs, nr); + } + + memset(®s, 0, sizeof(regs)); + regs.nr = (int) nr; + regs.instruction_pointer = origin_addr; + regs.args[0] = a0; + regs.args[1] = a1; + regs.args[2] = a2; + regs.args[3] = a3; + regs.args[4] = a4; + regs.args[5] = a5; + + guest_mem.ops = &kbox_current_guest_mem_ops; + guest_mem.opaque = 0; + if (kbox_syscall_request_init_from_regs(&req, KBOX_SYSCALL_SOURCE_REWRITE, + runtime->ctx->child_pid, 0, ®s, + &guest_mem) < 0) { + return -ENOSYS; + } + + if (rewrite_dispatch_request(runtime, &req, &dispatch) < 0) + return -ENOSYS; + return rewrite_dispatch_result(runtime, &dispatch, nr, a0, a1, a2, a3, a4, + a5); +} + +#endif + +#if defined(__x86_64__) +int64_t kbox_syscall_rewrite_x86_64_dispatch(uint64_t origin_addr, + uint64_t nr, + const uint64_t *args) +{ + struct kbox_syscall_request req; + struct kbox_syscall_regs regs; + struct kbox_guest_mem guest_mem; + struct kbox_dispatch dispatch; + struct kbox_rewrite_runtime *runtime = load_active_rewrite_runtime(); + + if (!runtime || !runtime->ctx || !args || + !kbox_rewrite_origin_map_contains(&runtime->origin_map, origin_addr)) { + _exit(250); + } + + if (kbox_rewrite_is_site_fast_eligible(&runtime->origin_map, origin_addr, + runtime->ctx->host_nrs, nr)) { + return rewrite_fast_procinfo_value(runtime->ctx->host_nrs, nr); + } + + memset(®s, 0, sizeof(regs)); + regs.nr = (int) nr; + regs.instruction_pointer = origin_addr; + regs.args[0] = args[0]; + regs.args[1] = args[1]; + regs.args[2] = args[2]; + regs.args[3] = args[3]; + regs.args[4] = args[4]; + regs.args[5] = args[5]; + + guest_mem.ops = &kbox_current_guest_mem_ops; + guest_mem.opaque = 0; + if (kbox_syscall_request_init_from_regs(&req, KBOX_SYSCALL_SOURCE_REWRITE, + runtime->ctx->child_pid, 0, ®s, + &guest_mem) < 0) { + return -ENOSYS; + } + + if (rewrite_dispatch_request(runtime, 
&req, &dispatch) < 0) + return -ENOSYS; + return rewrite_dispatch_result(runtime, &dispatch, nr, args[0], args[1], + args[2], args[3], args[4], args[5]); +} +#endif + +struct x86_64_trampoline_region { + size_t mapping_index; + uint64_t base_addr; + size_t slot_count; + size_t used_slots; +}; + +static int find_exec_mapping_index(const struct kbox_loader_launch *launch, + enum kbox_loader_mapping_source source, + uint64_t addr) +{ + size_t i; + + if (!launch) + return -1; + + for (i = 0; i < launch->layout.mapping_count; i++) { + const struct kbox_loader_mapping *mapping = &launch->layout.mappings[i]; + uint64_t end; + + if (mapping->source != source || (mapping->prot & PROT_EXEC) == 0 || + mapping->size == 0) { + continue; + } + if (__builtin_add_overflow(mapping->addr, mapping->size, &end)) + continue; + if (addr >= mapping->addr && addr < end) + return (int) i; + } + + return -1; +} + +static struct x86_64_trampoline_region *find_x86_64_region( + struct x86_64_trampoline_region *regions, + size_t region_count, + size_t mapping_index) +{ + size_t i; + + for (i = 0; i < region_count; i++) { + if (regions[i].mapping_index == mapping_index) + return ®ions[i]; + } + return NULL; +} + +static int alloc_x86_64_trampoline_region( + struct kbox_rewrite_runtime *runtime, + const struct kbox_loader_mapping *mapping, + size_t slot_count, + uint64_t *base_addr_out) +{ +#if defined(__x86_64__) + uint64_t page_size; + uint64_t size; + uint64_t start; + uint64_t limit; + uint64_t addr; + void *region; + + if (!runtime || !mapping || !slot_count || !base_addr_out) + return -1; + if (runtime->trampoline_region_count >= KBOX_LOADER_MAX_MAPPINGS) + return -1; + + page_size = (uint64_t) sysconf(_SC_PAGESIZE); + if (page_size == 0 || (page_size & (page_size - 1)) != 0) + return -1; + size = align_up_u64_or_zero(slot_count * X86_64_REWRITE_WRAPPER_SLOT_SIZE, + page_size); + if (size == 0) + return -1; + start = align_up_u64_or_zero(mapping->addr + mapping->size, page_size); + if 
(start == 0) + return -1; + limit = start + X86_64_TRAMPOLINE_SEARCH_LIMIT; + if (limit < start) + limit = UINT64_MAX - size; + + for (addr = start; addr <= limit; addr += X86_64_TRAMPOLINE_SEARCH_STEP) { + region = mmap((void *) (uintptr_t) addr, (size_t) size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0); + if (region != MAP_FAILED) { + runtime->trampoline_regions[runtime->trampoline_region_count] + .mapping = region; + runtime->trampoline_regions[runtime->trampoline_region_count].size = + (size_t) size; + runtime->trampoline_region_count++; + *base_addr_out = (uint64_t) (uintptr_t) region; + return 0; + } + if (errno != EEXIST && errno != ENOMEM) + return -1; + } + + errno = ENOSPC; + return -1; +#else + (void) runtime; + (void) mapping; + (void) slot_count; + (void) base_addr_out; + errno = ENOTSUP; + return -1; +#endif +} + +static int collect_launch_sites(struct runtime_site_array *array, + const struct kbox_loader_launch *launch, + const struct kbox_host_nrs *host_nrs) +{ + struct runtime_collect_ctx ctx; + struct kbox_rewrite_report report; + + if (!array || !launch || !host_nrs) + return -1; + + if (launch->main_elf && launch->main_elf_len > 0) { + memset(&ctx, 0, sizeof(ctx)); + ctx.array = array; + ctx.load_bias = launch->layout.main_load_bias; + ctx.source = KBOX_LOADER_MAPPING_MAIN; + if (kbox_rewrite_visit_elf_planned_sites( + launch->main_elf, launch->main_elf_len, + runtime_collect_planned_cb, &ctx, &report) < 0) { + return -1; + } + if (annotate_launch_wrapper_sites( + array, launch->main_elf, launch->main_elf_len, + KBOX_LOADER_MAPPING_MAIN, host_nrs) < 0) { + return -1; + } + } + + if (launch->interp_elf && launch->interp_elf_len > 0) { + memset(&ctx, 0, sizeof(ctx)); + ctx.array = array; + ctx.load_bias = launch->layout.interp_load_bias; + ctx.source = KBOX_LOADER_MAPPING_INTERP; + if (kbox_rewrite_visit_elf_planned_sites( + launch->interp_elf, launch->interp_elf_len, + runtime_collect_planned_cb, &ctx, 
&report) < 0) { + return -1; + } + if (annotate_launch_wrapper_sites( + array, launch->interp_elf, launch->interp_elf_len, + KBOX_LOADER_MAPPING_INTERP, host_nrs) < 0) { + return -1; + } + } + + return 0; +} + +static int make_exec_mappings_writable(const struct kbox_loader_launch *launch, + int prot_out[KBOX_LOADER_MAX_MAPPINGS]) +{ + size_t i; + + if (!launch || !prot_out) + return -1; + + for (i = 0; i < launch->layout.mapping_count; i++) { + const struct kbox_loader_mapping *mapping = &launch->layout.mappings[i]; + + prot_out[i] = mapping->prot; + if ((mapping->prot & PROT_EXEC) == 0 || mapping->size == 0) + continue; + if (mprotect((void *) (uintptr_t) mapping->addr, (size_t) mapping->size, + mapping->prot | PROT_WRITE) != 0) { + return -1; + } + } + + return 0; +} + +static void restore_exec_mapping_prot( + const struct kbox_loader_launch *launch, + const int prot_in[KBOX_LOADER_MAX_MAPPINGS]) +{ + size_t i; + + if (!launch || !prot_in) + return; + + for (i = 0; i < launch->layout.mapping_count; i++) { + const struct kbox_loader_mapping *mapping = &launch->layout.mappings[i]; + + if ((mapping->prot & PROT_EXEC) == 0 || mapping->size == 0) + continue; + mprotect((void *) (uintptr_t) mapping->addr, (size_t) mapping->size, + prot_in[i]); + } +} + +static void flush_exec_mappings(const struct kbox_loader_launch *launch) +{ + size_t i; + + if (!launch) + return; + + for (i = 0; i < launch->layout.mapping_count; i++) { + const struct kbox_loader_mapping *mapping = &launch->layout.mappings[i]; + + if ((mapping->prot & PROT_EXEC) == 0 || mapping->size == 0) + continue; + __builtin___clear_cache( + (char *) (uintptr_t) mapping->addr, + (char *) (uintptr_t) (mapping->addr + mapping->size)); + } +} + +void kbox_rewrite_runtime_reset(struct kbox_rewrite_runtime *runtime) +{ + size_t i; + + if (!runtime) + return; + if (load_active_rewrite_runtime() == runtime) + store_active_rewrite_runtime(NULL); + for (i = 0; i < runtime->trampoline_region_count; i++) { + if 
(runtime->trampoline_regions[i].mapping && + runtime->trampoline_regions[i].size > 0) { + munmap(runtime->trampoline_regions[i].mapping, + runtime->trampoline_regions[i].size); + } + } + kbox_rewrite_origin_map_reset(&runtime->origin_map); + memset(runtime, 0, sizeof(*runtime)); +} + +int kbox_rewrite_runtime_install(struct kbox_rewrite_runtime *runtime, + struct kbox_supervisor_ctx *ctx, + struct kbox_loader_launch *launch) +{ + struct runtime_site_array array; + struct x86_64_trampoline_region x86_regions[KBOX_LOADER_MAX_MAPPINGS]; + int prot[KBOX_LOADER_MAX_MAPPINGS]; + size_t i; + size_t x86_region_count = 0; + int rc = -1; + int writable = 0; + + if (!runtime || !ctx || !launch) + return -1; + if (launch->transfer.arch != KBOX_LOADER_ENTRY_ARCH_AARCH64 && + launch->transfer.arch != KBOX_LOADER_ENTRY_ARCH_X86_64) { + errno = ENOTSUP; + return -1; + } + + memset(&array, 0, sizeof(array)); + memset(x86_regions, 0, sizeof(x86_regions)); + memset(prot, 0, sizeof(prot)); + kbox_rewrite_runtime_reset(runtime); + runtime->ctx = ctx; + runtime->arch = launch->transfer.arch == KBOX_LOADER_ENTRY_ARCH_X86_64 + ? KBOX_REWRITE_ARCH_X86_64 + : KBOX_REWRITE_ARCH_AARCH64; + kbox_rewrite_origin_map_init(&runtime->origin_map, runtime->arch); + + if (collect_launch_sites(&array, launch, ctx->host_nrs) < 0) { + if (ctx->verbose) { + fprintf(stderr, + "kbox: rewrite install: collect_launch_sites failed: %s\n", + strerror(errno ? 
errno : EINVAL)); + } + goto out; + } + + if (ctx->verbose) { + size_t direct_count = 0; + size_t cancel_count = 0; + + for (i = 0; i < array.count; i++) { + if (array.sites[i].wrapper_kind == + KBOX_REWRITE_WRAPPER_CANDIDATE_SYSCALL_CANCEL) + cancel_count++; + else + direct_count++; + } + fprintf( + stderr, + "kbox: rewrite install: planned sites=%zu direct=%zu cancel=%zu\n", + array.count, direct_count, cancel_count); + } + + if (runtime->arch == KBOX_REWRITE_ARCH_X86_64) { + for (i = 0; i < array.count; i++) { + const struct runtime_planned_site *site = &array.sites[i]; + struct x86_64_trampoline_region *region; + int mapping_index; + + if (site->planned.site.width != X86_64_WRAPPER_SITE_LEN) + continue; + + mapping_index = find_exec_mapping_index(launch, site->source, + site->actual_site_addr); + if (mapping_index < 0) { + if (errno == 0) + errno = EINVAL; + goto out; + } + region = find_x86_64_region(x86_regions, x86_region_count, + (size_t) mapping_index); + if (region) { + region->slot_count++; + continue; + } + + if (x86_region_count >= KBOX_LOADER_MAX_MAPPINGS) { + if (errno == 0) + errno = ENOSPC; + goto out; + } + x86_regions[x86_region_count].mapping_index = + (size_t) mapping_index; + x86_regions[x86_region_count].slot_count = 1; + x86_region_count++; + } + + for (i = 0; i < x86_region_count; i++) { + const struct kbox_loader_mapping *mapping = + &launch->layout.mappings[x86_regions[i].mapping_index]; + + { + int trc = alloc_x86_64_trampoline_region( + runtime, mapping, x86_regions[i].slot_count, + &x86_regions[i].base_addr); + if (trc < 0) + goto out; + } + } + } else { + for (i = 0; i < array.count; i++) { + const struct runtime_planned_site *site = &array.sites[i]; + const struct kbox_loader_mapping *mapping = + find_exec_mapping(launch, site->source, site->actual_site_addr); + uint64_t mapping_end; + uint64_t tramp_end; + struct kbox_rewrite_site actual_site; + + if (!mapping) { + if (errno == 0) + errno = EINVAL; + goto out; + } + if 
(__builtin_add_overflow(mapping->addr, mapping->size, + &mapping_end)) { + if (errno == 0) + errno = EOVERFLOW; + goto out; + } + if (__builtin_add_overflow(site->actual_trampoline_addr, + (uint64_t) AARCH64_REWRITE_SLOT_SIZE, + &tramp_end)) { + if (errno == 0) + errno = EOVERFLOW; + goto out; + } + if (site->actual_trampoline_addr < mapping->addr || + tramp_end > mapping_end) { + errno = ENOSPC; + goto out; + } + + if (!rewrite_runtime_should_patch_site(runtime, site)) + continue; + + actual_site = site->planned.site; + actual_site.vaddr = site->actual_site_addr; + if (kbox_rewrite_origin_map_add_classified( + &runtime->origin_map, &actual_site, site->source, + site->planned.site.site_class) < 0) { + if (ctx->verbose) { + fprintf(stderr, + "kbox: rewrite install: origin-map add failed " + "site=0x%llx tramp=0x%llx nr=%llu kind=%d: %s\n", + (unsigned long long) site->actual_site_addr, + (unsigned long long) site->actual_trampoline_addr, + (unsigned long long) site->wrapper_nr, + (int) site->wrapper_kind, + strerror(errno ? errno : EINVAL)); + } + if (errno == 0) + errno = EINVAL; + goto out; + } + } + } + + if (make_exec_mappings_writable(launch, prot) < 0) + goto out; + writable = 1; + + /* Pass 1: verify all instruction bytes match before writing anything. + * If any site has been modified (e.g., by a JIT or concurrent loader), + * we abort without leaving the binary in a half-patched state. 
+ */ + for (i = 0; i < array.count; i++) { + const struct runtime_planned_site *site = &array.sites[i]; + const unsigned char *patch_ptr = + (const unsigned char *) (uintptr_t) site->actual_site_addr; + + if (runtime->arch == KBOX_REWRITE_ARCH_AARCH64) { + if (!rewrite_runtime_should_patch_site(runtime, site)) + continue; + if (memcmp(patch_ptr, site->planned.site.original, + site->planned.site.width) != 0) { + errno = EIO; + goto out; + } + } else if (site->planned.site.width == X86_64_WRAPPER_SITE_LEN) { + if (memcmp(patch_ptr, site->planned.site.original, + site->planned.site.width) != 0) { + errno = EIO; + goto out; + } + } + } + + /* Pass 2: write trampolines and apply patches. All sites have been + * verified, so failures here are internal errors (bad trampoline + * encoding, origin map allocation). + */ + uint64_t veneer_page_base = 0; + size_t veneer_page_used = 0; + size_t veneer_page_cap = 0; + + for (i = 0; i < array.count; i++) { + const struct runtime_planned_site *site = &array.sites[i]; + unsigned char *patch_ptr = + (unsigned char *) (uintptr_t) site->actual_site_addr; + struct kbox_rewrite_patch patch; + + if (runtime->arch == KBOX_REWRITE_ARCH_AARCH64) { + if (!rewrite_runtime_should_patch_site(runtime, site)) + continue; + if (write_aarch64_trampoline(site->actual_trampoline_addr, + site->actual_site_addr, + site->wrapper_kind) < 0) { + if (ctx->verbose) { + fprintf(stderr, + "kbox: rewrite install: trampoline write failed " + "site=0x%llx tramp=0x%llx nr=%llu kind=%d: %s\n", + (unsigned long long) site->actual_site_addr, + (unsigned long long) site->actual_trampoline_addr, + (unsigned long long) site->wrapper_nr, + (int) site->wrapper_kind, + strerror(errno ? errno : EINVAL)); + } + goto out; + } + patch = site->planned.patch; + /* If the pre-computed B patch is empty (range overflow during + * planning), use a veneer near the SVC site to bridge the gap: + * SVC site -> B veneer -> LDR+BR trampoline. 
+ * Reuse existing veneer pages when they have capacity and + * are within B range of the current site. + */ + if (patch.width == 0) { + uint64_t veneer_addr; + int64_t vdelta; + int reuse = 0; + + if (veneer_page_cap > 0 && veneer_page_used < veneer_page_cap) { + veneer_addr = veneer_page_base + + veneer_page_used * AARCH64_VENEER_SIZE; + vdelta = (int64_t) veneer_addr - + (int64_t) site->actual_site_addr; + if (vdelta > -AARCH64_B_RANGE && vdelta < AARCH64_B_RANGE) + reuse = 1; + } + if (!reuse) { + uint64_t page_size = (uint64_t) sysconf(_SC_PAGESIZE); + if (alloc_aarch64_veneer_page(runtime, + site->actual_site_addr, + &veneer_page_base) < 0) { + goto out; + } + veneer_page_used = 0; + veneer_page_cap = + page_size > 0 + ? (size_t) (page_size / AARCH64_VENEER_SIZE) + : 1; + veneer_addr = veneer_page_base; + } + if (write_aarch64_veneer(veneer_addr, + site->actual_trampoline_addr) < 0) { + goto out; + } + veneer_page_used++; + if (encode_aarch64_b_to_veneer(site->actual_site_addr, + veneer_addr, &patch) < 0) { + goto out; + } + } + } else if (site->planned.site.width == X86_64_WRAPPER_SITE_LEN) { + struct x86_64_trampoline_region *region; + struct kbox_rewrite_site actual_site = site->planned.site; + uint64_t trampoline_addr; + int mapping_index = find_exec_mapping_index(launch, site->source, + site->actual_site_addr); + + if (mapping_index < 0) { + if (errno == 0) + errno = EINVAL; + goto out; + } + region = find_x86_64_region(x86_regions, x86_region_count, + (size_t) mapping_index); + if (!region) { + if (errno == 0) + errno = ENOENT; + goto out; + } + trampoline_addr = + region->base_addr + + region->used_slots * X86_64_REWRITE_WRAPPER_SLOT_SIZE; + region->used_slots++; + if (write_x86_64_wrapper_trampoline( + trampoline_addr, site->actual_site_addr, + x86_64_wrapper_syscall_nr(site->planned.site.original)) < + 0) { + goto out; + } + actual_site.vaddr = site->actual_site_addr; + if (kbox_rewrite_origin_map_add_classified( + &runtime->origin_map, &actual_site, 
site->source, + site->planned.site.site_class) < 0) { + if (errno == 0) + errno = EINVAL; + goto out; + } + if (kbox_rewrite_encode_patch(&actual_site, trampoline_addr, + &patch) < 0) { + goto out; + } + } else { + continue; + } + memcpy(patch_ptr, patch.bytes, patch.width); + } + + for (i = 0; i < runtime->trampoline_region_count; i++) { + struct kbox_rewrite_runtime_trampoline_region *region = + &runtime->trampoline_regions[i]; + + if (!region->mapping || region->size == 0) + continue; + if (mprotect(region->mapping, region->size, PROT_READ | PROT_EXEC) != + 0) { + if (errno == 0) + errno = ENOMEM; + goto out; + } + __builtin___clear_cache((char *) region->mapping, + (char *) region->mapping + region->size); + } + + flush_exec_mappings(launch); + restore_exec_mapping_prot(launch, prot); + writable = 0; + store_active_rewrite_runtime(runtime); + runtime->installed = 1; + rc = 0; + +out: + if (writable) + restore_exec_mapping_prot(launch, prot); + if (rc < 0) + kbox_rewrite_runtime_reset(runtime); + runtime_site_array_reset(&array); + return rc; +} diff --git a/src/rewrite.h b/src/rewrite.h new file mode 100644 index 0000000..2a0d22d --- /dev/null +++ b/src/rewrite.h @@ -0,0 +1,308 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef KBOX_REWRITE_H +#define KBOX_REWRITE_H + +#include +#include + +#include "kbox/cli.h" +#include "kbox/elf.h" +#include "loader-launch.h" +#include "seccomp.h" + +enum kbox_rewrite_arch { + KBOX_REWRITE_ARCH_UNKNOWN = 0, + KBOX_REWRITE_ARCH_X86_64, + KBOX_REWRITE_ARCH_AARCH64, +}; + +#define KBOX_REWRITE_MAX_PATCH_BYTES 8 + +/* Site classification for caller-aware rewrite dispatch. + * + * WRAPPER: the syscall site is inside a simple libc-style wrapper function + * (pattern: [mov NR], syscall, ret). The syscall result is returned + * directly to the caller with no side effects. Safe for expanded + * fast-path dispatch (host-semantic forwarding of process-info, + * simple I/O, etc.). 
+ * + * COMPLEX: the site is inside a larger function where the syscall result + * may be consumed internally (e.g., raise() -> gettid -> tgkill, or + * pthread_create -> clone). Must use full dispatch to preserve the + * virtualization layer's invariants. + * + * UNKNOWN: classification could not be determined (e.g., insufficient + * context, non-standard calling convention). Treated as COMPLEX. + */ +enum kbox_rewrite_site_class { + KBOX_REWRITE_SITE_UNKNOWN = 0, + KBOX_REWRITE_SITE_WRAPPER, + KBOX_REWRITE_SITE_COMPLEX, +}; + +struct kbox_rewrite_report { + enum kbox_rewrite_arch arch; + size_t exec_segment_count; + size_t candidate_count; +}; + +struct kbox_rewrite_site { + uint64_t file_offset; + uint64_t vaddr; + uint64_t segment_vaddr; + uint64_t segment_mem_size; + unsigned char width; + unsigned char original[KBOX_REWRITE_MAX_PATCH_BYTES]; + enum kbox_rewrite_site_class site_class; +}; + +struct kbox_rewrite_patch { + unsigned char width; + unsigned char bytes[KBOX_REWRITE_MAX_PATCH_BYTES]; +}; + +struct kbox_rewrite_runtime_trampoline_region { + void *mapping; + size_t size; +}; + +struct kbox_rewrite_trampoline_probe { + enum kbox_rewrite_arch arch; + int feasible; + uint64_t trampoline_addr; + const char *reason; +}; + +struct kbox_rewrite_trampoline_layout { + enum kbox_rewrite_arch arch; + uint64_t base_addr; + uint64_t slot_size; +}; + +struct kbox_rewrite_planned_site { + struct kbox_rewrite_site site; + uint64_t trampoline_addr; + struct kbox_rewrite_patch patch; +}; + +struct kbox_rewrite_origin_entry { + uint64_t origin; + enum kbox_loader_mapping_source source; + enum kbox_rewrite_site_class site_class; +}; + +struct kbox_rewrite_origin_map { + enum kbox_rewrite_arch arch; + struct kbox_rewrite_origin_entry *entries; + size_t count; + size_t cap; +}; + +struct kbox_rewrite_runtime { + struct kbox_supervisor_ctx *ctx; + struct kbox_rewrite_origin_map origin_map; + enum kbox_rewrite_arch arch; + struct kbox_rewrite_runtime_trampoline_region + 
trampoline_regions[KBOX_LOADER_MAX_MAPPINGS]; + size_t trampoline_region_count; + int installed; +}; + +enum kbox_rewrite_wrapper_family_mask { + KBOX_REWRITE_WRAPPER_FAMILY_PROCINFO = 1u << 0, + KBOX_REWRITE_WRAPPER_FAMILY_STAT = 1u << 1, + KBOX_REWRITE_WRAPPER_FAMILY_OPEN = 1u << 2, +}; + +enum kbox_rewrite_wrapper_candidate_kind { + KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT = 0, + KBOX_REWRITE_WRAPPER_CANDIDATE_SYSCALL_CANCEL, +}; + +struct kbox_rewrite_wrapper_candidate { + enum kbox_rewrite_arch arch; + enum kbox_rewrite_wrapper_candidate_kind kind; + uint64_t file_offset; + uint64_t vaddr; + uint64_t nr; + uint32_t family_mask; +}; + +typedef int (*kbox_rewrite_site_cb)(const struct kbox_rewrite_site *site, + void *opaque); +typedef int (*kbox_rewrite_planned_site_cb)( + const struct kbox_rewrite_planned_site *planned, + void *opaque); +typedef int (*kbox_rewrite_wrapper_candidate_cb)( + const struct kbox_rewrite_wrapper_candidate *candidate, + void *opaque); + +const char *kbox_syscall_mode_name(enum kbox_syscall_mode mode); +int kbox_parse_syscall_mode(const char *value, enum kbox_syscall_mode *out); + +const char *kbox_rewrite_arch_name(enum kbox_rewrite_arch arch); +int kbox_rewrite_analyze_elf(const unsigned char *buf, + size_t buf_len, + struct kbox_rewrite_report *report); +int kbox_rewrite_analyze_memfd(int fd, struct kbox_rewrite_report *report); +int kbox_rewrite_visit_elf_sites(const unsigned char *buf, + size_t buf_len, + kbox_rewrite_site_cb cb, + void *opaque, + struct kbox_rewrite_report *report); +int kbox_rewrite_visit_memfd_sites(int fd, + kbox_rewrite_site_cb cb, + void *opaque, + struct kbox_rewrite_report *report); +int kbox_rewrite_visit_elf_planned_sites(const unsigned char *buf, + size_t buf_len, + kbox_rewrite_planned_site_cb cb, + void *opaque, + struct kbox_rewrite_report *report); +int kbox_rewrite_visit_memfd_planned_sites(int fd, + kbox_rewrite_planned_site_cb cb, + void *opaque, + struct kbox_rewrite_report *report); +int 
kbox_rewrite_apply_elf(unsigned char *buf, + size_t buf_len, + size_t *applied_count, + struct kbox_rewrite_report *report); +int kbox_rewrite_apply_memfd(int fd, + size_t *applied_count, + struct kbox_rewrite_report *report); +int kbox_rewrite_apply_virtual_procinfo_elf(unsigned char *buf, + size_t buf_len, + size_t *applied_count, + struct kbox_rewrite_report *report); +int kbox_rewrite_apply_virtual_procinfo_memfd( + int fd, + size_t *applied_count, + struct kbox_rewrite_report *report); +void kbox_rewrite_origin_map_init(struct kbox_rewrite_origin_map *map, + enum kbox_rewrite_arch arch); +void kbox_rewrite_origin_map_reset(struct kbox_rewrite_origin_map *map); +int kbox_rewrite_origin_map_add_site_source( + struct kbox_rewrite_origin_map *map, + const struct kbox_rewrite_site *site, + enum kbox_loader_mapping_source source); +int kbox_rewrite_origin_map_add_classified( + struct kbox_rewrite_origin_map *map, + const struct kbox_rewrite_site *site, + enum kbox_loader_mapping_source source, + enum kbox_rewrite_site_class site_class); +static inline int kbox_rewrite_origin_map_add_site( + struct kbox_rewrite_origin_map *map, + const struct kbox_rewrite_site *site) +{ + return kbox_rewrite_origin_map_add_site_source(map, site, + KBOX_LOADER_MAPPING_MAIN); +} +int kbox_rewrite_origin_map_contains(const struct kbox_rewrite_origin_map *map, + uint64_t origin_addr); +int kbox_rewrite_origin_map_find(const struct kbox_rewrite_origin_map *map, + uint64_t origin_addr, + struct kbox_rewrite_origin_entry *out); +int kbox_rewrite_origin_map_build_elf(struct kbox_rewrite_origin_map *map, + const unsigned char *buf, + size_t buf_len, + struct kbox_rewrite_report *report); +int kbox_rewrite_origin_map_build_memfd(struct kbox_rewrite_origin_map *map, + int fd, + struct kbox_rewrite_report *report); +int kbox_rewrite_encode_patch(const struct kbox_rewrite_site *site, + uint64_t trampoline_addr, + struct kbox_rewrite_patch *patch); +int 
kbox_rewrite_encode_x86_64_page_zero_trampoline(unsigned char *buf, + size_t buf_len, + uint64_t entry_addr); +int kbox_rewrite_init_trampoline_layout( + enum kbox_rewrite_arch arch, + const struct kbox_elf_exec_segment *seg, + struct kbox_rewrite_trampoline_layout *layout); +int kbox_rewrite_plan_site(const struct kbox_rewrite_site *site, + const struct kbox_rewrite_trampoline_layout *layout, + size_t slot_index, + struct kbox_rewrite_planned_site *planned); +int kbox_rewrite_probe_x86_64_page_zero( + uint64_t mmap_min_addr, + struct kbox_rewrite_trampoline_probe *probe); +int kbox_rewrite_probe_trampoline(enum kbox_rewrite_arch arch, + struct kbox_rewrite_trampoline_probe *probe); +int kbox_rewrite_is_fast_host_syscall0(const struct kbox_host_nrs *host_nrs, + uint64_t nr); +int kbox_rewrite_wrapper_syscall_nr(const struct kbox_rewrite_site *site, + enum kbox_rewrite_arch arch, + uint64_t *out_nr); +enum kbox_rewrite_site_class kbox_rewrite_classify_x86_64_site( + const unsigned char *segment_bytes, + size_t segment_size, + size_t site_offset, + unsigned char site_width); +enum kbox_rewrite_site_class kbox_rewrite_classify_aarch64_site( + const unsigned char *segment_bytes, + size_t segment_size, + size_t site_offset); +int kbox_rewrite_origin_map_find_class( + const struct kbox_rewrite_origin_map *map, + uint64_t origin_addr, + enum kbox_rewrite_site_class *out); +int kbox_rewrite_is_site_fast_eligible( + const struct kbox_rewrite_origin_map *map, + uint64_t origin_addr, + const struct kbox_host_nrs *host_nrs, + uint64_t nr); +int kbox_rewrite_has_fork_sites(const unsigned char *buf, + size_t buf_len, + const struct kbox_host_nrs *host_nrs); +int kbox_rewrite_has_fork_sites_memfd(int fd, + const struct kbox_host_nrs *host_nrs); +int kbox_rewrite_has_wrapper_syscalls(const unsigned char *buf, + size_t buf_len, + enum kbox_rewrite_arch arch, + const uint64_t *nrs, + size_t nr_count); +int kbox_rewrite_has_wrapper_syscalls_memfd(int fd, + const uint64_t *nrs, + 
size_t nr_count); +int kbox_rewrite_wrapper_family_mask_memfd(int fd, + const struct kbox_host_nrs *host_nrs, + uint32_t *out_mask); +int kbox_rewrite_visit_memfd_wrapper_candidates( + int fd, + const struct kbox_host_nrs *host_nrs, + uint32_t family_mask, + kbox_rewrite_wrapper_candidate_cb cb, + void *opaque); +int kbox_rewrite_collect_memfd_wrapper_candidates( + int fd, + const struct kbox_host_nrs *host_nrs, + uint32_t family_mask, + struct kbox_rewrite_wrapper_candidate *out, + size_t out_cap, + size_t *out_count); +int kbox_rewrite_collect_memfd_wrapper_candidates_by_kind( + int fd, + const struct kbox_host_nrs *host_nrs, + uint32_t family_mask, + enum kbox_rewrite_wrapper_candidate_kind kind, + struct kbox_rewrite_wrapper_candidate *out, + size_t out_cap, + size_t *out_count); + +int kbox_rewrite_collect_memfd_phase1_path_candidates( + int fd, + const struct kbox_host_nrs *host_nrs, + struct kbox_rewrite_wrapper_candidate *out, + size_t out_cap, + size_t *out_count); +int kbox_rewrite_apply_memfd_phase1_path_candidates( + int fd, + const struct kbox_host_nrs *host_nrs, + size_t *applied_count, + struct kbox_rewrite_report *report); +void kbox_rewrite_runtime_reset(struct kbox_rewrite_runtime *runtime); +int kbox_rewrite_runtime_install(struct kbox_rewrite_runtime *runtime, + struct kbox_supervisor_ctx *ctx, + struct kbox_loader_launch *launch); + +#endif /* KBOX_REWRITE_H */ diff --git a/src/seccomp-bpf.c b/src/seccomp-bpf.c index 642a726..178c02e 100644 --- a/src/seccomp-bpf.c +++ b/src/seccomp-bpf.c @@ -23,6 +23,7 @@ */ #include +#include #include #include #include @@ -30,43 +31,44 @@ #include #include "seccomp.h" +#include "syscall-trap-signal.h" /* Deny list: arch-specific syscall numbers. 
*/ #if defined(__x86_64__) static const int deny_nrs[] = { - /* Seccomp manipulation:guest can install filters breaking CONTINUE */ + /* Seccomp manipulation: guest can install filters breaking CONTINUE */ 317, /* seccomp */ - /* Tracing:supervisor memory/process access attacks */ + /* Tracing: supervisor memory/process access attacks */ 101, /* ptrace */ 311, /* process_vm_writev */ 440, /* process_madvise */ 448, /* process_mrelease */ - /* Landlock:guest can restrict CONTINUE operations */ + /* Landlock: guest can restrict CONTINUE operations */ 444, /* landlock_create_ruleset */ 445, /* landlock_add_rule */ 446, /* landlock_restrict_self */ - /* System admin:reboot, hostname manipulation */ + /* System admin: reboot, hostname manipulation */ 169, /* reboot */ 170, /* sethostname */ 171, /* setdomainname */ 163, /* acct */ - /* Kernel modules:code injection */ + /* Kernel modules: code injection */ 175, /* init_module */ 313, /* finit_module */ 176, /* delete_module */ 246, /* kexec_load */ 320, /* kexec_file_load */ - /* BPF/perf:kernel tracing and manipulation */ + /* BPF/perf: kernel tracing and manipulation */ 321, /* bpf */ 298, /* perf_event_open */ - /* Namespaces:container escape */ + /* Namespaces: container escape */ 272, /* unshare */ 308, /* setns */ @@ -79,7 +81,7 @@ static const int deny_nrs[] = { 135, /* personality */ 312, /* kcmp */ - /* io_uring:bypasses seccomp entirely */ + /* io_uring: bypasses seccomp entirely */ 425, /* io_uring_setup */ 426, /* io_uring_enter */ 427, /* io_uring_register */ @@ -88,9 +90,9 @@ static const int deny_nrs[] = { 323, /* userfaultfd */ 434, /* pidfd_open */ 438, /* pidfd_getfd */ - 447, /* memfd_secret:breaks process_vm_readv */ + 447, /* memfd_secret: breaks process_vm_readv */ - /* New mount API:host namespace manipulation */ + /* New mount API: host namespace manipulation */ 428, /* open_tree */ 429, /* move_mount */ 430, /* fsopen */ @@ -243,17 +245,146 @@ static const int deny_nrs[] = { #endif #define 
DENY_COUNT ((int) (sizeof(deny_nrs) / sizeof(deny_nrs[0]))) -#define ALLOW_COUNT 3 +#define ALLOW_COUNT 4 +/* Must be >= KBOX_LOADER_MAX_MAPPINGS (49) to accept all executable + * mappings from the loader without truncation. + */ +#define MAX_IP_RANGE_COUNT 64 /* Maximum BPF program length. Each deny entry is 2 instructions (compare + - * ret_errno), each allow entry is 2 instructions. Plus 4 for arch check + 1 for - * default. + * ret_errno), each allow entry is 2 instructions. Trap-mode range checks use + * 6 instructions per range plus one default-allow before the syscall-number + * path. The trap-ranges filter also has an early rt_sigreturn allow for the + * host signal restorer path. * * Use a generous upper bound for the VLA. + * + * Worst-case sizing: ~30 fixed + 8*N allow-ranges + 8 EMIT_ALLOW + + * 5*5 shadow-allow + 5*8 host-fd-band + 2*DENY_COUNT ≈ 310 instructions + * for typical inputs. 2048 provides >6x headroom; the post-emission check + * catches unexpected growth without per-write bounds testing. 
*/ -#define MAX_PROG_LEN (4 + DENY_COUNT * 2 + ALLOW_COUNT * 2 + 1) +#define MAX_PROG_LEN 2048 -int kbox_install_seccomp_listener(const struct kbox_host_nrs *h) +static void emit_fast_shadow_allow(struct kbox_sock_filter *filter, + int *idx, + int nr) +{ + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( + KBOX_BPF_JMP | KBOX_BPF_JEQ | KBOX_BPF_K, (unsigned int) nr, 0, 3); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_LD | KBOX_BPF_W | KBOX_BPF_ABS, + KBOX_SECCOMP_DATA_ARG0_LO_OFFSET); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( + KBOX_BPF_JMP | KBOX_BPF_JGE | KBOX_BPF_K, KBOX_FD_FAST_BASE, 0, 1); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_RET | KBOX_BPF_K, KBOX_SECCOMP_RET_ALLOW); +} + +static void emit_host_fd_band_allow(struct kbox_sock_filter *filter, + int *idx, + int nr, + unsigned int min_fd) +{ + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( + KBOX_BPF_JMP | KBOX_BPF_JEQ | KBOX_BPF_K, (unsigned int) nr, 0, 3); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_LD | KBOX_BPF_W | KBOX_BPF_ABS, + KBOX_SECCOMP_DATA_ARG0_LO_OFFSET); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( + KBOX_BPF_JMP | KBOX_BPF_JGE | KBOX_BPF_K, min_fd, 0, 1); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_RET | KBOX_BPF_K, KBOX_SECCOMP_RET_ALLOW); +} + +static int emit_ip_range_allow(struct kbox_sock_filter *filter, + int *idx, + const struct kbox_syscall_trap_ip_range *range) +{ + uint64_t start; + uint64_t end_inclusive; + uint32_t hi; + uint32_t lo_start; + uint32_t lo_end; + + if (!filter || !idx || !range || range->start >= range->end) + return -1; + + start = (uint64_t) range->start; + end_inclusive = (uint64_t) range->end - 1; + if ((start >> 32) != (end_inclusive >> 32)) + return -1; + + hi = (uint32_t) (start >> 32); + lo_start = (uint32_t) start; + lo_end = (uint32_t) end_inclusive; + + filter[(*idx)++] = (struct 
kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_LD | KBOX_BPF_W | KBOX_BPF_ABS, + KBOX_SECCOMP_DATA_IP_HI_OFFSET); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( + KBOX_BPF_JMP | KBOX_BPF_JEQ | KBOX_BPF_K, hi, 0, 4); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_LD | KBOX_BPF_W | KBOX_BPF_ABS, + KBOX_SECCOMP_DATA_IP_LO_OFFSET); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( + KBOX_BPF_JMP | KBOX_BPF_JGE | KBOX_BPF_K, lo_start, 0, 2); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( + KBOX_BPF_JMP | KBOX_BPF_JGT | KBOX_BPF_K, lo_end, 1, 0); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_RET | KBOX_BPF_K, KBOX_SECCOMP_RET_ALLOW); + return 0; +} + +static int emit_ip_range_trap_match( + struct kbox_sock_filter *filter, + int *idx, + const struct kbox_syscall_trap_ip_range *range, + int *jump_index_out) +{ + uint64_t start; + uint64_t end_inclusive; + uint32_t hi; + uint32_t lo_start; + uint32_t lo_end; + + if (!filter || !idx || !range || !jump_index_out || + range->start >= range->end) + return -1; + + start = (uint64_t) range->start; + end_inclusive = (uint64_t) range->end - 1; + if ((start >> 32) != (end_inclusive >> 32)) + return -1; + + hi = (uint32_t) (start >> 32); + lo_start = (uint32_t) start; + lo_end = (uint32_t) end_inclusive; + + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_LD | KBOX_BPF_W | KBOX_BPF_ABS, + KBOX_SECCOMP_DATA_IP_HI_OFFSET); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( + KBOX_BPF_JMP | KBOX_BPF_JEQ | KBOX_BPF_K, hi, 0, 4); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_LD | KBOX_BPF_W | KBOX_BPF_ABS, + KBOX_SECCOMP_DATA_IP_LO_OFFSET); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( + KBOX_BPF_JMP | KBOX_BPF_JGE | KBOX_BPF_K, lo_start, 0, 2); + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( + KBOX_BPF_JMP | KBOX_BPF_JGT | KBOX_BPF_K, lo_end, 1, 0); + 
*jump_index_out = *idx; + filter[(*idx)++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( + KBOX_BPF_JMP | KBOX_BPF_K, 0, 0, 0); + return 0; +} + +static int install_seccomp_filter( + const struct kbox_host_nrs *h, + unsigned int default_action, + unsigned int filter_flags, + const struct kbox_syscall_trap_ip_range *allow_ranges, + size_t allow_range_count) { struct kbox_sock_filter filter[MAX_PROG_LEN]; struct kbox_sock_fprog prog; @@ -274,7 +405,16 @@ int kbox_install_seccomp_listener(const struct kbox_host_nrs *h) filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( KBOX_BPF_RET | KBOX_BPF_K, KBOX_SECCOMP_RET_KILL_PROCESS); - /* [3] Load syscall number. */ + if (allow_ranges) { + for (size_t r = 0; r < allow_range_count; r++) { + if (emit_ip_range_allow(filter, &idx, &allow_ranges[r]) < 0) { + errno = EINVAL; + return -1; + } + } + } + + /* Load syscall number. */ filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( KBOX_BPF_LD | KBOX_BPF_W | KBOX_BPF_ABS, KBOX_SECCOMP_DATA_NR_OFFSET); @@ -305,8 +445,28 @@ int kbox_install_seccomp_listener(const struct kbox_host_nrs *h) EMIT_ALLOW(h->sendmsg); EMIT_ALLOW(h->exit); EMIT_ALLOW(h->exit_group); + EMIT_ALLOW(h->rt_sigreturn); #undef EMIT_ALLOW + emit_fast_shadow_allow(filter, &idx, h->read); + emit_fast_shadow_allow(filter, &idx, h->pread64); + emit_fast_shadow_allow(filter, &idx, h->write); + emit_fast_shadow_allow(filter, &idx, h->lseek); + emit_fast_shadow_allow(filter, &idx, h->fstat); + emit_host_fd_band_allow(filter, &idx, h->read, KBOX_FD_HOSTONLY_BASE); + emit_host_fd_band_allow(filter, &idx, h->pread64, KBOX_FD_HOSTONLY_BASE); + emit_host_fd_band_allow(filter, &idx, h->lseek, KBOX_FD_HOSTONLY_BASE); + emit_host_fd_band_allow(filter, &idx, h->fstat, KBOX_FD_HOSTONLY_BASE); + emit_host_fd_band_allow(filter, &idx, h->close, KBOX_FD_HOSTONLY_BASE); + + /* The arg0-based fast-fd checks above overwrite A with the file + * descriptor. 
Reload the syscall number before the deny list so low FDs + * do not alias deny-list syscall numbers (for example write(fd=1) vs + * io_destroy on aarch64). + */ + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_LD | KBOX_BPF_W | KBOX_BPF_ABS, KBOX_SECCOMP_DATA_NR_OFFSET); + /* Deny-list: dangerous syscalls get EPERM without reaching the supervisor. * Skip entries with nr == -1 (not available on this architecture). */ @@ -322,7 +482,7 @@ int kbox_install_seccomp_listener(const struct kbox_host_nrs *h) /* Default: everything else goes to the supervisor. */ filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( - KBOX_BPF_RET | KBOX_BPF_K, KBOX_SECCOMP_RET_USER_NOTIF); + KBOX_BPF_RET | KBOX_BPF_K, default_action); if (idx > MAX_PROG_LEN) { fprintf(stderr, "kbox: BPF program overflow (%d > %d)\n", idx, @@ -334,15 +494,214 @@ int kbox_install_seccomp_listener(const struct kbox_host_nrs *h) prog.len = (unsigned short) idx; prog.filter = filter; - ret = syscall(__NR_seccomp, KBOX_SECCOMP_SET_MODE_FILTER, - KBOX_SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); + ret = syscall(__NR_seccomp, KBOX_SECCOMP_SET_MODE_FILTER, filter_flags, + &prog); + if (ret < 0) { + fprintf(stderr, "kbox: seccomp(SET_MODE_FILTER, 0x%x) failed: %s\n", + filter_flags, strerror(errno)); + return -1; + } + + return (int) ret; +} + +int kbox_install_seccomp_listener(const struct kbox_host_nrs *h) +{ + return install_seccomp_filter(h, KBOX_SECCOMP_RET_USER_NOTIF, + KBOX_SECCOMP_FILTER_FLAG_NEW_LISTENER, NULL, + 0); +} + +int kbox_install_seccomp_trap(const struct kbox_host_nrs *h) +{ + struct kbox_syscall_trap_ip_range allow_range; + + if (kbox_syscall_trap_host_syscall_range(&allow_range) < 0) { + errno = EINVAL; + return -1; + } + return install_seccomp_filter(h, KBOX_SECCOMP_RET_TRAP, 0, &allow_range, 1); +} + +static int install_seccomp_trap_ranges_ex( + const struct kbox_host_nrs *h, + const struct kbox_syscall_trap_ip_range *trap_ranges, + size_t trap_range_count) +{ + struct 
kbox_sock_filter filter[MAX_PROG_LEN]; + struct kbox_sock_fprog prog; + int match_jumps[MAX_IP_RANGE_COUNT]; + struct kbox_syscall_trap_ip_range internal_ranges[16]; + int internal_jumps[16]; + size_t internal_count = 0; + int idx = 0; + int internal_allow_idx = -1; + int nr_load_idx; + int i; + long ret; + + if (!h || !trap_ranges || trap_range_count == 0 || + trap_range_count > MAX_IP_RANGE_COUNT) { + errno = EINVAL; + return -1; + } + if (kbox_syscall_trap_internal_ip_ranges( + internal_ranges, + sizeof(internal_ranges) / sizeof(internal_ranges[0]), + &internal_count) < 0) { + errno = EINVAL; + return -1; + } + + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_LD | KBOX_BPF_W | KBOX_BPF_ABS, KBOX_SECCOMP_DATA_ARCH_OFFSET); + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( + KBOX_BPF_JMP | KBOX_BPF_JEQ | KBOX_BPF_K, KBOX_AUDIT_ARCH_CURRENT, 1, + 0); + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_RET | KBOX_BPF_K, KBOX_SECCOMP_RET_KILL_PROCESS); + + /* The kernel's signal restorer may live outside guest exec mappings. + * Allow rt_sigreturn before the IP gate so SIGSYS delivery can unwind + * without reopening general host-IP syscall execution. 
+ */ + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_LD | KBOX_BPF_W | KBOX_BPF_ABS, KBOX_SECCOMP_DATA_NR_OFFSET); + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( + KBOX_BPF_JMP | KBOX_BPF_JEQ | KBOX_BPF_K, + (unsigned int) h->rt_sigreturn, 0, 1); + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_RET | KBOX_BPF_K, KBOX_SECCOMP_RET_ALLOW); + + for (i = 0; i < (int) internal_count; i++) { + if (emit_ip_range_trap_match(filter, &idx, &internal_ranges[i], + &internal_jumps[i]) < 0) { + errno = EINVAL; + return -1; + } + } + + for (i = 0; i < (int) trap_range_count; i++) { + if (emit_ip_range_trap_match(filter, &idx, &trap_ranges[i], + &match_jumps[i]) < 0) { + errno = EINVAL; + return -1; + } + } + + /* Non-guest, non-trampoline IPs must not reach the host kernel. + * + * The dedicated host trampoline covers the small set of syscalls kbox + * executes on the trapped guest thread. Guest executable mappings are + * matched above and routed into the dispatcher path. Any other IP is + * outside the permitted syscall origin set and is rejected. 
+ */ + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_RET | KBOX_BPF_K, KBOX_SECCOMP_RET_ERRNO(EPERM)); + + internal_allow_idx = idx; + for (i = 0; i < (int) internal_count; i++) { + int rel = internal_allow_idx - (internal_jumps[i] + 1); + + if (rel < 0) { + errno = EINVAL; + return -1; + } + filter[internal_jumps[i]].k = (unsigned int) rel; + } + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_RET | KBOX_BPF_K, KBOX_SECCOMP_RET_ALLOW); + + nr_load_idx = idx; + for (i = 0; i < (int) trap_range_count; i++) { + int rel = nr_load_idx - (match_jumps[i] + 1); + + if (rel < 0) { + errno = EINVAL; + return -1; + } + filter[match_jumps[i]].k = (unsigned int) rel; + } + + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_LD | KBOX_BPF_W | KBOX_BPF_ABS, KBOX_SECCOMP_DATA_NR_OFFSET); + +#define EMIT_ALLOW(nr) \ + do { \ + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( \ + KBOX_BPF_JMP | KBOX_BPF_JEQ | KBOX_BPF_K, (nr), 0, 1); \ + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( \ + KBOX_BPF_RET | KBOX_BPF_K, KBOX_SECCOMP_RET_ALLOW); \ + } while (0) + + EMIT_ALLOW(h->sendmsg); + EMIT_ALLOW(h->exit); + EMIT_ALLOW(h->exit_group); + EMIT_ALLOW(h->rt_sigreturn); +#undef EMIT_ALLOW + + emit_fast_shadow_allow(filter, &idx, h->read); + emit_fast_shadow_allow(filter, &idx, h->pread64); + emit_fast_shadow_allow(filter, &idx, h->write); + emit_fast_shadow_allow(filter, &idx, h->lseek); + emit_fast_shadow_allow(filter, &idx, h->fstat); + emit_host_fd_band_allow(filter, &idx, h->read, KBOX_FD_HOSTONLY_BASE); + emit_host_fd_band_allow(filter, &idx, h->pread64, KBOX_FD_HOSTONLY_BASE); + emit_host_fd_band_allow(filter, &idx, h->lseek, KBOX_FD_HOSTONLY_BASE); + emit_host_fd_band_allow(filter, &idx, h->fstat, KBOX_FD_HOSTONLY_BASE); + emit_host_fd_band_allow(filter, &idx, h->close, KBOX_FD_HOSTONLY_BASE); + + /* The arg0-based fast-fd checks above overwrite A with the file + * descriptor. 
Reload the syscall number before the deny list so low FDs + * do not alias deny-list syscall numbers. + */ + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_LD | KBOX_BPF_W | KBOX_BPF_ABS, KBOX_SECCOMP_DATA_NR_OFFSET); + + for (i = 0; i < DENY_COUNT; i++) { + if (deny_nrs[i] < 0) + continue; + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_JUMP( + KBOX_BPF_JMP | KBOX_BPF_JEQ | KBOX_BPF_K, + (unsigned int) deny_nrs[i], 0, 1); + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_RET | KBOX_BPF_K, KBOX_SECCOMP_RET_ERRNO(EPERM)); + } + + filter[idx++] = (struct kbox_sock_filter) KBOX_BPF_STMT( + KBOX_BPF_RET | KBOX_BPF_K, KBOX_SECCOMP_RET_TRAP); + + if (idx > MAX_PROG_LEN) { + errno = EINVAL; + return -1; + } + + prog.len = (unsigned short) idx; + prog.filter = filter; + + ret = syscall(__NR_seccomp, KBOX_SECCOMP_SET_MODE_FILTER, 0, &prog); if (ret < 0) { fprintf(stderr, - "kbox: seccomp(SET_MODE_FILTER, " - "NEW_LISTENER) failed: %s\n", + "kbox: seccomp(SET_MODE_FILTER, trap ranges) failed: %s\n", strerror(errno)); return -1; } return (int) ret; } + +int kbox_install_seccomp_trap_ranges( + const struct kbox_host_nrs *h, + const struct kbox_syscall_trap_ip_range *trap_ranges, + size_t trap_range_count) +{ + return install_seccomp_trap_ranges_ex(h, trap_ranges, trap_range_count); +} + +int kbox_install_seccomp_rewrite_ranges( + const struct kbox_host_nrs *h, + const struct kbox_syscall_trap_ip_range *trap_ranges, + size_t trap_range_count) +{ + return install_seccomp_trap_ranges_ex(h, trap_ranges, trap_range_count); +} diff --git a/src/seccomp-defs.h b/src/seccomp-defs.h index 3e52e99..68a8874 100644 --- a/src/seccomp-defs.h +++ b/src/seccomp-defs.h @@ -10,11 +10,14 @@ #define KBOX_BPF_ABS 0x20 #define KBOX_BPF_JMP 0x05 #define KBOX_BPF_JEQ 0x10 +#define KBOX_BPF_JGT 0x20 +#define KBOX_BPF_JGE 0x30 #define KBOX_BPF_K 0x00 #define KBOX_BPF_RET 0x06 #define KBOX_SECCOMP_RET_ALLOW 0x7fff0000U #define KBOX_SECCOMP_RET_USER_NOTIF 
0x7fc00000U +#define KBOX_SECCOMP_RET_TRAP 0x00030000U #define KBOX_SECCOMP_RET_KILL_PROCESS 0x80000000U #define KBOX_SECCOMP_RET_ERRNO(err) (0x00050000U | ((err) & 0x0000ffffU)) @@ -23,6 +26,9 @@ #define KBOX_SECCOMP_DATA_NR_OFFSET 0 #define KBOX_SECCOMP_DATA_ARCH_OFFSET 4 +#define KBOX_SECCOMP_DATA_IP_LO_OFFSET 8 +#define KBOX_SECCOMP_DATA_IP_HI_OFFSET 12 +#define KBOX_SECCOMP_DATA_ARG0_LO_OFFSET 16 #if defined(__x86_64__) #define KBOX_AUDIT_ARCH_CURRENT 0xc000003eU diff --git a/src/seccomp-dispatch.c b/src/seccomp-dispatch.c index bcf5534..b543c5d 100644 --- a/src/seccomp-dispatch.c +++ b/src/seccomp-dispatch.c @@ -12,9 +12,11 @@ #include #include /* seccomp types via seccomp.h -> seccomp-defs.h */ +#include #include #include #include +#include #include #include #include @@ -28,11 +30,19 @@ #include "kbox/identity.h" #include "kbox/path.h" #include "lkl-wrap.h" +#include "loader-launch.h" #include "net.h" #include "procmem.h" +#include "rewrite.h" #include "seccomp.h" #include "shadow-fd.h" #include "syscall-nr.h" +#include "syscall-trap-signal.h" +#include "syscall-trap.h" + +#define KBOX_FD_HOST_SAME_FD_SHADOW (-2) +#define KBOX_FD_LOCAL_ONLY_SHADOW (-3) +#define KBOX_LKL_FD_SHADOW_ONLY (-2) /* Argument extraction helpers. */ @@ -41,11 +51,399 @@ static inline int64_t to_c_long_arg(uint64_t v) return (int64_t) v; } +/* Static scratch buffer for I/O dispatch. The dispatcher is single-threaded + * and non-reentrant: only one syscall is dispatched at a time. Using a static + * buffer instead of malloc avoids heap allocation from the SIGSYS handler in + * trap/rewrite mode, where the guest may hold glibc heap locks. 
+ */ +static uint8_t dispatch_scratch[KBOX_IO_CHUNK_LEN]; + static inline long to_dirfd_arg(uint64_t v) { return (long) (int) (uint32_t) v; } +static int guest_mem_read(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + void *out, + size_t len); +static int guest_mem_write(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + const void *in, + size_t len); +static int try_cached_shadow_open_dispatch( + struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + long flags, + const char *translated, + struct kbox_dispatch *out); +static int try_cached_shadow_stat_dispatch(struct kbox_supervisor_ctx *ctx, + const char *translated, + uint64_t remote_stat, + pid_t pid); +static void invalidate_path_shadow_cache(struct kbox_supervisor_ctx *ctx); +static void invalidate_translated_path_cache(struct kbox_supervisor_ctx *ctx); +static int try_writeback_shadow_open(struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + long lkl_fd, + long flags, + const char *translated, + struct kbox_dispatch *out); +static void note_shadow_writeback_open(struct kbox_supervisor_ctx *ctx, + struct kbox_fd_entry *entry); +static void note_shadow_writeback_close(struct kbox_supervisor_ctx *ctx, + struct kbox_fd_entry *entry); + +static int request_uses_trap_signals(const struct kbox_syscall_request *req) +{ + return req && (req->source == KBOX_SYSCALL_SOURCE_TRAP || + req->source == KBOX_SYSCALL_SOURCE_REWRITE); +} + +static int request_blocks_reserved_sigsys( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + uint64_t set_ptr; + size_t sigset_size; + unsigned char mask[16]; + size_t read_len; + int rc; + + if (!req) + return 0; + set_ptr = kbox_syscall_request_arg(req, 1); + sigset_size = (size_t) kbox_syscall_request_arg(req, 3); + if (set_ptr == 0 || sigset_size == 0) + return 0; + + read_len = sigset_size; + if (read_len > sizeof(mask)) + read_len = sizeof(mask); + 
memset(mask, 0, sizeof(mask)); + + rc = guest_mem_read(ctx, kbox_syscall_request_pid(req), set_ptr, mask, + read_len); + if (rc < 0) + return rc; + + return kbox_syscall_trap_sigset_blocks_reserved(mask, read_len) ? 1 : 0; +} + +static struct kbox_fd_entry *fd_table_entry(struct kbox_fd_table *t, long fd) +{ + if (!t) + return NULL; + if (fd >= KBOX_FD_BASE && fd < KBOX_FD_BASE + KBOX_FD_TABLE_MAX) + return &t->entries[fd - KBOX_FD_BASE]; + if (fd >= 0 && fd < KBOX_LOW_FD_MAX) + return &t->low_fds[fd]; + return NULL; +} + +static struct kbox_dispatch emulate_trap_rt_sigprocmask( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long how = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + uint64_t set_ptr = kbox_syscall_request_arg(req, 1); + uint64_t old_ptr = kbox_syscall_request_arg(req, 2); + size_t sigset_size = (size_t) kbox_syscall_request_arg(req, 3); + unsigned char current[sizeof(sigset_t)]; + unsigned char next[sizeof(sigset_t)]; + unsigned char pending[sizeof(sigset_t)]; + unsigned char set_mask[sizeof(sigset_t)]; + size_t mask_len; + + if (sigset_size == 0 || sigset_size > sizeof(current)) + return kbox_dispatch_errno(EINVAL); + mask_len = sigset_size; + + /* In TRAP mode the signal mask lives in the ucontext delivered by the + * kernel; modifying it there takes effect when the handler returns. + * In REWRITE mode there is no ucontext -- the rewrite dispatch runs + * as a normal function call, so fall back to sigprocmask(2) directly. 
+ */ + if (kbox_syscall_trap_get_sigmask(current, sizeof(current)) < 0) { + sigset_t tmp; + if (sigprocmask(SIG_SETMASK, NULL, &tmp) < 0) + return kbox_dispatch_errno(EIO); + memcpy(current, &tmp, sizeof(current)); + } + + memset(set_mask, 0, sizeof(set_mask)); + memcpy(next, current, sizeof(next)); + + if (set_ptr != 0) { + int rc = guest_mem_read(ctx, kbox_syscall_request_pid(req), set_ptr, + set_mask, mask_len); + if (rc < 0) + return kbox_dispatch_errno(-rc); + } + + if (old_ptr != 0) { + int rc = guest_mem_write(ctx, kbox_syscall_request_pid(req), old_ptr, + current, mask_len); + if (rc < 0) + return kbox_dispatch_errno(-rc); + } + + if (set_ptr != 0) { + switch (how) { + case SIG_BLOCK: + for (size_t i = 0; i < mask_len; i++) + next[i] |= set_mask[i]; + break; + case SIG_UNBLOCK: + for (size_t i = 0; i < mask_len; i++) + next[i] &= (unsigned char) ~set_mask[i]; + break; + case SIG_SETMASK: + memcpy(next, set_mask, mask_len); + break; + default: + return kbox_dispatch_errno(EINVAL); + } + } + + if (kbox_syscall_trap_set_sigmask(next, sizeof(next)) < 0) { + sigset_t apply; + memcpy(&apply, next, sizeof(next)); + if (sigprocmask(SIG_SETMASK, &apply, NULL) < 0) + return kbox_dispatch_errno(EIO); + } + + if (kbox_syscall_trap_get_pending(pending, sizeof(pending)) == 0) { + for (size_t i = 0; i < sizeof(pending); i++) + pending[i] &= next[i]; + (void) kbox_syscall_trap_set_pending(pending, sizeof(pending)); + } + + return kbox_dispatch_value(0); +} + +static int trap_sigmask_contains_signal(int signo) +{ + sigset_t current; + + if (signo <= 0) + return 0; + if (kbox_syscall_trap_get_sigmask(&current, sizeof(current)) < 0) + return 0; + return sigismember(&current, signo) == 1; +} + +static struct kbox_dispatch emulate_trap_rt_sigpending( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + uint64_t set_ptr = kbox_syscall_request_arg(req, 0); + size_t sigset_size = (size_t) kbox_syscall_request_arg(req, 1); + unsigned char pending[sizeof(sigset_t)]; 
+ int rc; + + (void) ctx; + + if (set_ptr == 0) + return kbox_dispatch_errno(EFAULT); + if (sigset_size == 0 || sigset_size > sizeof(pending)) + return kbox_dispatch_errno(EINVAL); + if (kbox_syscall_trap_get_pending(pending, sizeof(pending)) < 0) + return kbox_dispatch_errno(EIO); + + rc = guest_mem_write(ctx, kbox_syscall_request_pid(req), set_ptr, pending, + sigset_size); + if (rc < 0) + return kbox_dispatch_errno(-rc); + return kbox_dispatch_value(0); +} + +struct kbox_fd_inject_ops { + int (*addfd)(const struct kbox_supervisor_ctx *ctx, + uint64_t cookie, + int srcfd, + uint32_t newfd_flags); + int (*addfd_at)(const struct kbox_supervisor_ctx *ctx, + uint64_t cookie, + int srcfd, + int target_fd, + uint32_t newfd_flags); +}; + +static int seccomp_request_addfd(const struct kbox_supervisor_ctx *ctx, + uint64_t cookie, + int srcfd, + uint32_t newfd_flags) +{ + return kbox_notify_addfd(ctx->listener_fd, cookie, srcfd, newfd_flags); +} + +static int seccomp_request_addfd_at(const struct kbox_supervisor_ctx *ctx, + uint64_t cookie, + int srcfd, + int target_fd, + uint32_t newfd_flags) +{ + return kbox_notify_addfd_at(ctx->listener_fd, cookie, srcfd, target_fd, + newfd_flags); +} + +static const struct kbox_fd_inject_ops seccomp_fd_inject_ops = { + .addfd = seccomp_request_addfd, + .addfd_at = seccomp_request_addfd_at, +}; + +static int local_request_addfd(const struct kbox_supervisor_ctx *ctx, + uint64_t cookie, + int srcfd, + uint32_t newfd_flags) +{ + int ret; + + (void) ctx; + (void) cookie; +#ifdef F_DUPFD_CLOEXEC + if (newfd_flags & O_CLOEXEC) { + ret = (int) kbox_syscall_trap_host_syscall6(SYS_fcntl, (uint64_t) srcfd, + (uint64_t) F_DUPFD_CLOEXEC, + 0, 0, 0, 0); + return ret >= 0 ? ret : -(int) -ret; + } +#endif + ret = (int) kbox_syscall_trap_host_syscall6(SYS_fcntl, (uint64_t) srcfd, + (uint64_t) F_DUPFD, 0, 0, 0, 0); + return ret >= 0 ? 
ret : -(int) -ret; +} + +static int local_request_addfd_at(const struct kbox_supervisor_ctx *ctx, + uint64_t cookie, + int srcfd, + int target_fd, + uint32_t newfd_flags) +{ + (void) ctx; + (void) cookie; +#ifdef __linux__ + { + int ret = (int) kbox_syscall_trap_host_syscall6( + SYS_dup3, (uint64_t) srcfd, (uint64_t) target_fd, + (uint64_t) ((newfd_flags & O_CLOEXEC) ? O_CLOEXEC : 0), 0, 0, 0); + return ret >= 0 ? ret : -(int) -ret; + } +#else + (void) srcfd; + (void) target_fd; + (void) newfd_flags; + return -ENOSYS; +#endif +} + +static const struct kbox_fd_inject_ops local_fd_inject_ops = { + .addfd = local_request_addfd, + .addfd_at = local_request_addfd_at, +}; + +static int request_addfd(const struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + int srcfd, + uint32_t newfd_flags) +{ + if (!ctx || !ctx->fd_inject_ops || !ctx->fd_inject_ops->addfd || !req) + return -EINVAL; + return ctx->fd_inject_ops->addfd(ctx, kbox_syscall_request_cookie(req), + srcfd, newfd_flags); +} + +static int request_addfd_at(const struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + int srcfd, + int target_fd, + uint32_t newfd_flags) +{ + if (!ctx || !ctx->fd_inject_ops || !ctx->fd_inject_ops->addfd_at || !req) + return -EINVAL; + return ctx->fd_inject_ops->addfd_at(ctx, kbox_syscall_request_cookie(req), + srcfd, target_fd, newfd_flags); +} + +void kbox_dispatch_prepare_request_ctx(struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req) +{ + if (!ctx || !req) + return; + + ctx->active_guest_mem = req->guest_mem; + if (!ctx->active_guest_mem.ops) { + ctx->active_guest_mem.ops = &kbox_process_vm_guest_mem_ops; + ctx->active_guest_mem.opaque = (uintptr_t) req->pid; + } + ctx->guest_mem_ops = ctx->active_guest_mem.ops; + if (!ctx->fd_inject_ops) { + if (req->source == KBOX_SYSCALL_SOURCE_TRAP || + req->source == KBOX_SYSCALL_SOURCE_REWRITE) { + ctx->fd_inject_ops = &local_fd_inject_ops; + } else { + ctx->fd_inject_ops = 
&seccomp_fd_inject_ops; + } + } +} + +static int guest_mem_read(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + void *out, + size_t len) +{ + (void) pid; + return kbox_guest_mem_read(&ctx->active_guest_mem, remote_addr, out, len); +} + +static int guest_mem_write(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + const void *in, + size_t len) +{ + (void) pid; + return kbox_guest_mem_write(&ctx->active_guest_mem, remote_addr, in, len); +} + +static int guest_mem_write_force(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + const void *in, + size_t len) +{ + (void) pid; + return kbox_guest_mem_write_force(&ctx->active_guest_mem, remote_addr, in, + len); +} + +static int guest_mem_read_string(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + char *buf, + size_t max_len) +{ + (void) pid; + return kbox_guest_mem_read_string(&ctx->active_guest_mem, remote_addr, buf, + max_len); +} + +static int guest_mem_read_open_how(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + uint64_t size, + struct kbox_open_how *out) +{ + (void) pid; + + return kbox_guest_mem_read_open_how(&ctx->active_guest_mem, remote_addr, + size, out); +} + /* Open-flag ABI translation (aarch64 host <-> asm-generic LKL). 
*/ /* aarch64 and asm-generic define four O_* flags differently: @@ -207,40 +605,218 @@ static long resolve_open_dirfd(const char *path, return kbox_fd_table_get_lkl(table, dirfd); } -static int read_guest_string(pid_t pid, uint64_t addr, char *buf, size_t size) +static int read_guest_string(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t addr, + char *buf, + size_t size) +{ + return guest_mem_read_string(ctx, pid, addr, buf, size); +} + +static struct kbox_translated_path_cache_entry *find_translated_path_cache( + struct kbox_supervisor_ctx *ctx, + const char *guest_path) +{ + size_t i; + + if (!ctx || !guest_path) + return NULL; + for (i = 0; i < KBOX_TRANSLATED_PATH_CACHE_MAX; i++) { + struct kbox_translated_path_cache_entry *entry = + &ctx->translated_path_cache[i]; + if (entry->valid && + entry->generation == ctx->path_translation_generation && + strcmp(entry->guest_path, guest_path) == 0) { + return entry; + } + } + return NULL; +} + +static struct kbox_translated_path_cache_entry *reserve_translated_path_cache( + struct kbox_supervisor_ctx *ctx) +{ + size_t i; + + if (!ctx) + return NULL; + for (i = 0; i < KBOX_TRANSLATED_PATH_CACHE_MAX; i++) { + if (!ctx->translated_path_cache[i].valid) + return &ctx->translated_path_cache[i]; + } + return &ctx->translated_path_cache[0]; +} + +static struct kbox_literal_path_cache_entry *find_literal_path_cache( + struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t guest_addr) +{ + size_t i; + + if (!ctx || guest_addr == 0) + return NULL; + for (i = 0; i < KBOX_LITERAL_PATH_CACHE_MAX; i++) { + struct kbox_literal_path_cache_entry *entry = + &ctx->literal_path_cache[i]; + if (entry->valid && + entry->generation == ctx->path_translation_generation && + entry->pid == pid && entry->guest_addr == guest_addr) { + return entry; + } + } + return NULL; +} + +static struct kbox_literal_path_cache_entry *reserve_literal_path_cache( + struct kbox_supervisor_ctx *ctx) +{ + size_t i; + + if (!ctx) + return NULL; + for 
(i = 0; i < KBOX_LITERAL_PATH_CACHE_MAX; i++) { + if (!ctx->literal_path_cache[i].valid) + return &ctx->literal_path_cache[i]; + } + return &ctx->literal_path_cache[0]; +} + +static int guest_addr_is_writable(pid_t pid, uint64_t addr) +{ + char maps_path[64]; + FILE *fp; + char line[256]; + + snprintf(maps_path, sizeof(maps_path), "/proc/%d/maps", (int) pid); + fp = fopen(maps_path, "re"); + if (!fp) + return 1; + + while (fgets(line, sizeof(line), fp)) { + unsigned long long start, end; + char perms[8]; + + if (sscanf(line, "%llx-%llx %7s", &start, &end, perms) != 3) + continue; + if (addr < start || addr >= end) + continue; + fclose(fp); + return strchr(perms, 'w') != NULL; + } + + fclose(fp); + return 1; +} + +static void invalidate_translated_path_cache(struct kbox_supervisor_ctx *ctx) { - return kbox_vm_read_string(pid, addr, buf, size); + size_t i; + + if (!ctx) + return; + ctx->path_translation_generation++; + for (i = 0; i < KBOX_TRANSLATED_PATH_CACHE_MAX; i++) + ctx->translated_path_cache[i].valid = 0; + for (i = 0; i < KBOX_LITERAL_PATH_CACHE_MAX; i++) + ctx->literal_path_cache[i].valid = 0; } -static int translate_guest_path(pid_t pid, +static int translate_guest_path(const struct kbox_supervisor_ctx *ctx, + pid_t pid, uint64_t addr, const char *host_root, char *translated, size_t size) { + struct kbox_supervisor_ctx *mutable_ctx = + (struct kbox_supervisor_ctx *) ctx; char pathbuf[KBOX_MAX_PATH]; - int rc = read_guest_string(pid, addr, pathbuf, sizeof(pathbuf)); + struct kbox_literal_path_cache_entry *literal_entry; + struct kbox_translated_path_cache_entry *entry; + + literal_entry = find_literal_path_cache(mutable_ctx, pid, addr); + if (literal_entry) { + size_t len = strlen(literal_entry->translated); + + if (len >= size) + return -ENAMETOOLONG; + memcpy(translated, literal_entry->translated, len + 1); + return 0; + } + + int rc = read_guest_string(ctx, pid, addr, pathbuf, sizeof(pathbuf)); + if (rc < 0) + return rc; + + entry = 
find_translated_path_cache(mutable_ctx, pathbuf); + if (entry) { + if (strlen(entry->translated) >= size) + return -ENAMETOOLONG; + memcpy(translated, entry->translated, strlen(entry->translated) + 1); + return 0; + } + + rc = kbox_translate_path_for_lkl(pid, pathbuf, host_root, translated, size); if (rc < 0) return rc; - return kbox_translate_path_for_lkl(pid, pathbuf, host_root, translated, - size); + + entry = reserve_translated_path_cache(mutable_ctx); + if (entry) { + entry->valid = 1; + entry->generation = mutable_ctx->path_translation_generation; + strncpy(entry->guest_path, pathbuf, sizeof(entry->guest_path) - 1); + entry->guest_path[sizeof(entry->guest_path) - 1] = '\0'; + strncpy(entry->translated, translated, sizeof(entry->translated) - 1); + entry->translated[sizeof(entry->translated) - 1] = '\0'; + } + + if (!guest_addr_is_writable(pid, addr)) { + literal_entry = reserve_literal_path_cache(mutable_ctx); + if (literal_entry) { + literal_entry->valid = 1; + literal_entry->generation = + mutable_ctx->path_translation_generation; + literal_entry->pid = pid; + literal_entry->guest_addr = addr; + strncpy(literal_entry->translated, translated, + sizeof(literal_entry->translated) - 1); + literal_entry->translated[sizeof(literal_entry->translated) - 1] = + '\0'; + } + } + return 0; } -static int translate_guest_at_path(const struct kbox_seccomp_notif *notif, - struct kbox_supervisor_ctx *ctx, - size_t dirfd_idx, - size_t path_idx, - char *translated, - size_t size, - long *lkl_dirfd) +static int translate_request_path(const struct kbox_syscall_request *req, + const struct kbox_supervisor_ctx *ctx, + size_t path_idx, + const char *host_root, + char *translated, + size_t size) { - int rc = translate_guest_path(notif->pid, notif->data.args[path_idx], - ctx->host_root, translated, size); + return translate_guest_path(ctx, kbox_syscall_request_pid(req), + kbox_syscall_request_arg(req, path_idx), + host_root, translated, size); +} + +static int 
translate_request_at_path(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + size_t dirfd_idx, + size_t path_idx, + char *translated, + size_t size, + long *lkl_dirfd) +{ + int rc = translate_request_path(req, ctx, path_idx, ctx->host_root, + translated, size); if (rc < 0) return rc; *lkl_dirfd = resolve_open_dirfd( - translated, to_dirfd_arg(notif->data.args[dirfd_idx]), ctx->fd_table); + translated, to_dirfd_arg(kbox_syscall_request_arg(req, dirfd_idx)), + ctx->fd_table); return 0; } @@ -249,55 +825,328 @@ static int should_continue_for_dirfd(long lkl_dirfd) return lkl_dirfd < 0 && lkl_dirfd != AT_FDCWD_LINUX; } -/* Attempt to create a shadow memfd for an O_RDONLY regular file and inject - * it into the tracee. On success, records the host_fd in the FD table and - * returns the host-visible FD number via *out_fd. +static int child_fd_is_open(const struct kbox_supervisor_ctx *ctx, long fd) +{ + char link_path[64]; + char target[1]; + + if (!ctx || ctx->child_pid <= 0 || fd < 0) + return 0; + snprintf(link_path, sizeof(link_path), "/proc/%d/fd/%ld", + (int) ctx->child_pid, fd); + if (readlink(link_path, target, sizeof(target)) >= 0) + return 1; + return errno != ENOENT; +} + +static long allocate_passthrough_hostonly_fd(struct kbox_supervisor_ctx *ctx) +{ + long base_fd = KBOX_FD_HOSTONLY_BASE; + long end_fd = KBOX_FD_BASE + KBOX_FD_TABLE_MAX; + long start_fd; + long fd; + + if (!ctx || !ctx->fd_table) + return -1; + + start_fd = ctx->fd_table->next_hostonly_fd; + if (start_fd < base_fd || start_fd >= end_fd) + start_fd = base_fd; + + for (fd = start_fd; fd < end_fd; fd++) { + if (!child_fd_is_open(ctx, fd)) { + ctx->fd_table->next_hostonly_fd = fd + 1; + return fd; + } + } + for (fd = base_fd; fd < start_fd; fd++) { + if (!child_fd_is_open(ctx, fd)) { + ctx->fd_table->next_hostonly_fd = fd + 1; + return fd; + } + } + + return -1; +} + +static long next_hostonly_fd_hint(const struct kbox_supervisor_ctx *ctx) +{ + long fd; + long end_fd = 
KBOX_FD_BASE + KBOX_FD_TABLE_MAX; + + if (!ctx || !ctx->fd_table) + return -1; + + fd = ctx->fd_table->next_hostonly_fd; + if (fd < KBOX_FD_HOSTONLY_BASE || fd >= end_fd) + fd = KBOX_FD_HOSTONLY_BASE; + return fd; +} + +static int ensure_proc_self_fd_dir(struct kbox_supervisor_ctx *ctx) +{ + if (!ctx) + return -1; + if (ctx->proc_self_fd_dirfd >= 0) + return ctx->proc_self_fd_dirfd; + + ctx->proc_self_fd_dirfd = + open("/proc/self/fd", O_RDONLY | O_DIRECTORY | O_CLOEXEC); + return ctx->proc_self_fd_dirfd; +} + +static int ensure_proc_mem_fd(struct kbox_supervisor_ctx *ctx) +{ + char path[64]; + + if (!ctx || ctx->child_pid <= 0) + return -1; + if (ctx->proc_mem_fd >= 0) + return ctx->proc_mem_fd; + + snprintf(path, sizeof(path), "/proc/%d/mem", (int) ctx->child_pid); + ctx->proc_mem_fd = open(path, O_RDWR | O_CLOEXEC); + return ctx->proc_mem_fd; +} + +static int guest_mem_write_small_metadata(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + const void *in, + size_t len) +{ + struct kbox_supervisor_ctx *mutable_ctx = + (struct kbox_supervisor_ctx *) ctx; + ssize_t n; + int fd; + + if (!ctx || !in) + return -EFAULT; + if (len == 0) + return 0; + if (remote_addr == 0) + return -EFAULT; + if (pid != ctx->child_pid || + ctx->active_guest_mem.ops != &kbox_process_vm_guest_mem_ops) + return guest_mem_write(ctx, pid, remote_addr, in, len); + + fd = ensure_proc_mem_fd(mutable_ctx); + if (fd < 0) + return guest_mem_write(ctx, pid, remote_addr, in, len); + + n = pwrite(fd, in, len, (off_t) remote_addr); + if (n < 0) + return guest_mem_write(ctx, pid, remote_addr, in, len); + if ((size_t) n != len) + return -EIO; + return 0; +} + +static int reopen_cached_shadow_fd( + struct kbox_supervisor_ctx *ctx, + const struct kbox_path_shadow_cache_entry *entry) +{ + char fd_name[32]; + int dirfd; + int fd; + + if (!entry) + return -1; + if (entry->path[0] != '\0') { + fd = open(entry->path, O_RDONLY | O_CLOEXEC); + if (fd >= 0) + return fd; + } + fd = 
entry->memfd; + if (fd < 0) + return -1; + dirfd = ensure_proc_self_fd_dir(ctx); + if (dirfd < 0) + return -1; + snprintf(fd_name, sizeof(fd_name), "%d", fd); + return openat(dirfd, fd_name, O_RDONLY | O_CLOEXEC); +} + +/* Promote a read-only regular LKL FD to a host-visible shadow at the same + * guest FD number on first eligible read-only access. This avoids paying the + * memfd copy cost at open time while still letting later read/lseek/fstat/mmap + * operations run on a real host FD. * - * Returns 1 if shadowing succeeded (caller should return *out_fd), 0 if - * shadowing is not applicable or failed (caller falls through to virtual - * FD path). + * Returns: + * 1 shadow is available (same-fd injected for seccomp, local-only for + * trap/rewrite) + * 0 shadow promotion not applicable + * -1 promotion attempted but failed */ -static int try_shadow_open(struct kbox_supervisor_ctx *ctx, - const struct kbox_seccomp_notif *notif, - long lkl_fd, - long flags, - const char *translated, - long *out_fd) -{ - /* Only shadow O_RDONLY opens of non-virtual, non-TTY paths. 
*/ - if ((flags & O_ACCMODE) != O_RDONLY) +static int ensure_same_fd_shadow(struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + long fd, + long lkl_fd) +{ + struct kbox_fd_entry *entry; + long flags; + int memfd; + int injected; + + off_t cur_off; + + if (!ctx || !req || !ctx->fd_table || fd < 0 || lkl_fd < 0) return 0; - if (kbox_is_lkl_virtual_path(translated)) + + entry = fd_table_entry(ctx->fd_table, fd); + if (!entry) return 0; - if (kbox_is_tty_like_path(translated)) + if (entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW || + entry->host_fd == KBOX_FD_LOCAL_ONLY_SHADOW) { + return 1; + } + if (entry->host_fd >= 0) + return 0; + + flags = kbox_lkl_fcntl(ctx->sysnrs, lkl_fd, F_GETFL, 0); + if (flags < 0 || (flags & O_ACCMODE) != O_RDONLY) return 0; - int memfd = kbox_shadow_create(ctx->sysnrs, lkl_fd); + memfd = kbox_shadow_create(ctx->sysnrs, lkl_fd); if (memfd < 0) - return 0; /* not shadowable; fall through to virtual FD */ + return -1; - /* Inject the memfd into the tracee. The tracee gets its own FD - * pointing at the same memfd; we close our copy afterward. - */ - int host_fd = kbox_notify_addfd(ctx->listener_fd, notif->id, memfd, 0); - if (host_fd < 0) { + cur_off = (off_t) kbox_lkl_lseek(ctx->sysnrs, lkl_fd, 0, SEEK_CUR); + if (cur_off >= 0 && lseek(memfd, cur_off, SEEK_SET) < 0) { close(memfd); - return 0; + return -1; } - /* Track in FD table: virtual FD holds both lkl_fd and host_fd. */ - long vfd = kbox_fd_table_insert(ctx->fd_table, lkl_fd, 0); - if (vfd < 0) { - close(memfd); - kbox_lkl_close(ctx->sysnrs, lkl_fd); - *out_fd = -EMFILE; - return 1; + if (req->source == KBOX_SYSCALL_SOURCE_SECCOMP) { + injected = request_addfd_at(ctx, req, memfd, (int) fd, + entry->cloexec ? 
O_CLOEXEC : 0); + if (injected < 0) { + close(memfd); + return -1; + } + entry->host_fd = KBOX_FD_HOST_SAME_FD_SHADOW; + } else { + entry->host_fd = KBOX_FD_LOCAL_ONLY_SHADOW; + } + entry->shadow_sp = memfd; + entry->shadow_writeback = 0; + + if (ctx->verbose) { + fprintf(stderr, "kbox: lazy shadow promote fd=%ld lkl_fd=%ld mode=%s\n", + fd, lkl_fd, + entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW ? "same-fd" + : "local-only"); + } + return 1; +} + +static struct kbox_dispatch forward_local_shadow_read_like( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + struct kbox_fd_entry *entry, + long lkl_fd, + int is_pread) +{ + uint64_t remote_buf = kbox_syscall_request_arg(req, 1); + int64_t count_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + size_t count; + size_t total = 0; + uint8_t *scratch = dispatch_scratch; + pid_t pid = kbox_syscall_request_pid(req); + + if (!entry || entry->shadow_sp < 0) + return kbox_dispatch_continue(); + if (count_raw < 0) + return kbox_dispatch_errno(EINVAL); + if (remote_buf == 0) + return kbox_dispatch_errno(EFAULT); + count = (size_t) count_raw; + if (count == 0) + return kbox_dispatch_value(0); + if (count > 1024 * 1024) + count = 1024 * 1024; + + while (total < count) { + size_t chunk_len = KBOX_IO_CHUNK_LEN; + ssize_t nr; + + if (chunk_len > count - total) + chunk_len = count - total; + if (is_pread) { + long offset = to_c_long_arg(kbox_syscall_request_arg(req, 3)); + nr = pread(entry->shadow_sp, scratch, chunk_len, + (off_t) (offset + (long) total)); + } else { + nr = read(entry->shadow_sp, scratch, chunk_len); + } + if (nr < 0) { + if (total == 0) + return kbox_dispatch_errno(errno); + break; + } + if (nr == 0) + break; + if (guest_mem_write(ctx, pid, remote_buf + total, scratch, + (size_t) nr) < 0) { + return kbox_dispatch_errno(EFAULT); + } + total += (size_t) nr; + if ((size_t) nr < chunk_len) + break; + } + + if (!is_pread) { + off_t cur_off = lseek(entry->shadow_sp, 0, SEEK_CUR); + if 
(cur_off >= 0) + (void) kbox_lkl_lseek(ctx->sysnrs, lkl_fd, (long) cur_off, + SEEK_SET); + } + + return kbox_dispatch_value((int64_t) total); +} + +static struct kbox_dispatch forward_local_shadow_lseek( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + struct kbox_fd_entry *entry, + long lkl_fd) +{ + long off; + long whence; + off_t ret; + + if (!entry || entry->shadow_sp < 0) + return kbox_dispatch_continue(); + + off = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + whence = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + ret = lseek(entry->shadow_sp, (off_t) off, (int) whence); + if (ret < 0) + return kbox_dispatch_errno(errno); + + (void) kbox_lkl_lseek(ctx->sysnrs, lkl_fd, (long) ret, SEEK_SET); + return kbox_dispatch_value((int64_t) ret); +} + +static struct kbox_dispatch forward_local_shadow_fstat( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + struct kbox_fd_entry *entry) +{ + struct stat host_stat; + uint64_t remote_stat = kbox_syscall_request_arg(req, 1); + + if (!entry || entry->shadow_sp < 0) + return kbox_dispatch_continue(); + if (remote_stat == 0) + return kbox_dispatch_errno(EFAULT); + if (fstat(entry->shadow_sp, &host_stat) < 0) + return kbox_dispatch_errno(errno); + if (guest_mem_write(ctx, kbox_syscall_request_pid(req), remote_stat, + &host_stat, sizeof(host_stat)) < 0) { + return kbox_dispatch_errno(EFAULT); } - kbox_fd_table_set_host_fd(ctx->fd_table, vfd, (long) host_fd); - - close(memfd); /* tracee has its own copy */ - *out_fd = (long) host_fd; - return 1; + return kbox_dispatch_value(0); } /* statx struct field offsets (standard on x86_64 and aarch64). 
*/ @@ -308,17 +1157,21 @@ static int try_shadow_open(struct kbox_supervisor_ctx *ctx, static struct kbox_dispatch finish_open_dispatch( struct kbox_supervisor_ctx *ctx, - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, long lkl_fd, long flags, const char *translated) { - long shadow_fd; + struct kbox_dispatch shadow_dispatch; - if (try_shadow_open(ctx, notif, lkl_fd, flags, translated, &shadow_fd)) { - if (shadow_fd < 0) - return kbox_dispatch_errno((int) (-shadow_fd)); - return kbox_dispatch_value((int64_t) shadow_fd); + if (req && try_cached_shadow_open_dispatch(ctx, req, flags, translated, + &shadow_dispatch)) { + return shadow_dispatch; + } + + if (req && try_writeback_shadow_open(ctx, req, lkl_fd, flags, translated, + &shadow_dispatch)) { + return shadow_dispatch; } long vfd = kbox_fd_table_insert(ctx->fd_table, lkl_fd, @@ -327,6 +1180,8 @@ static struct kbox_dispatch finish_open_dispatch( kbox_lkl_close(ctx->sysnrs, lkl_fd); return kbox_dispatch_errno(EMFILE); } + if (flags & O_CLOEXEC) + kbox_fd_table_set_cloexec(ctx->fd_table, vfd, 1); return kbox_dispatch_value((int64_t) vfd); } @@ -363,20 +1218,276 @@ static void normalize_statx_if_needed(struct kbox_supervisor_ctx *ctx, memcpy(&statx_buf[STATX_GID_OFFSET], &n_gid, 4); } +static void invalidate_path_shadow_cache(struct kbox_supervisor_ctx *ctx) +{ + size_t i; + + if (!ctx) + return; + for (i = 0; i < KBOX_PATH_SHADOW_CACHE_MAX; i++) { + if (ctx->path_shadow_cache[i].valid && + ctx->path_shadow_cache[i].memfd >= 0) { + close(ctx->path_shadow_cache[i].memfd); + } + memset(&ctx->path_shadow_cache[i], 0, + sizeof(ctx->path_shadow_cache[i])); + ctx->path_shadow_cache[i].memfd = -1; + } + invalidate_translated_path_cache(ctx); +} + +static struct kbox_path_shadow_cache_entry *find_path_shadow_cache( + struct kbox_supervisor_ctx *ctx, + const char *translated) +{ + size_t i; + + if (!ctx || !translated) + return NULL; + for (i = 0; i < KBOX_PATH_SHADOW_CACHE_MAX; i++) { + struct 
kbox_path_shadow_cache_entry *entry = &ctx->path_shadow_cache[i]; + if (entry->valid && strcmp(entry->path, translated) == 0) + return entry; + } + return NULL; +} + +static struct kbox_path_shadow_cache_entry *reserve_path_shadow_cache_slot( + struct kbox_supervisor_ctx *ctx, + const char *translated) +{ + size_t i; + struct kbox_path_shadow_cache_entry *entry; + + entry = find_path_shadow_cache(ctx, translated); + if (entry) + return entry; + + for (i = 0; i < KBOX_PATH_SHADOW_CACHE_MAX; i++) { + entry = &ctx->path_shadow_cache[i]; + if (!entry->valid) + return entry; + } + + entry = &ctx->path_shadow_cache[0]; + if (entry->memfd >= 0) + close(entry->memfd); + memset(entry, 0, sizeof(*entry)); + entry->memfd = -1; + return entry; +} + +static int ensure_path_shadow_cache(struct kbox_supervisor_ctx *ctx, + const char *translated) +{ + struct kbox_path_shadow_cache_entry *entry; + struct stat host_stat; + int host_fd; + + if (!ctx || !translated || translated[0] == '\0' || + ctx->active_writeback_shadows > 0 || + kbox_is_lkl_virtual_path(translated) || + kbox_is_tty_like_path(translated)) + return 0; + + entry = find_path_shadow_cache(ctx, translated); + if (entry) + return 1; + + host_fd = open(translated, O_RDONLY | O_CLOEXEC); + if (host_fd < 0) + return 0; + + if (fstat(host_fd, &host_stat) < 0) { + close(host_fd); + return 0; + } + if (!S_ISREG(host_stat.st_mode)) { + close(host_fd); + return 0; + } + normalize_host_stat_if_needed(ctx, translated, &host_stat); + + entry = reserve_path_shadow_cache_slot(ctx, translated); + if (!entry) { + close(host_fd); + return 0; + } + + entry->valid = 1; + entry->memfd = host_fd; + strncpy(entry->path, translated, sizeof(entry->path) - 1); + entry->path[sizeof(entry->path) - 1] = '\0'; + entry->host_stat = host_stat; + return 1; +} + +static int try_cached_shadow_open_dispatch( + struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + long flags, + const char *translated, + struct kbox_dispatch *out) +{ 
+ struct kbox_path_shadow_cache_entry *entry; + int injected; + int dup_fd; + long fast_fd; + + if (!ctx || !req || !translated || !out) + return 0; + if ((flags & O_ACCMODE) != O_RDONLY) + return 0; + if (flags & ~(O_RDONLY | O_CLOEXEC)) + return 0; + if (!ensure_path_shadow_cache(ctx, translated)) + return 0; + + entry = find_path_shadow_cache(ctx, translated); + if (!entry || entry->memfd < 0) + return 0; + + dup_fd = reopen_cached_shadow_fd(ctx, entry); + if (dup_fd < 0) + return 0; + + fast_fd = next_hostonly_fd_hint(ctx); + if (fast_fd < 0) { + close(dup_fd); + return 0; + } + injected = request_addfd_at(ctx, req, dup_fd, (int) fast_fd, + (flags & O_CLOEXEC) ? O_CLOEXEC : 0); + if (injected < 0) { + fast_fd = allocate_passthrough_hostonly_fd(ctx); + if (fast_fd < 0) { + close(dup_fd); + return 0; + } + injected = request_addfd_at(ctx, req, dup_fd, (int) fast_fd, + (flags & O_CLOEXEC) ? O_CLOEXEC : 0); + } + close(dup_fd); + if (injected < 0) + return 0; + /* Advance past the fd just injected: the next hint must not reuse it, + * or a later addfd_at (dup2-like SETFD) would silently clobber it. + */ + ctx->fd_table->next_hostonly_fd = fast_fd + 1; + + *out = kbox_dispatch_value((int64_t) fast_fd); + return 1; +} + +static int try_cached_shadow_stat_dispatch(struct kbox_supervisor_ctx *ctx, + const char *translated, + uint64_t remote_stat, + pid_t pid) +{ + struct kbox_path_shadow_cache_entry *entry; + + if (!ctx || !translated || remote_stat == 0) + return 0; + if (!ensure_path_shadow_cache(ctx, translated)) + return 0; + + entry = find_path_shadow_cache(ctx, translated); + if (!entry) + return 0; + + return guest_mem_write_small_metadata(ctx, pid, remote_stat, + &entry->host_stat, + sizeof(entry->host_stat)) == 0; +} + +static void note_shadow_writeback_open(struct kbox_supervisor_ctx *ctx, + struct kbox_fd_entry *entry) +{ + if (!ctx || !entry || entry->shadow_writeback) + return; + entry->shadow_writeback = 1; + ctx->active_writeback_shadows++; + invalidate_path_shadow_cache(ctx); +} + +static void note_shadow_writeback_close(struct kbox_supervisor_ctx *ctx, + struct kbox_fd_entry *entry) +{ + if (!ctx || 
!entry || !entry->shadow_writeback) + return; + entry->shadow_writeback = 0; + if (ctx->active_writeback_shadows > 0) + ctx->active_writeback_shadows--; +} + +static int try_writeback_shadow_open(struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + long lkl_fd, + long flags, + const char *translated, + struct kbox_dispatch *out) +{ + struct kbox_fd_entry *entry; + int memfd; + int injected; + long fast_fd; + + if (!ctx || !req || !out || lkl_fd < 0 || !translated) + return 0; + if ((flags & O_ACCMODE) == O_RDONLY) + return 0; + if (kbox_is_lkl_virtual_path(translated) || + kbox_is_tty_like_path(translated)) + return 0; + + memfd = kbox_shadow_create(ctx->sysnrs, lkl_fd); + if (memfd < 0) + return 0; + + fast_fd = kbox_fd_table_insert_fast(ctx->fd_table, lkl_fd, 0); + if (fast_fd < 0) { + close(memfd); + return 0; + } + + injected = request_addfd_at(ctx, req, memfd, (int) fast_fd, + (flags & O_CLOEXEC) ? O_CLOEXEC : 0); + if (injected < 0) { + kbox_fd_table_remove(ctx->fd_table, fast_fd); + close(memfd); + return 0; + } + + entry = fd_table_entry(ctx->fd_table, fast_fd); + if (!entry) { + kbox_fd_table_remove(ctx->fd_table, fast_fd); + close(memfd); + return 0; + } + + entry->host_fd = KBOX_FD_HOST_SAME_FD_SHADOW; + entry->shadow_sp = memfd; + note_shadow_writeback_open(ctx, entry); + if (ctx->verbose) { + fprintf(stderr, + "kbox: writable shadow promote fd=%ld lkl_fd=%ld path=%s\n", + fast_fd, lkl_fd, translated); + } + *out = kbox_dispatch_value((int64_t) fast_fd); + return 1; +} + typedef long (*kbox_getdents_fn)(const struct kbox_sysnrs *sysnrs, long fd, void *buf, long count); static struct kbox_dispatch forward_getdents_common( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx, kbox_getdents_fn getdents_fn) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); 
- uint64_t remote_dirp = notif->data.args[1]; - int64_t count_raw = to_c_long_arg(notif->data.args[2]); + uint64_t remote_dirp = kbox_syscall_request_arg(req, 1); + int64_t count_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); size_t count, n; uint8_t *buf; long ret; @@ -392,27 +1503,21 @@ static struct kbox_dispatch forward_getdents_common( return kbox_dispatch_value(0); if (remote_dirp == 0) return kbox_dispatch_errno(EFAULT); - if (count > 1024 * 1024) - count = 1024 * 1024; + if (count > KBOX_IO_CHUNK_LEN) + count = KBOX_IO_CHUNK_LEN; - buf = malloc(count); - if (!buf) - return kbox_dispatch_errno(ENOMEM); + buf = dispatch_scratch; ret = getdents_fn(ctx->sysnrs, lkl_fd, buf, (long) count); - if (ret < 0) { - free(buf); + if (ret < 0) return kbox_dispatch_errno((int) (-ret)); - } n = (size_t) ret; - if (n > count) { - free(buf); + if (n > count) return kbox_dispatch_errno(EIO); - } - wrc = kbox_vm_write(notif->pid, remote_dirp, buf, n); - free(buf); + wrc = guest_mem_write(ctx, kbox_syscall_request_pid(req), remote_dirp, buf, + n); if (wrc < 0) return kbox_dispatch_errno(-wrc); return kbox_dispatch_value((int64_t) n); @@ -421,18 +1526,19 @@ static struct kbox_dispatch forward_getdents_common( /* forward_openat. 
*/ static struct kbox_dispatch forward_openat( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; long lkl_dirfd; - int rc = translate_guest_at_path(notif, ctx, 0, 1, translated, - sizeof(translated), &lkl_dirfd); + int rc = translate_request_at_path(req, ctx, 0, 1, translated, + sizeof(translated), &lkl_dirfd); if (rc < 0) return kbox_dispatch_errno(-rc); - long flags = host_to_lkl_open_flags(to_c_long_arg(notif->data.args[2])); - long mode = to_c_long_arg(notif->data.args[3]); + long flags = + host_to_lkl_open_flags(to_c_long_arg(kbox_syscall_request_arg(req, 2))); + long mode = to_c_long_arg(kbox_syscall_request_arg(req, 3)); if (kbox_is_lkl_virtual_path(translated)) return kbox_dispatch_continue(); @@ -442,28 +1548,38 @@ static struct kbox_dispatch forward_openat( if (should_continue_for_dirfd(lkl_dirfd)) return kbox_dispatch_continue(); + { + struct kbox_dispatch cached_dispatch; + if (try_cached_shadow_open_dispatch(ctx, req, flags, translated, + &cached_dispatch)) + return cached_dispatch; + } + long ret = kbox_lkl_openat(ctx->sysnrs, lkl_dirfd, translated, flags, mode); if (ret < 0) return kbox_dispatch_errno((int) (-ret)); - return finish_open_dispatch(ctx, notif, ret, flags, translated); + if ((flags & O_ACCMODE) != O_RDONLY || (flags & O_TRUNC)) + invalidate_path_shadow_cache(ctx); + return finish_open_dispatch(ctx, req, ret, flags, translated); } /* forward_openat2. 
*/ static struct kbox_dispatch forward_openat2( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; long lkl_dirfd; - int rc = translate_guest_at_path(notif, ctx, 0, 1, translated, - sizeof(translated), &lkl_dirfd); + int rc = translate_request_at_path(req, ctx, 0, 1, translated, + sizeof(translated), &lkl_dirfd); if (rc < 0) return kbox_dispatch_errno(-rc); struct kbox_open_how how; - rc = kbox_vm_read_open_how(notif->pid, notif->data.args[2], - notif->data.args[3], &how); + rc = guest_mem_read_open_how(ctx, kbox_syscall_request_pid(req), + kbox_syscall_request_arg(req, 2), + kbox_syscall_request_arg(req, 3), &how); if (rc < 0) return kbox_dispatch_errno(-rc); how.flags = (uint64_t) host_to_lkl_open_flags((long) how.flags); @@ -476,6 +1592,14 @@ static struct kbox_dispatch forward_openat2( if (should_continue_for_dirfd(lkl_dirfd)) return kbox_dispatch_continue(); + if (((long) how.flags & O_ACCMODE) == O_RDONLY) { + struct kbox_dispatch cached_dispatch; + if (try_cached_shadow_open_dispatch(ctx, req, (long) how.flags, + translated, &cached_dispatch)) { + return cached_dispatch; + } + } + long ret = kbox_lkl_openat2(ctx->sysnrs, lkl_dirfd, translated, &how, (long) sizeof(how)); if (ret == -ENOSYS) { @@ -486,47 +1610,119 @@ static struct kbox_dispatch forward_openat2( } if (ret < 0) return kbox_dispatch_errno((int) (-ret)); - return finish_open_dispatch(ctx, notif, ret, (long) how.flags, translated); + if (((long) how.flags & O_ACCMODE) != O_RDONLY || + ((long) how.flags & O_TRUNC)) { + invalidate_path_shadow_cache(ctx); + } + return finish_open_dispatch(ctx, req, ret, (long) how.flags, translated); } /* forward_open_legacy (x86_64 open(2), nr=2). 
*/ static struct kbox_dispatch forward_open_legacy( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; - int rc = - translate_guest_path(notif->pid, notif->data.args[0], ctx->host_root, - translated, sizeof(translated)); + int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, + sizeof(translated)); if (rc < 0) return kbox_dispatch_errno(-rc); - long flags = host_to_lkl_open_flags(to_c_long_arg(notif->data.args[1])); - long mode = to_c_long_arg(notif->data.args[2]); + long flags = + host_to_lkl_open_flags(to_c_long_arg(kbox_syscall_request_arg(req, 1))); + long mode = to_c_long_arg(kbox_syscall_request_arg(req, 2)); if (kbox_is_lkl_virtual_path(translated)) return kbox_dispatch_continue(); if (kbox_is_tty_like_path(translated)) return kbox_dispatch_continue(); + { + struct kbox_dispatch cached_dispatch; + if (try_cached_shadow_open_dispatch(ctx, req, flags, translated, + &cached_dispatch)) + return cached_dispatch; + } + long ret = kbox_lkl_openat(ctx->sysnrs, AT_FDCWD_LINUX, translated, flags, mode); if (ret < 0) return kbox_dispatch_errno((int) (-ret)); - return finish_open_dispatch(ctx, notif, ret, flags, translated); + if ((flags & O_ACCMODE) != O_RDONLY || (flags & O_TRUNC)) + invalidate_path_shadow_cache(ctx); + return finish_open_dispatch(ctx, req, ret, flags, translated); +} + +static int sync_shadow_writeback(struct kbox_supervisor_ctx *ctx, + struct kbox_fd_entry *entry) +{ + struct stat st; + uint8_t *buf = NULL; + off_t off = 0; + + if (!ctx || !entry || !entry->shadow_writeback || entry->shadow_sp < 0 || + entry->lkl_fd < 0) + return 0; + + if (fstat(entry->shadow_sp, &st) < 0) + return -errno; + if (kbox_lkl_ftruncate(ctx->sysnrs, entry->lkl_fd, (long) st.st_size) < 0) + return -EIO; + if (lseek(entry->shadow_sp, 0, SEEK_SET) < 0) + return -errno; + + buf = dispatch_scratch; + + while (off < st.st_size) { + size_t chunk = 
KBOX_IO_CHUNK_LEN; + ssize_t rd; + long wr; + + if ((off_t) chunk > st.st_size - off) + chunk = (size_t) (st.st_size - off); + rd = read(entry->shadow_sp, buf, chunk); + if (rd < 0) + return -errno; + if (rd == 0) + break; + wr = kbox_lkl_pwrite64(ctx->sysnrs, entry->lkl_fd, buf, (long) rd, + (long) off); + if (wr < 0) + return (int) wr; + off += rd; + } + + return 0; } /* forward_close. */ static struct kbox_dispatch forward_close( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); + int same_fd_shadow = entry && entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW; + + if (entry && entry->lkl_fd == KBOX_LKL_FD_SHADOW_ONLY && + entry->shadow_sp >= 0) { + kbox_fd_table_remove(ctx->fd_table, fd); + return kbox_dispatch_continue(); + } if (lkl_fd >= 0) { + if (same_fd_shadow) { + if (entry && entry->shadow_writeback) + (void) sync_shadow_writeback(ctx, entry); + note_shadow_writeback_close(ctx, entry); + kbox_lkl_close(ctx->sysnrs, lkl_fd); + kbox_fd_table_remove(ctx->fd_table, fd); + return kbox_dispatch_continue(); + } + long ret = kbox_lkl_close(ctx->sysnrs, lkl_fd); if (ret < 0 && fd >= KBOX_FD_BASE) return kbox_dispatch_errno((int) (-ret)); @@ -549,7 +1745,12 @@ static struct kbox_dispatch forward_close( */ long vfd = kbox_fd_table_find_by_host_fd(ctx->fd_table, fd); if (vfd >= 0) { + struct kbox_fd_entry *shadow_entry = fd_table_entry(ctx->fd_table, vfd); long lkl = kbox_fd_table_get_lkl(ctx->fd_table, vfd); + + if (shadow_entry && shadow_entry->shadow_writeback) + (void) sync_shadow_writeback(ctx, shadow_entry); + note_shadow_writeback_close(ctx, shadow_entry); kbox_fd_table_remove(ctx->fd_table, vfd); if (lkl >= 0) { @@ -580,17 +1781,34 @@ static struct kbox_dispatch 
forward_close( /* forward_read_like (read and pread64). */ static struct kbox_dispatch forward_read_like( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx, int is_pread) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); + { + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); + if (entry && entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW) + return kbox_dispatch_continue(); + } + { + struct kbox_fd_entry *entry; + int shadow_rc = ensure_same_fd_shadow(ctx, req, fd, lkl_fd); + if (shadow_rc > 0) { + entry = fd_table_entry(ctx->fd_table, fd); + if (entry && entry->host_fd == KBOX_FD_LOCAL_ONLY_SHADOW) { + return forward_local_shadow_read_like(req, ctx, entry, lkl_fd, + is_pread); + } + return kbox_dispatch_continue(); + } + } - uint64_t remote_buf = notif->data.args[1]; - int64_t count_raw = to_c_long_arg(notif->data.args[2]); + uint64_t remote_buf = kbox_syscall_request_arg(req, 1); + int64_t count_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); if (count_raw < 0) return kbox_dispatch_errno(EINVAL); size_t count = (size_t) count_raw; @@ -600,15 +1818,13 @@ static struct kbox_dispatch forward_read_like( if (count == 0) return kbox_dispatch_value(0); - pid_t pid = notif->pid; + pid_t pid = kbox_syscall_request_pid(req); size_t max_count = 1024 * 1024; if (count > max_count) count = max_count; size_t total = 0; - uint8_t *scratch = malloc(KBOX_IO_CHUNK_LEN); - if (!scratch) - return kbox_dispatch_errno(ENOMEM); + uint8_t *scratch = dispatch_scratch; while (total < count) { size_t chunk_len = KBOX_IO_CHUNK_LEN; @@ -617,7 +1833,7 @@ static struct kbox_dispatch forward_read_like( long ret; if (is_pread) { - long offset = to_c_long_arg(notif->data.args[3]); + long offset = to_c_long_arg(kbox_syscall_request_arg(req, 3)); ret = 
kbox_lkl_pread64(ctx->sysnrs, lkl_fd, scratch, (long) chunk_len, offset + (long) total); } else { @@ -626,7 +1842,6 @@ static struct kbox_dispatch forward_read_like( if (ret < 0) { if (total == 0) { - free(scratch); return kbox_dispatch_errno((int) (-ret)); } break; @@ -637,9 +1852,15 @@ static struct kbox_dispatch forward_read_like( break; uint64_t remote = remote_buf + total; - int wrc = kbox_vm_write(pid, remote, scratch, n); + if (ctx->verbose) { + fprintf( + stderr, + "kbox: %s fd=%ld lkl_fd=%ld remote=0x%llx chunk=%zu ret=%ld\n", + is_pread ? "pread64" : "read", fd, lkl_fd, + (unsigned long long) remote, chunk_len, ret); + } + int wrc = guest_mem_write(ctx, pid, remote, scratch, n); if (wrc < 0) { - free(scratch); return kbox_dispatch_errno(-wrc); } @@ -648,25 +1869,28 @@ static struct kbox_dispatch forward_read_like( break; } - free(scratch); return kbox_dispatch_value((int64_t) total); } /* forward_write. */ static struct kbox_dispatch forward_write( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); + if (lkl_fd < 0) return kbox_dispatch_continue(); + if (entry && entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW) + return kbox_dispatch_continue(); int mirror_host = kbox_fd_table_mirror_tty(ctx->fd_table, fd); - uint64_t remote_buf = notif->data.args[1]; - int64_t count_raw = to_c_long_arg(notif->data.args[2]); + uint64_t remote_buf = kbox_syscall_request_arg(req, 1); + int64_t count_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); if (count_raw < 0) return kbox_dispatch_errno(EINVAL); size_t count = (size_t) count_raw; @@ -676,15 +1900,13 @@ static struct kbox_dispatch forward_write( if (count == 0) return kbox_dispatch_value(0); - pid_t pid = notif->pid; + 
pid_t pid = kbox_syscall_request_pid(req); size_t max_count = 1024 * 1024; if (count > max_count) count = max_count; size_t total = 0; - uint8_t *scratch = malloc(KBOX_IO_CHUNK_LEN); - if (!scratch) - return kbox_dispatch_errno(ENOMEM); + uint8_t *scratch = dispatch_scratch; while (total < count) { size_t chunk_len = KBOX_IO_CHUNK_LEN; @@ -692,11 +1914,10 @@ static struct kbox_dispatch forward_write( chunk_len = count - total; uint64_t remote = remote_buf + total; - int rrc = kbox_vm_read(pid, remote, scratch, chunk_len); + int rrc = guest_mem_read(ctx, pid, remote, scratch, chunk_len); if (rrc < 0) { if (total > 0) break; - free(scratch); return kbox_dispatch_errno(-rrc); } @@ -704,7 +1925,6 @@ static struct kbox_dispatch forward_write( kbox_lkl_write(ctx->sysnrs, lkl_fd, scratch, (long) chunk_len); if (ret < 0) { if (total == 0) { - free(scratch); return kbox_dispatch_errno((int) (-ret)); } break; @@ -725,7 +1945,8 @@ static struct kbox_dispatch forward_write( break; } - free(scratch); + if (total > 0) + invalidate_path_shadow_cache(ctx); return kbox_dispatch_value((int64_t) total); } @@ -741,13 +1962,13 @@ static struct kbox_dispatch forward_write( * falling back to read+write, so returning ENOSYS is not viable. 
*/ static struct kbox_dispatch forward_sendfile( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long out_fd = to_c_long_arg(notif->data.args[0]); - long in_fd = to_c_long_arg(notif->data.args[1]); - uint64_t offset_ptr = notif->data.args[2]; - int64_t count_raw = to_c_long_arg(notif->data.args[3]); + long out_fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long in_fd = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + uint64_t offset_ptr = kbox_syscall_request_arg(req, 2); + int64_t count_raw = to_c_long_arg(kbox_syscall_request_arg(req, 3)); long in_lkl = kbox_fd_table_get_lkl(ctx->fd_table, in_fd); long out_lkl = kbox_fd_table_get_lkl(ctx->fd_table, out_fd); @@ -786,18 +2007,16 @@ static struct kbox_dispatch forward_sendfile( count = 1024 * 1024; /* Read optional offset from tracee memory. */ - pid_t pid = notif->pid; + pid_t pid = kbox_syscall_request_pid(req); off_t offset = 0; int has_offset = (offset_ptr != 0); if (has_offset) { - int rc = kbox_vm_read(pid, offset_ptr, &offset, sizeof(offset)); + int rc = guest_mem_read(ctx, pid, offset_ptr, &offset, sizeof(offset)); if (rc < 0) return kbox_dispatch_errno(-rc); } - uint8_t *scratch = malloc(KBOX_IO_CHUNK_LEN); - if (!scratch) - return kbox_dispatch_errno(ENOMEM); + uint8_t *scratch = dispatch_scratch; size_t total = 0; @@ -816,7 +2035,6 @@ static struct kbox_dispatch forward_sendfile( if (nr < 0) { if (total == 0) { - free(scratch); return kbox_dispatch_errno((int) (-nr)); } break; @@ -831,7 +2049,6 @@ static struct kbox_dispatch forward_sendfile( long wr = kbox_lkl_write(ctx->sysnrs, out_lkl, scratch, (long) n); if (wr < 0) { if (total == 0) { - free(scratch); return kbox_dispatch_errno((int) (-wr)); } break; @@ -844,7 +2061,6 @@ static struct kbox_dispatch forward_sendfile( ssize_t wr = write((int) out_fd, scratch, n); if (wr < 0) { if (total == 0) { - free(scratch); return kbox_dispatch_errno(errno); } break; @@ -859,47 
+2075,66 @@ static struct kbox_dispatch forward_sendfile( /* Update offset in tracee memory if provided. */ if (has_offset && total > 0) { off_t new_off = offset + (off_t) total; - kbox_vm_write(pid, offset_ptr, &new_off, sizeof(new_off)); + guest_mem_write(ctx, pid, offset_ptr, &new_off, sizeof(new_off)); } - free(scratch); return kbox_dispatch_value((int64_t) total); } /* forward_lseek. */ static struct kbox_dispatch forward_lseek( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); + { + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); + if (entry && entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW) + return kbox_dispatch_continue(); + } + { + struct kbox_fd_entry *entry; + int shadow_rc = ensure_same_fd_shadow(ctx, req, fd, lkl_fd); + if (shadow_rc > 0) { + entry = fd_table_entry(ctx->fd_table, fd); + if (entry && entry->host_fd == KBOX_FD_LOCAL_ONLY_SHADOW) + return forward_local_shadow_lseek(req, ctx, entry, lkl_fd); + return kbox_dispatch_continue(); + } + } - long off = to_c_long_arg(notif->data.args[1]); - long whence = to_c_long_arg(notif->data.args[2]); + long off = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long whence = to_c_long_arg(kbox_syscall_request_arg(req, 2)); long ret = kbox_lkl_lseek(ctx->sysnrs, lkl_fd, off, whence); + if (ctx->verbose) { + fprintf(stderr, + "kbox: lseek fd=%ld lkl_fd=%ld off=%ld whence=%ld ret=%ld\n", + fd, lkl_fd, off, whence, ret); + } return kbox_dispatch_from_lkl(ret); } /* forward_fcntl. 
*/ static struct kbox_dispatch forward_fcntl( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); if (lkl_fd < 0) { /* Shadow socket: handle F_DUPFD* and F_SETFL. */ long svfd = kbox_fd_table_find_by_host_fd(ctx->fd_table, fd); if (svfd >= 0) { - long scmd = to_c_long_arg(notif->data.args[1]); + long scmd = to_c_long_arg(kbox_syscall_request_arg(req, 1)); if (scmd == F_DUPFD || scmd == F_DUPFD_CLOEXEC) { - long minfd = to_c_long_arg(notif->data.args[2]); + long minfd = to_c_long_arg(kbox_syscall_request_arg(req, 2)); /* When minfd > 0, skip ADDFD (can't honor the minimum) * and let CONTINUE handle it correctly. The dup is * untracked but no FD leaks. @@ -913,8 +2148,7 @@ static struct kbox_dispatch forward_fcntl( orig = &ctx->fd_table->low_fds[svfd]; if (orig && orig->shadow_sp >= 0) { uint32_t af = (scmd == F_DUPFD_CLOEXEC) ? O_CLOEXEC : 0; - int nh = kbox_notify_addfd(ctx->listener_fd, notif->id, - orig->shadow_sp, af); + int nh = request_addfd(ctx, req, orig->shadow_sp, af); if (nh >= 0) { long nv = kbox_fd_table_insert(ctx->fd_table, orig->lkl_fd, 0); @@ -941,14 +2175,14 @@ static struct kbox_dispatch forward_fcntl( } } if (scmd == F_SETFL) { - long sarg = to_c_long_arg(notif->data.args[2]); + long sarg = to_c_long_arg(kbox_syscall_request_arg(req, 2)); long slkl = kbox_fd_table_get_lkl(ctx->fd_table, svfd); if (slkl >= 0) kbox_lkl_fcntl(ctx->sysnrs, slkl, F_SETFL, sarg); } if (scmd == F_SETFD) { /* Keep fd-table cloexec in sync with host kernel. */ - long sarg = to_c_long_arg(notif->data.args[2]); + long sarg = to_c_long_arg(kbox_syscall_request_arg(req, 2)); kbox_fd_table_set_cloexec(ctx->fd_table, svfd, (sarg & FD_CLOEXEC) ? 
1 : 0); } @@ -957,8 +2191,8 @@ static struct kbox_dispatch forward_fcntl( return kbox_dispatch_continue(); } - long cmd = to_c_long_arg(notif->data.args[1]); - long arg = to_c_long_arg(notif->data.args[2]); + long cmd = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long arg = to_c_long_arg(kbox_syscall_request_arg(req, 2)); if (cmd == F_DUPFD || cmd == F_DUPFD_CLOEXEC) { long ret = kbox_lkl_fcntl(ctx->sysnrs, lkl_fd, cmd, arg); @@ -991,10 +2225,10 @@ static struct kbox_dispatch forward_fcntl( /* forward_dup. */ -static struct kbox_dispatch forward_dup(const struct kbox_seccomp_notif *notif, +static struct kbox_dispatch forward_dup(const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); if (lkl_fd < 0) { @@ -1015,8 +2249,7 @@ static struct kbox_dispatch forward_dup(const struct kbox_seccomp_notif *notif, return kbox_dispatch_continue(); long orig_lkl = orig->lkl_fd; - int new_host = - kbox_notify_addfd(ctx->listener_fd, notif->id, orig->shadow_sp, 0); + int new_host = request_addfd(ctx, req, orig->shadow_sp, 0); if (new_host < 0) return kbox_dispatch_errno(-new_host); @@ -1061,11 +2294,11 @@ static struct kbox_dispatch forward_dup(const struct kbox_seccomp_notif *notif, /* forward_dup2. 
*/ -static struct kbox_dispatch forward_dup2(const struct kbox_seccomp_notif *notif, +static struct kbox_dispatch forward_dup2(const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long oldfd = to_c_long_arg(notif->data.args[0]); - long newfd = to_c_long_arg(notif->data.args[1]); + long oldfd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long newfd = to_c_long_arg(kbox_syscall_request_arg(req, 1)); long lkl_old = kbox_fd_table_get_lkl(ctx->fd_table, oldfd); if (lkl_old < 0) { @@ -1082,8 +2315,7 @@ static struct kbox_dispatch forward_dup2(const struct kbox_seccomp_notif *notif, orig = &ctx->fd_table->low_fds[orig_vfd]; if (orig && orig->shadow_sp >= 0) { int new_host = - kbox_notify_addfd_at(ctx->listener_fd, notif->id, - orig->shadow_sp, (int) newfd, 0); + request_addfd_at(ctx, req, orig->shadow_sp, (int) newfd, 0); if (new_host >= 0) { /* Remove any stale mapping at newfd (virtual or shadow). */ long stale = kbox_fd_table_get_lkl(ctx->fd_table, newfd); @@ -1190,12 +2422,12 @@ static struct kbox_dispatch forward_dup2(const struct kbox_seccomp_notif *notif, /* forward_dup3. */ -static struct kbox_dispatch forward_dup3(const struct kbox_seccomp_notif *notif, +static struct kbox_dispatch forward_dup3(const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long oldfd = to_c_long_arg(notif->data.args[0]); - long newfd = to_c_long_arg(notif->data.args[1]); - long flags = to_c_long_arg(notif->data.args[2]); + long oldfd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long newfd = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 2)); /* dup3 only accepts O_CLOEXEC; reject anything else per POSIX. */ if (flags & ~((long) O_CLOEXEC)) @@ -1218,9 +2450,8 @@ static struct kbox_dispatch forward_dup3(const struct kbox_seccomp_notif *notif, orig = &ctx->fd_table->low_fds[orig_vfd]; if (orig && orig->shadow_sp >= 0) { uint32_t af = (flags & O_CLOEXEC) ? 
O_CLOEXEC : 0; - int new_host = - kbox_notify_addfd_at(ctx->listener_fd, notif->id, - orig->shadow_sp, (int) newfd, af); + int new_host = request_addfd_at(ctx, req, orig->shadow_sp, + (int) newfd, af); if (new_host >= 0) { /* Remove stale mapping at newfd (virtual or shadow). */ long stale3 = kbox_fd_table_get_lkl(ctx->fd_table, newfd); @@ -1326,16 +2557,31 @@ static struct kbox_dispatch forward_dup3(const struct kbox_seccomp_notif *notif, /* forward_fstat. */ static struct kbox_dispatch forward_fstat( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); + { + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); + if (entry && entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW) + return kbox_dispatch_continue(); + } + { + struct kbox_fd_entry *entry; + int shadow_rc = ensure_same_fd_shadow(ctx, req, fd, lkl_fd); + if (shadow_rc > 0) { + entry = fd_table_entry(ctx->fd_table, fd); + if (entry && entry->host_fd == KBOX_FD_LOCAL_ONLY_SHADOW) + return forward_local_shadow_fstat(req, ctx, entry); + return kbox_dispatch_continue(); + } + } - uint64_t remote_stat = notif->data.args[1]; + uint64_t remote_stat = kbox_syscall_request_arg(req, 1); if (remote_stat == 0) return kbox_dispatch_errno(EFAULT); @@ -1348,8 +2594,9 @@ static struct kbox_dispatch forward_fstat( struct stat host_stat; kbox_lkl_stat_to_host(&kst, &host_stat); - pid_t pid = notif->pid; - int wrc = kbox_vm_write(pid, remote_stat, &host_stat, sizeof(host_stat)); + int wrc = guest_mem_write_small_metadata(ctx, kbox_syscall_request_pid(req), + remote_stat, &host_stat, + sizeof(host_stat)); if (wrc < 0) return kbox_dispatch_errno(-wrc); @@ -1359,23 +2606,29 @@ static struct kbox_dispatch forward_fstat( /* forward_newfstatat. 
*/ static struct kbox_dispatch forward_newfstatat( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; long lkl_dirfd; - int rc = translate_guest_at_path(notif, ctx, 0, 1, translated, - sizeof(translated), &lkl_dirfd); + int rc = translate_request_at_path(req, ctx, 0, 1, translated, + sizeof(translated), &lkl_dirfd); if (rc < 0) return kbox_dispatch_errno(-rc); if (should_continue_for_dirfd(lkl_dirfd)) return kbox_dispatch_continue(); - uint64_t remote_stat = notif->data.args[2]; + uint64_t remote_stat = kbox_syscall_request_arg(req, 2); if (remote_stat == 0) return kbox_dispatch_errno(EFAULT); - long flags = to_c_long_arg(notif->data.args[3]); + if (translated[0] != '\0' && + try_cached_shadow_stat_dispatch(ctx, translated, remote_stat, + kbox_syscall_request_pid(req))) { + return kbox_dispatch_value(0); + } + + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 3)); struct kbox_lkl_stat kst; memset(&kst, 0, sizeof(kst)); @@ -1393,32 +2646,89 @@ static struct kbox_dispatch forward_newfstatat( kbox_lkl_stat_to_host(&kst, &host_stat); normalize_host_stat_if_needed(ctx, translated, &host_stat); - int wrc = - kbox_vm_write(notif->pid, remote_stat, &host_stat, sizeof(host_stat)); + int wrc = guest_mem_write_small_metadata(ctx, kbox_syscall_request_pid(req), + remote_stat, &host_stat, + sizeof(host_stat)); if (wrc < 0) return kbox_dispatch_errno(-wrc); return kbox_dispatch_value(0); } +int kbox_dispatch_try_rewrite_wrapper_fast_path( + struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + struct kbox_dispatch *out) +{ + if (!ctx || !req || !out || !ctx->host_nrs) + return 0; + if (req->source != KBOX_SYSCALL_SOURCE_REWRITE) + return 0; + + if (req->nr == ctx->host_nrs->newfstatat) { + *out = forward_newfstatat(req, ctx); + return 1; + } + if (req->nr == ctx->host_nrs->fstat) { + *out = forward_fstat(req, ctx); + return 1; + } + if (req->nr == 
ctx->host_nrs->openat) { + *out = forward_openat(req, ctx); + return 1; + } + if (ctx->host_nrs->openat2 >= 0 && req->nr == ctx->host_nrs->openat2) { + *out = forward_openat2(req, ctx); + return 1; + } +#if defined(__x86_64__) + if (ctx->host_nrs->open >= 0 && req->nr == ctx->host_nrs->open) { + *out = forward_open_legacy(req, ctx); + return 1; + } +#endif + if (req->nr == ctx->host_nrs->read) { + *out = forward_read_like(req, ctx, 0); + return 1; + } + if (ctx->host_nrs->pread64 >= 0 && req->nr == ctx->host_nrs->pread64) { + *out = forward_read_like(req, ctx, 1); + return 1; + } + if (req->nr == ctx->host_nrs->write) { + *out = forward_write(req, ctx); + return 1; + } + if (req->nr == ctx->host_nrs->lseek) { + *out = forward_lseek(req, ctx); + return 1; + } + if (req->nr == ctx->host_nrs->close) { + *out = forward_close(req, ctx); + return 1; + } + + return 0; +} + /* forward_statx. */ static struct kbox_dispatch forward_statx( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; long lkl_dirfd; - int rc = translate_guest_at_path(notif, ctx, 0, 1, translated, - sizeof(translated), &lkl_dirfd); + int rc = translate_request_at_path(req, ctx, 0, 1, translated, + sizeof(translated), &lkl_dirfd); if (rc < 0) return kbox_dispatch_errno(-rc); if (should_continue_for_dirfd(lkl_dirfd)) return kbox_dispatch_continue(); - int flags = (int) to_c_long_arg(notif->data.args[2]); - unsigned mask = (unsigned) to_c_long_arg(notif->data.args[3]); - uint64_t remote_statx = notif->data.args[4]; + int flags = (int) to_c_long_arg(kbox_syscall_request_arg(req, 2)); + unsigned mask = (unsigned) to_c_long_arg(kbox_syscall_request_arg(req, 3)); + uint64_t remote_statx = kbox_syscall_request_arg(req, 4); if (remote_statx == 0) return kbox_dispatch_errno(EFAULT); @@ -1432,8 +2742,8 @@ static struct kbox_dispatch forward_statx( normalize_statx_if_needed(ctx, translated, statx_buf); - int wrc = - 
kbox_vm_write(notif->pid, remote_statx, statx_buf, sizeof(statx_buf)); + int wrc = guest_mem_write(ctx, kbox_syscall_request_pid(req), remote_statx, + statx_buf, sizeof(statx_buf)); if (wrc < 0) return kbox_dispatch_errno(-wrc); @@ -1442,67 +2752,67 @@ static struct kbox_dispatch forward_statx( /* forward_faccessat / forward_faccessat2. */ -static struct kbox_dispatch do_faccessat(const struct kbox_seccomp_notif *notif, +static struct kbox_dispatch do_faccessat(const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx, long flags) { char translated[KBOX_MAX_PATH]; long lkl_dirfd; - int rc = translate_guest_at_path(notif, ctx, 0, 1, translated, - sizeof(translated), &lkl_dirfd); + int rc = translate_request_at_path(req, ctx, 0, 1, translated, + sizeof(translated), &lkl_dirfd); if (rc < 0) return kbox_dispatch_errno(-rc); if (should_continue_for_dirfd(lkl_dirfd)) return kbox_dispatch_continue(); - long mode = to_c_long_arg(notif->data.args[2]); + long mode = to_c_long_arg(kbox_syscall_request_arg(req, 2)); long ret = kbox_lkl_faccessat2(ctx->sysnrs, lkl_dirfd, translated, mode, flags); return kbox_dispatch_from_lkl(ret); } static struct kbox_dispatch forward_faccessat( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - return do_faccessat(notif, ctx, 0); + return do_faccessat(req, ctx, 0); } static struct kbox_dispatch forward_faccessat2( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - return do_faccessat(notif, ctx, to_c_long_arg(notif->data.args[3])); + return do_faccessat(req, ctx, + to_c_long_arg(kbox_syscall_request_arg(req, 3))); } /* forward_getdents64. 
*/ static struct kbox_dispatch forward_getdents64( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - return forward_getdents_common(notif, ctx, kbox_lkl_getdents64); + return forward_getdents_common(req, ctx, kbox_lkl_getdents64); } /* forward_getdents (legacy). */ static struct kbox_dispatch forward_getdents( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - return forward_getdents_common(notif, ctx, kbox_lkl_getdents); + return forward_getdents_common(req, ctx, kbox_lkl_getdents); } /* forward_chdir. */ static struct kbox_dispatch forward_chdir( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; - int rc = - translate_guest_path(notif->pid, notif->data.args[0], ctx->host_root, - translated, sizeof(translated)); + int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, + sizeof(translated)); if (rc < 0) return kbox_dispatch_errno(-rc); @@ -1510,34 +2820,37 @@ static struct kbox_dispatch forward_chdir( if (ret < 0) return kbox_dispatch_errno((int) (-ret)); + invalidate_translated_path_cache(ctx); return kbox_dispatch_value(0); } /* forward_fchdir. */ static struct kbox_dispatch forward_fchdir( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); long ret = kbox_lkl_fchdir(ctx->sysnrs, lkl_fd); + if (ret >= 0) + invalidate_translated_path_cache(ctx); return kbox_dispatch_from_lkl(ret); } /* forward_getcwd. 
*/ static struct kbox_dispatch forward_getcwd( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; - uint64_t remote_buf = notif->data.args[0]; - int64_t size_raw = to_c_long_arg(notif->data.args[1]); + pid_t pid = kbox_syscall_request_pid(req); + uint64_t remote_buf = kbox_syscall_request_arg(req, 0); + int64_t size_raw = to_c_long_arg(kbox_syscall_request_arg(req, 1)); if (remote_buf == 0) return kbox_dispatch_errno(EFAULT); @@ -1557,7 +2870,7 @@ static struct kbox_dispatch forward_getcwd( if (n == 0 || n > size) return kbox_dispatch_errno(EIO); - int wrc = kbox_vm_write(pid, remote_buf, out, n); + int wrc = guest_mem_write(ctx, pid, remote_buf, out, n); if (wrc < 0) return kbox_dispatch_errno(-wrc); @@ -1567,58 +2880,62 @@ static struct kbox_dispatch forward_getcwd( /* forward_mkdirat. */ static struct kbox_dispatch forward_mkdirat( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; long lkl_dirfd; - int rc = translate_guest_at_path(notif, ctx, 0, 1, translated, - sizeof(translated), &lkl_dirfd); + int rc = translate_request_at_path(req, ctx, 0, 1, translated, + sizeof(translated), &lkl_dirfd); if (rc < 0) return kbox_dispatch_errno(-rc); if (should_continue_for_dirfd(lkl_dirfd)) return kbox_dispatch_continue(); - long mode = to_c_long_arg(notif->data.args[2]); + long mode = to_c_long_arg(kbox_syscall_request_arg(req, 2)); long ret = kbox_lkl_mkdirat(ctx->sysnrs, lkl_dirfd, translated, mode); + if (ret >= 0) + invalidate_path_shadow_cache(ctx); return kbox_dispatch_from_lkl(ret); } /* forward_unlinkat. 
*/ static struct kbox_dispatch forward_unlinkat( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; long lkl_dirfd; - int rc = translate_guest_at_path(notif, ctx, 0, 1, translated, - sizeof(translated), &lkl_dirfd); + int rc = translate_request_at_path(req, ctx, 0, 1, translated, + sizeof(translated), &lkl_dirfd); if (rc < 0) return kbox_dispatch_errno(-rc); if (should_continue_for_dirfd(lkl_dirfd)) return kbox_dispatch_continue(); - long flags = to_c_long_arg(notif->data.args[2]); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 2)); long ret = kbox_lkl_unlinkat(ctx->sysnrs, lkl_dirfd, translated, flags); + if (ret >= 0) + invalidate_path_shadow_cache(ctx); return kbox_dispatch_from_lkl(ret); } /* forward_renameat / forward_renameat2. */ -static struct kbox_dispatch do_renameat(const struct kbox_seccomp_notif *notif, +static struct kbox_dispatch do_renameat(const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx, long flags) { char oldtrans[KBOX_MAX_PATH]; char newtrans[KBOX_MAX_PATH]; long olddirfd, newdirfd; - int rc = translate_guest_at_path(notif, ctx, 0, 1, oldtrans, - sizeof(oldtrans), &olddirfd); + int rc = translate_request_at_path(req, ctx, 0, 1, oldtrans, + sizeof(oldtrans), &olddirfd); if (rc < 0) return kbox_dispatch_errno(-rc); - rc = translate_guest_at_path(notif, ctx, 2, 3, newtrans, sizeof(newtrans), - &newdirfd); + rc = translate_request_at_path(req, ctx, 2, 3, newtrans, sizeof(newtrans), + &newdirfd); if (rc < 0) return kbox_dispatch_errno(-rc); if (should_continue_for_dirfd(olddirfd)) @@ -1628,40 +2945,43 @@ static struct kbox_dispatch do_renameat(const struct kbox_seccomp_notif *notif, long ret = kbox_lkl_renameat2(ctx->sysnrs, olddirfd, oldtrans, newdirfd, newtrans, flags); + if (ret >= 0) + invalidate_path_shadow_cache(ctx); return kbox_dispatch_from_lkl(ret); } static struct kbox_dispatch forward_renameat( - const 
struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - return do_renameat(notif, ctx, 0); + return do_renameat(req, ctx, 0); } static struct kbox_dispatch forward_renameat2( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - return do_renameat(notif, ctx, to_c_long_arg(notif->data.args[4])); + return do_renameat(req, ctx, + to_c_long_arg(kbox_syscall_request_arg(req, 4))); } /* forward_fchmodat. */ static struct kbox_dispatch forward_fchmodat( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; long lkl_dirfd; - int rc = translate_guest_at_path(notif, ctx, 0, 1, translated, - sizeof(translated), &lkl_dirfd); + int rc = translate_request_at_path(req, ctx, 0, 1, translated, + sizeof(translated), &lkl_dirfd); if (rc < 0) return kbox_dispatch_errno(-rc); if (should_continue_for_dirfd(lkl_dirfd)) return kbox_dispatch_continue(); - long mode = to_c_long_arg(notif->data.args[2]); - long flags = to_c_long_arg(notif->data.args[3]); + long mode = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 3)); long ret = kbox_lkl_fchmodat(ctx->sysnrs, lkl_dirfd, translated, mode, flags); return kbox_dispatch_from_lkl(ret); @@ -1670,21 +2990,21 @@ static struct kbox_dispatch forward_fchmodat( /* forward_fchownat. 
*/ static struct kbox_dispatch forward_fchownat( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; long lkl_dirfd; - int rc = translate_guest_at_path(notif, ctx, 0, 1, translated, - sizeof(translated), &lkl_dirfd); + int rc = translate_request_at_path(req, ctx, 0, 1, translated, + sizeof(translated), &lkl_dirfd); if (rc < 0) return kbox_dispatch_errno(-rc); if (should_continue_for_dirfd(lkl_dirfd)) return kbox_dispatch_continue(); - long owner = to_c_long_arg(notif->data.args[2]); - long group = to_c_long_arg(notif->data.args[3]); - long flags = to_c_long_arg(notif->data.args[4]); + long owner = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + long group = to_c_long_arg(kbox_syscall_request_arg(req, 3)); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 4)); long ret = kbox_lkl_fchownat(ctx->sysnrs, lkl_dirfd, translated, owner, group, flags); return kbox_dispatch_from_lkl(ret); @@ -1693,10 +3013,10 @@ static struct kbox_dispatch forward_fchownat( /* forward_mount. 
*/ static struct kbox_dispatch forward_mount( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; + pid_t pid = kbox_syscall_request_pid(req); char srcbuf[KBOX_MAX_PATH]; char tgtbuf[KBOX_MAX_PATH]; char fsbuf[KBOX_MAX_PATH]; @@ -1704,33 +3024,34 @@ static struct kbox_dispatch forward_mount( int rc; const char *source = NULL; - if (notif->data.args[0] != 0) { - rc = kbox_vm_read_string(pid, notif->data.args[0], srcbuf, - sizeof(srcbuf)); + if (kbox_syscall_request_arg(req, 0) != 0) { + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 0), + srcbuf, sizeof(srcbuf)); if (rc < 0) return kbox_dispatch_errno(-rc); source = srcbuf; } - rc = kbox_vm_read_string(pid, notif->data.args[1], tgtbuf, sizeof(tgtbuf)); + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 1), + tgtbuf, sizeof(tgtbuf)); if (rc < 0) return kbox_dispatch_errno(-rc); const char *fstype = NULL; - if (notif->data.args[2] != 0) { - rc = - kbox_vm_read_string(pid, notif->data.args[2], fsbuf, sizeof(fsbuf)); + if (kbox_syscall_request_arg(req, 2) != 0) { + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 2), + fsbuf, sizeof(fsbuf)); if (rc < 0) return kbox_dispatch_errno(-rc); fstype = fsbuf; } - long flags = to_c_long_arg(notif->data.args[3]); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 3)); const void *data = NULL; - if (notif->data.args[4] != 0) { - rc = kbox_vm_read_string(pid, notif->data.args[4], databuf, - sizeof(databuf)); + if (kbox_syscall_request_arg(req, 4) != 0) { + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 4), + databuf, sizeof(databuf)); if (rc < 0) return kbox_dispatch_errno(-rc); data = databuf; @@ -1743,19 +3064,19 @@ static struct kbox_dispatch forward_mount( /* forward_umount2. 
*/ static struct kbox_dispatch forward_umount2( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; + pid_t pid = kbox_syscall_request_pid(req); char pathbuf[KBOX_MAX_PATH]; int rc; - rc = - kbox_vm_read_string(pid, notif->data.args[0], pathbuf, sizeof(pathbuf)); + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 0), + pathbuf, sizeof(pathbuf)); if (rc < 0) return kbox_dispatch_errno(-rc); - long flags = to_c_long_arg(notif->data.args[1]); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 1)); long ret = kbox_lkl_umount2(ctx->sysnrs, pathbuf, flags); return kbox_dispatch_from_lkl(ret); } @@ -1763,21 +3084,26 @@ static struct kbox_dispatch forward_umount2( /* Legacy x86_64 syscall forwarders (stat, lstat, access, etc.). */ static struct kbox_dispatch forward_stat_legacy( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx, int nofollow) { char translated[KBOX_MAX_PATH]; - int rc = - translate_guest_path(notif->pid, notif->data.args[0], ctx->host_root, - translated, sizeof(translated)); + int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, + sizeof(translated)); if (rc < 0) return kbox_dispatch_errno(-rc); - uint64_t remote_stat = notif->data.args[1]; + uint64_t remote_stat = kbox_syscall_request_arg(req, 1); if (remote_stat == 0) return kbox_dispatch_errno(EFAULT); + if (translated[0] != '\0' && + try_cached_shadow_stat_dispatch(ctx, translated, remote_stat, + kbox_syscall_request_pid(req))) { + return kbox_dispatch_value(0); + } + long flags = nofollow ? 
AT_SYMLINK_NOFOLLOW : 0; struct kbox_lkl_stat kst; @@ -1791,8 +3117,9 @@ static struct kbox_dispatch forward_stat_legacy( kbox_lkl_stat_to_host(&kst, &host_stat); normalize_host_stat_if_needed(ctx, translated, &host_stat); - int wrc = - kbox_vm_write(notif->pid, remote_stat, &host_stat, sizeof(host_stat)); + int wrc = guest_mem_write_small_metadata(ctx, kbox_syscall_request_pid(req), + remote_stat, &host_stat, + sizeof(host_stat)); if (wrc < 0) return kbox_dispatch_errno(-wrc); @@ -1800,46 +3127,43 @@ static struct kbox_dispatch forward_stat_legacy( } static struct kbox_dispatch forward_access_legacy( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; - int rc = - translate_guest_path(notif->pid, notif->data.args[0], ctx->host_root, - translated, sizeof(translated)); + int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, + sizeof(translated)); if (rc < 0) return kbox_dispatch_errno(-rc); - long mode = to_c_long_arg(notif->data.args[1]); + long mode = to_c_long_arg(kbox_syscall_request_arg(req, 1)); long ret = kbox_lkl_faccessat2(ctx->sysnrs, AT_FDCWD_LINUX, translated, mode, 0); return kbox_dispatch_from_lkl(ret); } static struct kbox_dispatch forward_mkdir_legacy( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; - int rc = - translate_guest_path(notif->pid, notif->data.args[0], ctx->host_root, - translated, sizeof(translated)); + int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, + sizeof(translated)); if (rc < 0) return kbox_dispatch_errno(-rc); - long mode = to_c_long_arg(notif->data.args[1]); + long mode = to_c_long_arg(kbox_syscall_request_arg(req, 1)); long ret = kbox_lkl_mkdir(ctx->sysnrs, translated, (int) mode); return kbox_dispatch_from_lkl(ret); } static struct kbox_dispatch forward_unlink_legacy( - const struct 
kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; - int rc = - translate_guest_path(notif->pid, notif->data.args[0], ctx->host_root, - translated, sizeof(translated)); + int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, + sizeof(translated)); if (rc < 0) return kbox_dispatch_errno(-rc); @@ -1848,13 +3172,12 @@ static struct kbox_dispatch forward_unlink_legacy( } static struct kbox_dispatch forward_rmdir_legacy( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; - int rc = - translate_guest_path(notif->pid, notif->data.args[0], ctx->host_root, - translated, sizeof(translated)); + int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, + sizeof(translated)); if (rc < 0) return kbox_dispatch_errno(-rc); @@ -1864,17 +3187,17 @@ static struct kbox_dispatch forward_rmdir_legacy( } static struct kbox_dispatch forward_rename_legacy( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char oldtrans[KBOX_MAX_PATH]; char newtrans[KBOX_MAX_PATH]; - int rc = translate_guest_path(notif->pid, notif->data.args[0], - ctx->host_root, oldtrans, sizeof(oldtrans)); + int rc = translate_request_path(req, ctx, 0, ctx->host_root, oldtrans, + sizeof(oldtrans)); if (rc < 0) return kbox_dispatch_errno(-rc); - rc = translate_guest_path(notif->pid, notif->data.args[1], ctx->host_root, - newtrans, sizeof(newtrans)); + rc = translate_request_path(req, ctx, 1, ctx->host_root, newtrans, + sizeof(newtrans)); if (rc < 0) return kbox_dispatch_errno(-rc); @@ -1884,35 +3207,33 @@ static struct kbox_dispatch forward_rename_legacy( } static struct kbox_dispatch forward_chmod_legacy( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char 
translated[KBOX_MAX_PATH]; - int rc = - translate_guest_path(notif->pid, notif->data.args[0], ctx->host_root, - translated, sizeof(translated)); + int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, + sizeof(translated)); if (rc < 0) return kbox_dispatch_errno(-rc); - long mode = to_c_long_arg(notif->data.args[1]); + long mode = to_c_long_arg(kbox_syscall_request_arg(req, 1)); long ret = kbox_lkl_fchmodat(ctx->sysnrs, AT_FDCWD_LINUX, translated, mode, 0); return kbox_dispatch_from_lkl(ret); } static struct kbox_dispatch forward_chown_legacy( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { char translated[KBOX_MAX_PATH]; - int rc = - translate_guest_path(notif->pid, notif->data.args[0], ctx->host_root, - translated, sizeof(translated)); + int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, + sizeof(translated)); if (rc < 0) return kbox_dispatch_errno(-rc); - long owner = to_c_long_arg(notif->data.args[1]); - long group = to_c_long_arg(notif->data.args[2]); + long owner = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long group = to_c_long_arg(kbox_syscall_request_arg(req, 2)); long ret = kbox_lkl_fchownat(ctx->sysnrs, AT_FDCWD_LINUX, translated, owner, group, 0); return kbox_dispatch_from_lkl(ret); @@ -1921,20 +3242,20 @@ static struct kbox_dispatch forward_chown_legacy( /* Identity forwarders: getuid, geteuid, getresuid, etc. 
*/ static struct kbox_dispatch forward_getresuid( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; - uint64_t ruid_ptr = notif->data.args[0]; - uint64_t euid_ptr = notif->data.args[1]; - uint64_t suid_ptr = notif->data.args[2]; + pid_t pid = kbox_syscall_request_pid(req); + uint64_t ruid_ptr = kbox_syscall_request_arg(req, 0); + uint64_t euid_ptr = kbox_syscall_request_arg(req, 1); + uint64_t suid_ptr = kbox_syscall_request_arg(req, 2); if (ruid_ptr != 0) { long r = kbox_lkl_getuid(ctx->sysnrs); if (r < 0) return kbox_dispatch_errno((int) (-r)); unsigned val = (unsigned) r; - int wrc = kbox_vm_write(pid, ruid_ptr, &val, sizeof(val)); + int wrc = guest_mem_write(ctx, pid, ruid_ptr, &val, sizeof(val)); if (wrc < 0) return kbox_dispatch_errno(-wrc); } @@ -1943,7 +3264,7 @@ static struct kbox_dispatch forward_getresuid( if (r < 0) return kbox_dispatch_errno((int) (-r)); unsigned val = (unsigned) r; - int wrc = kbox_vm_write(pid, euid_ptr, &val, sizeof(val)); + int wrc = guest_mem_write(ctx, pid, euid_ptr, &val, sizeof(val)); if (wrc < 0) return kbox_dispatch_errno(-wrc); } @@ -1953,7 +3274,7 @@ static struct kbox_dispatch forward_getresuid( if (r < 0) return kbox_dispatch_errno((int) (-r)); unsigned val = (unsigned) r; - int wrc = kbox_vm_write(pid, suid_ptr, &val, sizeof(val)); + int wrc = guest_mem_write(ctx, pid, suid_ptr, &val, sizeof(val)); if (wrc < 0) return kbox_dispatch_errno(-wrc); } @@ -1961,17 +3282,18 @@ static struct kbox_dispatch forward_getresuid( } static struct kbox_dispatch forward_getresuid_override( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, uid_t uid) { - pid_t pid = notif->pid; + pid_t pid = kbox_syscall_request_pid(req); unsigned val = (unsigned) uid; int i; for (i = 0; i < 3; i++) { - uint64_t ptr = notif->data.args[i]; + uint64_t ptr = kbox_syscall_request_arg(req, i); if 
(ptr != 0) { - int wrc = kbox_vm_write(pid, ptr, &val, sizeof(val)); + int wrc = guest_mem_write(ctx, pid, ptr, &val, sizeof(val)); if (wrc < 0) return kbox_dispatch_errno(EIO); } @@ -1980,20 +3302,20 @@ static struct kbox_dispatch forward_getresuid_override( } static struct kbox_dispatch forward_getresgid( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; - uint64_t rgid_ptr = notif->data.args[0]; - uint64_t egid_ptr = notif->data.args[1]; - uint64_t sgid_ptr = notif->data.args[2]; + pid_t pid = kbox_syscall_request_pid(req); + uint64_t rgid_ptr = kbox_syscall_request_arg(req, 0); + uint64_t egid_ptr = kbox_syscall_request_arg(req, 1); + uint64_t sgid_ptr = kbox_syscall_request_arg(req, 2); if (rgid_ptr != 0) { long r = kbox_lkl_getgid(ctx->sysnrs); if (r < 0) return kbox_dispatch_errno((int) (-r)); unsigned val = (unsigned) r; - int wrc = kbox_vm_write(pid, rgid_ptr, &val, sizeof(val)); + int wrc = guest_mem_write(ctx, pid, rgid_ptr, &val, sizeof(val)); if (wrc < 0) return kbox_dispatch_errno(-wrc); } @@ -2002,7 +3324,7 @@ static struct kbox_dispatch forward_getresgid( if (r < 0) return kbox_dispatch_errno((int) (-r)); unsigned val = (unsigned) r; - int wrc = kbox_vm_write(pid, egid_ptr, &val, sizeof(val)); + int wrc = guest_mem_write(ctx, pid, egid_ptr, &val, sizeof(val)); if (wrc < 0) return kbox_dispatch_errno(-wrc); } @@ -2011,7 +3333,7 @@ static struct kbox_dispatch forward_getresgid( if (r < 0) return kbox_dispatch_errno((int) (-r)); unsigned val = (unsigned) r; - int wrc = kbox_vm_write(pid, sgid_ptr, &val, sizeof(val)); + int wrc = guest_mem_write(ctx, pid, sgid_ptr, &val, sizeof(val)); if (wrc < 0) return kbox_dispatch_errno(-wrc); } @@ -2019,17 +3341,18 @@ static struct kbox_dispatch forward_getresgid( } static struct kbox_dispatch forward_getresgid_override( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, + struct 
kbox_supervisor_ctx *ctx, gid_t gid) { - pid_t pid = notif->pid; + pid_t pid = kbox_syscall_request_pid(req); unsigned val = (unsigned) gid; int i; for (i = 0; i < 3; i++) { - uint64_t ptr = notif->data.args[i]; + uint64_t ptr = kbox_syscall_request_arg(req, i); if (ptr != 0) { - int wrc = kbox_vm_write(pid, ptr, &val, sizeof(val)); + int wrc = guest_mem_write(ctx, pid, ptr, &val, sizeof(val)); if (wrc < 0) return kbox_dispatch_errno(EIO); } @@ -2038,11 +3361,11 @@ static struct kbox_dispatch forward_getresgid_override( } static struct kbox_dispatch forward_getgroups( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long size = to_c_long_arg(notif->data.args[0]); - uint64_t list = notif->data.args[1]; + long size = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + uint64_t list = kbox_syscall_request_arg(req, 1); if (size < 0) return kbox_dispatch_errno(EINVAL); @@ -2060,47 +3383,43 @@ static struct kbox_dispatch forward_getgroups( return kbox_dispatch_errno(EINVAL); size_t byte_len = (size_t) count * sizeof(unsigned); - unsigned *buf = malloc(byte_len > 0 ? 
byte_len : 1); - if (!buf) + if (byte_len > KBOX_IO_CHUNK_LEN) return kbox_dispatch_errno(ENOMEM); + unsigned *buf = (unsigned *) dispatch_scratch; long ret = kbox_lkl_getgroups(ctx->sysnrs, count, buf); - if (ret < 0) { - free(buf); + if (ret < 0) return kbox_dispatch_errno((int) (-ret)); - } if (list != 0 && ret > 0) { size_t write_len = (size_t) ret * sizeof(unsigned); - pid_t pid = notif->pid; - int wrc = kbox_vm_write(pid, list, buf, write_len); - if (wrc < 0) { - free(buf); + pid_t pid = kbox_syscall_request_pid(req); + int wrc = guest_mem_write(ctx, pid, list, buf, write_len); + if (wrc < 0) return kbox_dispatch_errno(-wrc); - } } - free(buf); return kbox_dispatch_value((int64_t) ret); } static struct kbox_dispatch forward_getgroups_override( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, gid_t gid) { - long size = to_c_long_arg(notif->data.args[0]); + long size = to_c_long_arg(kbox_syscall_request_arg(req, 0)); if (size < 0) return kbox_dispatch_errno(EINVAL); if (size == 0) return kbox_dispatch_value(1); - uint64_t list = notif->data.args[1]; + uint64_t list = kbox_syscall_request_arg(req, 1); if (list == 0) return kbox_dispatch_errno(EFAULT); - pid_t pid = notif->pid; + pid_t pid = kbox_syscall_request_pid(req); unsigned val = (unsigned) gid; - int wrc = kbox_vm_write(pid, list, &val, sizeof(val)); + int wrc = guest_mem_write(ctx, pid, list, &val, sizeof(val)); if (wrc < 0) return kbox_dispatch_errno(EIO); @@ -2110,67 +3429,67 @@ static struct kbox_dispatch forward_getgroups_override( /* Identity set forwarders. 
*/ static struct kbox_dispatch forward_setuid( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long uid = to_c_long_arg(notif->data.args[0]); + long uid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); return kbox_dispatch_from_lkl(kbox_lkl_setuid(ctx->sysnrs, uid)); } static struct kbox_dispatch forward_setreuid( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long ruid = to_c_long_arg(notif->data.args[0]); - long euid = to_c_long_arg(notif->data.args[1]); + long ruid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long euid = to_c_long_arg(kbox_syscall_request_arg(req, 1)); return kbox_dispatch_from_lkl(kbox_lkl_setreuid(ctx->sysnrs, ruid, euid)); } static struct kbox_dispatch forward_setresuid( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long ruid = to_c_long_arg(notif->data.args[0]); - long euid = to_c_long_arg(notif->data.args[1]); - long suid = to_c_long_arg(notif->data.args[2]); + long ruid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long euid = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long suid = to_c_long_arg(kbox_syscall_request_arg(req, 2)); return kbox_dispatch_from_lkl( kbox_lkl_setresuid(ctx->sysnrs, ruid, euid, suid)); } static struct kbox_dispatch forward_setgid( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long gid = to_c_long_arg(notif->data.args[0]); + long gid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); return kbox_dispatch_from_lkl(kbox_lkl_setgid(ctx->sysnrs, gid)); } static struct kbox_dispatch forward_setregid( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long rgid = to_c_long_arg(notif->data.args[0]); - long egid = 
to_c_long_arg(notif->data.args[1]); + long rgid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long egid = to_c_long_arg(kbox_syscall_request_arg(req, 1)); return kbox_dispatch_from_lkl(kbox_lkl_setregid(ctx->sysnrs, rgid, egid)); } static struct kbox_dispatch forward_setresgid( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long rgid = to_c_long_arg(notif->data.args[0]); - long egid = to_c_long_arg(notif->data.args[1]); - long sgid = to_c_long_arg(notif->data.args[2]); + long rgid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long egid = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long sgid = to_c_long_arg(kbox_syscall_request_arg(req, 2)); return kbox_dispatch_from_lkl( kbox_lkl_setresgid(ctx->sysnrs, rgid, egid, sgid)); } static struct kbox_dispatch forward_setgroups( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long size = to_c_long_arg(notif->data.args[0]); - uint64_t list = notif->data.args[1]; + long size = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + uint64_t list = kbox_syscall_request_arg(req, 1); if (size < 0 || size > 65536) return kbox_dispatch_errno(EINVAL); @@ -2179,27 +3498,24 @@ static struct kbox_dispatch forward_setgroups( return kbox_dispatch_from_lkl(kbox_lkl_setgroups(ctx->sysnrs, 0, NULL)); size_t byte_len = (size_t) size * sizeof(unsigned); - unsigned *buf = malloc(byte_len); - if (!buf) + if (byte_len > KBOX_IO_CHUNK_LEN) return kbox_dispatch_errno(ENOMEM); + unsigned *buf = (unsigned *) dispatch_scratch; - pid_t pid = notif->pid; - int rrc = kbox_vm_read(pid, list, buf, byte_len); - if (rrc < 0) { - free(buf); + pid_t pid = kbox_syscall_request_pid(req); + int rrc = guest_mem_read(ctx, pid, list, buf, byte_len); + if (rrc < 0) return kbox_dispatch_errno(-rrc); - } long ret = kbox_lkl_setgroups(ctx->sysnrs, size, buf); - free(buf); return kbox_dispatch_from_lkl(ret); 
} static struct kbox_dispatch forward_setfsgid( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long gid = to_c_long_arg(notif->data.args[0]); + long gid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); return kbox_dispatch_from_lkl(kbox_lkl_setfsgid(ctx->sysnrs, gid)); } @@ -2226,12 +3542,12 @@ static struct kbox_dispatch forward_setfsgid( * without --net or via a future deferred-bridge approach. */ static struct kbox_dispatch forward_socket( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long domain = to_c_long_arg(notif->data.args[0]); - long type_raw = to_c_long_arg(notif->data.args[1]); - long protocol = to_c_long_arg(notif->data.args[2]); + long domain = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long type_raw = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long protocol = to_c_long_arg(kbox_syscall_request_arg(req, 2)); int base_type = (int) type_raw & 0xFF; @@ -2286,8 +3602,7 @@ static struct kbox_dispatch forward_socket( uint32_t addfd_flags = 0; if (type_raw & SOCK_CLOEXEC) addfd_flags = O_CLOEXEC; - int host_fd = - kbox_notify_addfd(ctx->listener_fd, notif->id, sp[1], addfd_flags); + int host_fd = request_addfd(ctx, req, sp[1], addfd_flags); if (host_fd < 0) { /* Deregister closes sp[0] and marks inactive. 
*/ kbox_net_deregister_socket((int) lkl_fd); @@ -2318,18 +3633,18 @@ static struct kbox_dispatch forward_socket( static long resolve_lkl_socket(struct kbox_supervisor_ctx *ctx, long fd); -static struct kbox_dispatch forward_bind(const struct kbox_seccomp_notif *notif, +static struct kbox_dispatch forward_bind(const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = resolve_lkl_socket(ctx, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); - pid_t pid = notif->pid; - uint64_t addr_ptr = notif->data.args[1]; - int64_t len_raw = to_c_long_arg(notif->data.args[2]); + pid_t pid = kbox_syscall_request_pid(req); + uint64_t addr_ptr = kbox_syscall_request_arg(req, 1); + int64_t len_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); if (len_raw < 0) return kbox_dispatch_errno(EINVAL); size_t len = (size_t) len_raw; @@ -2341,7 +3656,7 @@ static struct kbox_dispatch forward_bind(const struct kbox_seccomp_notif *notif, return kbox_dispatch_errno(EINVAL); uint8_t buf[4096]; - int rrc = kbox_vm_read(pid, addr_ptr, buf, len); + int rrc = guest_mem_read(ctx, pid, addr_ptr, buf, len); if (rrc < 0) return kbox_dispatch_errno(-rrc); @@ -2370,18 +3685,18 @@ static long resolve_lkl_socket(struct kbox_supervisor_ctx *ctx, long fd) } static struct kbox_dispatch forward_connect( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = resolve_lkl_socket(ctx, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); - pid_t pid = notif->pid; - uint64_t addr_ptr = notif->data.args[1]; - int64_t len_raw = to_c_long_arg(notif->data.args[2]); + pid_t pid = kbox_syscall_request_pid(req); + uint64_t addr_ptr = kbox_syscall_request_arg(req, 1); + int64_t len_raw = 
to_c_long_arg(kbox_syscall_request_arg(req, 2)); if (len_raw < 0) return kbox_dispatch_errno(EINVAL); size_t len = (size_t) len_raw; @@ -2393,7 +3708,7 @@ static struct kbox_dispatch forward_connect( return kbox_dispatch_errno(EINVAL); uint8_t buf[4096]; - int rrc = kbox_vm_read(pid, addr_ptr, buf, len); + int rrc = guest_mem_read(ctx, pid, addr_ptr, buf, len); if (rrc < 0) return kbox_dispatch_errno(-rrc); @@ -2411,26 +3726,26 @@ static struct kbox_dispatch forward_connect( /* forward_getsockopt. */ static struct kbox_dispatch forward_getsockopt( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = resolve_lkl_socket(ctx, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); - pid_t pid = notif->pid; - long level = to_c_long_arg(notif->data.args[1]); - long optname = to_c_long_arg(notif->data.args[2]); - uint64_t optval_ptr = notif->data.args[3]; - uint64_t optlen_ptr = notif->data.args[4]; + pid_t pid = kbox_syscall_request_pid(req); + long level = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long optname = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + uint64_t optval_ptr = kbox_syscall_request_arg(req, 3); + uint64_t optlen_ptr = kbox_syscall_request_arg(req, 4); if (optval_ptr == 0 || optlen_ptr == 0) return kbox_dispatch_errno(EFAULT); /* Read the optlen from tracee. */ unsigned int optlen; - int rrc = kbox_vm_read(pid, optlen_ptr, &optlen, sizeof(optlen)); + int rrc = guest_mem_read(ctx, pid, optlen_ptr, &optlen, sizeof(optlen)); if (rrc < 0) return kbox_dispatch_errno(-rrc); @@ -2447,10 +3762,10 @@ static struct kbox_dispatch forward_getsockopt( /* Write min(out_len, optlen) to avoid leaking stack data. */ unsigned int write_len = out_len < optlen ? 
out_len : optlen; - int wrc = kbox_vm_write(pid, optval_ptr, optval, write_len); + int wrc = guest_mem_write(ctx, pid, optval_ptr, optval, write_len); if (wrc < 0) return kbox_dispatch_errno(-wrc); - wrc = kbox_vm_write(pid, optlen_ptr, &out_len, sizeof(out_len)); + wrc = guest_mem_write(ctx, pid, optlen_ptr, &out_len, sizeof(out_len)); if (wrc < 0) return kbox_dispatch_errno(-wrc); @@ -2460,26 +3775,26 @@ static struct kbox_dispatch forward_getsockopt( /* forward_setsockopt. */ static struct kbox_dispatch forward_setsockopt( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = resolve_lkl_socket(ctx, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); - pid_t pid = notif->pid; - long level = to_c_long_arg(notif->data.args[1]); - long optname = to_c_long_arg(notif->data.args[2]); - uint64_t optval_ptr = notif->data.args[3]; - long optlen = to_c_long_arg(notif->data.args[4]); + pid_t pid = kbox_syscall_request_pid(req); + long level = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long optname = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + uint64_t optval_ptr = kbox_syscall_request_arg(req, 3); + long optlen = to_c_long_arg(kbox_syscall_request_arg(req, 4)); if (optlen < 0 || optlen > 4096) return kbox_dispatch_errno(EINVAL); uint8_t optval[4096] = {0}; if (optval_ptr != 0 && optlen > 0) { - int rrc = kbox_vm_read(pid, optval_ptr, optval, (size_t) optlen); + int rrc = guest_mem_read(ctx, pid, optval_ptr, optval, (size_t) optlen); if (rrc < 0) return kbox_dispatch_errno(-rrc); } @@ -2497,24 +3812,24 @@ typedef long (*sockaddr_query_fn)(const struct kbox_sysnrs *s, void *addrlen); static struct kbox_dispatch forward_sockaddr_query( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx, sockaddr_query_fn query) 
{ - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = resolve_lkl_socket(ctx, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); - pid_t pid = notif->pid; - uint64_t addr_ptr = notif->data.args[1]; - uint64_t len_ptr = notif->data.args[2]; + pid_t pid = kbox_syscall_request_pid(req); + uint64_t addr_ptr = kbox_syscall_request_arg(req, 1); + uint64_t len_ptr = kbox_syscall_request_arg(req, 2); if (addr_ptr == 0 || len_ptr == 0) return kbox_dispatch_errno(EFAULT); unsigned int addrlen; - int rrc = kbox_vm_read(pid, len_ptr, &addrlen, sizeof(addrlen)); + int rrc = guest_mem_read(ctx, pid, len_ptr, &addrlen, sizeof(addrlen)); if (rrc < 0) return kbox_dispatch_errno(-rrc); @@ -2529,10 +3844,10 @@ static struct kbox_dispatch forward_sockaddr_query( return kbox_dispatch_from_lkl(ret); unsigned int write_len = out_len < addrlen ? out_len : addrlen; - int wrc = kbox_vm_write(pid, addr_ptr, addr, write_len); + int wrc = guest_mem_write(ctx, pid, addr_ptr, addr, write_len); if (wrc < 0) return kbox_dispatch_errno(-wrc); - wrc = kbox_vm_write(pid, len_ptr, &out_len, sizeof(out_len)); + wrc = guest_mem_write(ctx, pid, len_ptr, &out_len, sizeof(out_len)); if (wrc < 0) return kbox_dispatch_errno(-wrc); @@ -2540,31 +3855,31 @@ static struct kbox_dispatch forward_sockaddr_query( } static struct kbox_dispatch forward_getsockname( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - return forward_sockaddr_query(notif, ctx, kbox_lkl_getsockname); + return forward_sockaddr_query(req, ctx, kbox_lkl_getsockname); } static struct kbox_dispatch forward_getpeername( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - return forward_sockaddr_query(notif, ctx, kbox_lkl_getpeername); + return forward_sockaddr_query(req, ctx, kbox_lkl_getpeername); } /* forward_shutdown. 
*/ static struct kbox_dispatch forward_shutdown( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = resolve_lkl_socket(ctx, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); - long how = to_c_long_arg(notif->data.args[1]); + long how = to_c_long_arg(kbox_syscall_request_arg(req, 1)); long ret = kbox_lkl_shutdown(ctx->sysnrs, lkl_fd, how); return kbox_dispatch_from_lkl(ret); } @@ -2580,24 +3895,24 @@ static struct kbox_dispatch forward_shutdown( * args[4]=dest_addr, args[5]=addrlen */ static struct kbox_dispatch forward_sendto( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = resolve_lkl_socket(ctx, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); - uint64_t dest_ptr = notif->data.args[4]; + uint64_t dest_ptr = kbox_syscall_request_arg(req, 4); if (dest_ptr == 0) return kbox_dispatch_continue(); /* no dest addr: stream data path */ /* Has a destination address: forward via LKL sendto. 
*/ - pid_t pid = notif->pid; - uint64_t buf_ptr = notif->data.args[1]; - int64_t len_raw = to_c_long_arg(notif->data.args[2]); - long flags = to_c_long_arg(notif->data.args[3]); - int64_t addrlen_raw = to_c_long_arg(notif->data.args[5]); + pid_t pid = kbox_syscall_request_pid(req); + uint64_t buf_ptr = kbox_syscall_request_arg(req, 1); + int64_t len_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 3)); + int64_t addrlen_raw = to_c_long_arg(kbox_syscall_request_arg(req, 5)); if (len_raw < 0 || addrlen_raw < 0) return kbox_dispatch_errno(EINVAL); @@ -2612,10 +3927,10 @@ static struct kbox_dispatch forward_sendto( uint8_t buf[65536]; uint8_t addr[128]; - int rrc = kbox_vm_read(pid, buf_ptr, buf, len); + int rrc = guest_mem_read(ctx, pid, buf_ptr, buf, len); if (rrc < 0) return kbox_dispatch_errno(-rrc); - rrc = kbox_vm_read(pid, dest_ptr, addr, addrlen); + rrc = guest_mem_read(ctx, pid, dest_ptr, addr, addrlen); if (rrc < 0) return kbox_dispatch_errno(-rrc); @@ -2632,23 +3947,23 @@ static struct kbox_dispatch forward_sendto( * args[4]=src_addr, args[5]=addrlen */ static struct kbox_dispatch forward_recvfrom( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = resolve_lkl_socket(ctx, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); - uint64_t src_ptr = notif->data.args[4]; + uint64_t src_ptr = kbox_syscall_request_arg(req, 4); if (src_ptr == 0) return kbox_dispatch_continue(); /* no addr buffer: stream path */ - pid_t pid = notif->pid; - uint64_t buf_ptr = notif->data.args[1]; - int64_t len_raw = to_c_long_arg(notif->data.args[2]); - long flags = to_c_long_arg(notif->data.args[3]); - uint64_t addrlen_ptr = notif->data.args[5]; + pid_t pid = kbox_syscall_request_pid(req); + uint64_t buf_ptr = 
kbox_syscall_request_arg(req, 1); + int64_t len_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 3)); + uint64_t addrlen_ptr = kbox_syscall_request_arg(req, 5); if (len_raw < 0) return kbox_dispatch_errno(EINVAL); @@ -2658,7 +3973,8 @@ static struct kbox_dispatch forward_recvfrom( unsigned int addrlen = 0; if (addrlen_ptr != 0) { - int rrc = kbox_vm_read(pid, addrlen_ptr, &addrlen, sizeof(addrlen)); + int rrc = + guest_mem_read(ctx, pid, addrlen_ptr, &addrlen, sizeof(addrlen)); if (rrc < 0) return kbox_dispatch_errno(-rrc); } @@ -2674,19 +3990,19 @@ static struct kbox_dispatch forward_recvfrom( if (ret < 0) return kbox_dispatch_from_lkl(ret); - int wrc = kbox_vm_write(pid, buf_ptr, buf, (size_t) ret); + int wrc = guest_mem_write(ctx, pid, buf_ptr, buf, (size_t) ret); if (wrc < 0) return kbox_dispatch_errno(-wrc); if (src_ptr != 0 && out_addrlen > 0) { unsigned int write_len = out_addrlen < addrlen ? out_addrlen : addrlen; - wrc = kbox_vm_write(pid, src_ptr, addr, write_len); + wrc = guest_mem_write(ctx, pid, src_ptr, addr, write_len); if (wrc < 0) return kbox_dispatch_errno(-wrc); } if (addrlen_ptr != 0) { - wrc = - kbox_vm_write(pid, addrlen_ptr, &out_addrlen, sizeof(out_addrlen)); + wrc = guest_mem_write(ctx, pid, addrlen_ptr, &out_addrlen, + sizeof(out_addrlen)); if (wrc < 0) return kbox_dispatch_errno(-wrc); } @@ -2701,17 +4017,17 @@ static struct kbox_dispatch forward_recvfrom( * args[0]=fd, args[1]=msg_ptr, args[2]=flags */ static struct kbox_dispatch forward_recvmsg( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = resolve_lkl_socket(ctx, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); - pid_t pid = notif->pid; - uint64_t msg_ptr = notif->data.args[1]; - long flags = 
to_c_long_arg(notif->data.args[2]); + pid_t pid = kbox_syscall_request_pid(req); + uint64_t msg_ptr = kbox_syscall_request_arg(req, 1); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 2)); if (msg_ptr == 0) return kbox_dispatch_errno(EFAULT); @@ -2726,7 +4042,7 @@ static struct kbox_dispatch forward_recvmsg( uint64_t msg_controllen; int msg_flags; } mh; - int rrc = kbox_vm_read(pid, msg_ptr, &mh, sizeof(mh)); + int rrc = guest_mem_read(ctx, pid, msg_ptr, &mh, sizeof(mh)); if (rrc < 0) return kbox_dispatch_errno(-rrc); @@ -2746,7 +4062,7 @@ static struct kbox_dispatch forward_recvmsg( uint64_t iov_base; uint64_t iov_len; } iovs[64]; - rrc = kbox_vm_read(pid, mh.msg_iov, iovs, niov * sizeof(iovs[0])); + rrc = guest_mem_read(ctx, pid, mh.msg_iov, iovs, niov * sizeof(iovs[0])); if (rrc < 0) return kbox_dispatch_errno(-rrc); @@ -2775,8 +4091,8 @@ static struct kbox_dispatch forward_recvmsg( if (chunk > (size_t) iovs[v].iov_len) chunk = (size_t) iovs[v].iov_len; if (chunk > 0 && iovs[v].iov_base != 0) { - int wrc2 = - kbox_vm_write(pid, iovs[v].iov_base, buf + written, chunk); + int wrc2 = guest_mem_write(ctx, pid, iovs[v].iov_base, + buf + written, chunk); if (wrc2 < 0) return kbox_dispatch_errno(-wrc2); written += chunk; @@ -2787,14 +4103,15 @@ static struct kbox_dispatch forward_recvmsg( if (out_addrlen > 0) { unsigned int write_len = out_addrlen < mh.msg_namelen ? out_addrlen : mh.msg_namelen; - int awrc = kbox_vm_write(pid, mh.msg_name, addr, write_len); + int awrc = guest_mem_write(ctx, pid, mh.msg_name, addr, write_len); if (awrc < 0) return kbox_dispatch_errno(-awrc); } /* Update msg_namelen in the msghdr. 
*/ - int nwrc = kbox_vm_write(pid, msg_ptr + 8 /* offset of msg_namelen */, - &out_addrlen, sizeof(out_addrlen)); + int nwrc = + guest_mem_write(ctx, pid, msg_ptr + 8 /* offset of msg_namelen */, + &out_addrlen, sizeof(out_addrlen)); if (nwrc < 0) return kbox_dispatch_errno(-nwrc); @@ -2804,12 +4121,12 @@ static struct kbox_dispatch forward_recvmsg( /* forward_clock_gettime. */ static struct kbox_dispatch forward_clock_gettime( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; - int clockid = (int) to_c_long_arg(notif->data.args[0]); - uint64_t remote_ts = notif->data.args[1]; + pid_t pid = kbox_syscall_request_pid(req); + int clockid = (int) to_c_long_arg(kbox_syscall_request_arg(req, 0)); + uint64_t remote_ts = kbox_syscall_request_arg(req, 1); if (remote_ts == 0) return kbox_dispatch_errno(EFAULT); @@ -2818,7 +4135,7 @@ static struct kbox_dispatch forward_clock_gettime( if (clock_gettime(clockid, &ts) < 0) return kbox_dispatch_errno(errno); - int wrc = kbox_vm_write(pid, remote_ts, &ts, sizeof(ts)); + int wrc = guest_mem_write(ctx, pid, remote_ts, &ts, sizeof(ts)); if (wrc < 0) return kbox_dispatch_errno(-wrc); @@ -2828,19 +4145,19 @@ static struct kbox_dispatch forward_clock_gettime( /* forward_clock_getres. */ static struct kbox_dispatch forward_clock_getres( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; - int clockid = (int) to_c_long_arg(notif->data.args[0]); - uint64_t remote_ts = notif->data.args[1]; + pid_t pid = kbox_syscall_request_pid(req); + int clockid = (int) to_c_long_arg(kbox_syscall_request_arg(req, 0)); + uint64_t remote_ts = kbox_syscall_request_arg(req, 1); struct timespec ts; if (clock_getres(clockid, remote_ts ? 
&ts : NULL) < 0) return kbox_dispatch_errno(errno); if (remote_ts != 0) { - int wrc = kbox_vm_write(pid, remote_ts, &ts, sizeof(ts)); + int wrc = guest_mem_write(ctx, pid, remote_ts, &ts, sizeof(ts)); if (wrc < 0) return kbox_dispatch_errno(-wrc); } @@ -2851,12 +4168,12 @@ static struct kbox_dispatch forward_clock_getres( /* forward_gettimeofday. */ static struct kbox_dispatch forward_gettimeofday( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; - uint64_t remote_tv = notif->data.args[0]; - uint64_t remote_tz = notif->data.args[1]; + pid_t pid = kbox_syscall_request_pid(req); + uint64_t remote_tv = kbox_syscall_request_arg(req, 0); + uint64_t remote_tz = kbox_syscall_request_arg(req, 1); /* Use clock_gettime(CLOCK_REALTIME) as the underlying source, which * works on both x86_64 and aarch64. @@ -2873,7 +4190,7 @@ static struct kbox_dispatch forward_gettimeofday( tv.tv_sec = ts.tv_sec; tv.tv_usec = ts.tv_nsec / 1000; - int wrc = kbox_vm_write(pid, remote_tv, &tv, sizeof(tv)); + int wrc = guest_mem_write(ctx, pid, remote_tv, &tv, sizeof(tv)); if (wrc < 0) return kbox_dispatch_errno(-wrc); } @@ -2885,7 +4202,7 @@ static struct kbox_dispatch forward_gettimeofday( int tz_dsttime; } tz = {0, 0}; - int wrc = kbox_vm_write(pid, remote_tz, &tz, sizeof(tz)); + int wrc = guest_mem_write(ctx, pid, remote_tz, &tz, sizeof(tz)); if (wrc < 0) return kbox_dispatch_errno(-wrc); } @@ -2896,21 +4213,19 @@ static struct kbox_dispatch forward_gettimeofday( /* forward_readlinkat. 
*/ static struct kbox_dispatch forward_readlinkat( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; - long dirfd_raw = to_dirfd_arg(notif->data.args[0]); + pid_t pid = kbox_syscall_request_pid(req); + long dirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 0)); char pathbuf[KBOX_MAX_PATH]; - int rc; - - rc = - kbox_vm_read_string(pid, notif->data.args[1], pathbuf, sizeof(pathbuf)); + int rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 1), + pathbuf, sizeof(pathbuf)); if (rc < 0) return kbox_dispatch_errno(-rc); - uint64_t remote_buf = notif->data.args[2]; - int64_t bufsiz_raw = to_c_long_arg(notif->data.args[3]); + uint64_t remote_buf = kbox_syscall_request_arg(req, 2); + int64_t bufsiz_raw = to_c_long_arg(kbox_syscall_request_arg(req, 3)); if (bufsiz_raw < 0) return kbox_dispatch_errno(EINVAL); size_t bufsiz = (size_t) bufsiz_raw; @@ -2938,7 +4253,7 @@ static struct kbox_dispatch forward_readlinkat( return kbox_dispatch_errno((int) (-ret)); size_t n = (size_t) ret; - int wrc = kbox_vm_write(pid, remote_buf, linkbuf, n); + int wrc = guest_mem_write(ctx, pid, remote_buf, linkbuf, n); if (wrc < 0) return kbox_dispatch_errno(-wrc); @@ -2948,12 +4263,12 @@ static struct kbox_dispatch forward_readlinkat( /* forward_pipe2. */ static struct kbox_dispatch forward_pipe2( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; - uint64_t remote_pipefd = notif->data.args[0]; - long flags = to_c_long_arg(notif->data.args[1]); + pid_t pid = kbox_syscall_request_pid(req); + uint64_t remote_pipefd = kbox_syscall_request_arg(req, 0); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 1)); if (remote_pipefd == 0) return kbox_dispatch_errno(EFAULT); @@ -2972,16 +4287,14 @@ static struct kbox_dispatch forward_pipe2( uint32_t cloexec_flag = (flags & O_CLOEXEC) ? 
O_CLOEXEC : 0; - int tracee_fd0 = kbox_notify_addfd(ctx->listener_fd, notif->id, - host_pipefd[0], cloexec_flag); + int tracee_fd0 = request_addfd(ctx, req, host_pipefd[0], cloexec_flag); if (tracee_fd0 < 0) { close(host_pipefd[0]); close(host_pipefd[1]); return kbox_dispatch_errno(-tracee_fd0); } - int tracee_fd1 = kbox_notify_addfd(ctx->listener_fd, notif->id, - host_pipefd[1], cloexec_flag); + int tracee_fd1 = request_addfd(ctx, req, host_pipefd[1], cloexec_flag); if (tracee_fd1 < 0) { close(host_pipefd[0]); close(host_pipefd[1]); @@ -2993,7 +4306,8 @@ static struct kbox_dispatch forward_pipe2( close(host_pipefd[1]); int guest_fds[2] = {tracee_fd0, tracee_fd1}; - int wrc = kbox_vm_write(pid, remote_pipefd, guest_fds, sizeof(guest_fds)); + int wrc = + guest_mem_write(ctx, pid, remote_pipefd, guest_fds, sizeof(guest_fds)); if (wrc < 0) return kbox_dispatch_errno(-wrc); @@ -3003,11 +4317,11 @@ static struct kbox_dispatch forward_pipe2( /* forward_uname. */ static struct kbox_dispatch forward_uname( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; - uint64_t remote_buf = notif->data.args[0]; + pid_t pid = kbox_syscall_request_pid(req); + uint64_t remote_buf = kbox_syscall_request_arg(req, 0); if (remote_buf == 0) return kbox_dispatch_errno(EFAULT); @@ -3026,7 +4340,7 @@ static struct kbox_dispatch forward_uname( snprintf(uts.machine, sizeof(uts.machine), "unknown"); #endif - int wrc = kbox_vm_write(pid, remote_buf, &uts, sizeof(uts)); + int wrc = guest_mem_write(ctx, pid, remote_buf, &uts, sizeof(uts)); if (wrc < 0) return kbox_dispatch_errno(-wrc); @@ -3036,12 +4350,12 @@ static struct kbox_dispatch forward_uname( /* forward_getrandom. 
*/ static struct kbox_dispatch forward_getrandom( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; - uint64_t remote_buf = notif->data.args[0]; - int64_t buflen_raw = to_c_long_arg(notif->data.args[1]); + pid_t pid = kbox_syscall_request_pid(req); + uint64_t remote_buf = kbox_syscall_request_arg(req, 0); + int64_t buflen_raw = to_c_long_arg(kbox_syscall_request_arg(req, 1)); if (buflen_raw < 0) return kbox_dispatch_errno(EINVAL); @@ -3074,7 +4388,7 @@ static struct kbox_dispatch forward_getrandom( return kbox_dispatch_errno((int) (-ret)); size_t n = (size_t) ret; - int wrc = kbox_vm_write(pid, remote_buf, scratch, n); + int wrc = guest_mem_write(ctx, pid, remote_buf, scratch, n); if (wrc < 0) return kbox_dispatch_errno(-wrc); @@ -3097,13 +4411,13 @@ static struct kbox_dispatch forward_getrandom( #define SYSLOG_ACTION_SIZE_BUFFER 10 static struct kbox_dispatch forward_syslog( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; - long type = to_c_long_arg(notif->data.args[0]); - uint64_t remote_buf = notif->data.args[1]; - long len = to_c_long_arg(notif->data.args[2]); + pid_t pid = kbox_syscall_request_pid(req); + long type = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + uint64_t remote_buf = kbox_syscall_request_arg(req, 1); + long len = to_c_long_arg(kbox_syscall_request_arg(req, 2)); int needs_buf = (type == SYSLOG_ACTION_READ || type == SYSLOG_ACTION_READ_ALL || @@ -3144,7 +4458,7 @@ static struct kbox_dispatch forward_syslog( return kbox_dispatch_errno((int) (-ret)); size_t n = (size_t) ret; - int wrc = kbox_vm_write(pid, remote_buf, scratch, n); + int wrc = guest_mem_write(ctx, pid, remote_buf, scratch, n); if (wrc < 0) return kbox_dispatch_errno(-wrc); @@ -3168,10 +4482,10 @@ static struct kbox_dispatch forward_syslog( #endif static struct kbox_dispatch forward_prctl( 
- const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long option = to_c_long_arg(notif->data.args[0]); + long option = to_c_long_arg(kbox_syscall_request_arg(req, 0)); /* Block PR_SET_DUMPABLE(0): clearing dumpability makes process_vm_readv * fail, which would bypass clone3 namespace-flag sanitization (the @@ -3179,7 +4493,8 @@ static struct kbox_dispatch forward_prctl( * Return success without actually clearing; the tracee thinks it * worked, but the supervisor retains read access. */ - if (option == PR_SET_DUMPABLE && to_c_long_arg(notif->data.args[1]) == 0) + if (option == PR_SET_DUMPABLE && + to_c_long_arg(kbox_syscall_request_arg(req, 1)) == 0) return kbox_dispatch_value(0); /* Match: report dumpable even if guest tried to clear it. */ if (option == PR_GET_DUMPABLE) @@ -3195,15 +4510,15 @@ static struct kbox_dispatch forward_prctl( if (option != PR_SET_NAME && option != PR_GET_NAME) return kbox_dispatch_continue(); - pid_t pid = notif->pid; - uint64_t remote_name = notif->data.args[1]; + pid_t pid = kbox_syscall_request_pid(req); + uint64_t remote_name = kbox_syscall_request_arg(req, 1); if (remote_name == 0) return kbox_dispatch_errno(EFAULT); /* PR_SET_NAME: read 16-byte name from tracee, pass local copy to LKL. 
*/ if (option == PR_SET_NAME) { char name[16]; - int rrc = kbox_vm_read(pid, remote_name, name, sizeof(name)); + int rrc = guest_mem_read(ctx, pid, remote_name, name, sizeof(name)); if (rrc < 0) return kbox_dispatch_errno(-rrc); name[15] = '\0'; /* ensure NUL termination */ @@ -3218,7 +4533,7 @@ static struct kbox_dispatch forward_prctl( lkl_syscall6(ctx->sysnrs->prctl, option, (long) name, 0, 0, 0, 0); if (ret < 0) return kbox_dispatch_from_lkl(ret); - int wrc = kbox_vm_write(pid, remote_name, name, sizeof(name)); + int wrc = guest_mem_write(ctx, pid, remote_name, name, sizeof(name)); if (wrc < 0) return kbox_dispatch_errno(-wrc); return kbox_dispatch_value(0); @@ -3227,10 +4542,10 @@ static struct kbox_dispatch forward_prctl( /* forward_umask. */ static struct kbox_dispatch forward_umask( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long mask = to_c_long_arg(notif->data.args[0]); + long mask = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long ret = kbox_lkl_umask(ctx->sysnrs, mask); return kbox_dispatch_from_lkl(ret); } @@ -3238,35 +4553,37 @@ static struct kbox_dispatch forward_umask( /* forward_pwrite64. 
*/ static struct kbox_dispatch forward_pwrite64( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); + if (lkl_fd < 0) return kbox_dispatch_continue(); + if (entry && entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW) + return kbox_dispatch_continue(); - uint64_t remote_buf = notif->data.args[1]; - int64_t count_raw = to_c_long_arg(notif->data.args[2]); + uint64_t remote_buf = kbox_syscall_request_arg(req, 1); + int64_t count_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); if (count_raw < 0) return kbox_dispatch_errno(EINVAL); size_t count = (size_t) count_raw; - long offset = to_c_long_arg(notif->data.args[3]); + long offset = to_c_long_arg(kbox_syscall_request_arg(req, 3)); if (remote_buf == 0) return kbox_dispatch_errno(EFAULT); if (count == 0) return kbox_dispatch_value(0); - pid_t pid = notif->pid; + pid_t pid = kbox_syscall_request_pid(req); size_t max_count = 1024 * 1024; if (count > max_count) count = max_count; size_t total = 0; - uint8_t *scratch = malloc(KBOX_IO_CHUNK_LEN); - if (!scratch) - return kbox_dispatch_errno(ENOMEM); + uint8_t *scratch = dispatch_scratch; while (total < count) { size_t chunk_len = KBOX_IO_CHUNK_LEN; @@ -3274,11 +4591,10 @@ static struct kbox_dispatch forward_pwrite64( chunk_len = count - total; uint64_t remote = remote_buf + total; - int rrc = kbox_vm_read(pid, remote, scratch, chunk_len); + int rrc = guest_mem_read(ctx, pid, remote, scratch, chunk_len); if (rrc < 0) { if (total > 0) break; - free(scratch); return kbox_dispatch_errno(-rrc); } @@ -3286,7 +4602,6 @@ static struct kbox_dispatch forward_pwrite64( (long) chunk_len, offset + (long) total); if (ret < 0) { if (total == 0) { - free(scratch); return kbox_dispatch_errno((int) 
(-ret)); } break; @@ -3298,7 +4613,8 @@ static struct kbox_dispatch forward_pwrite64( break; } - free(scratch); + if (total > 0) + invalidate_path_shadow_cache(ctx); return kbox_dispatch_value((int64_t) total); } @@ -3308,20 +4624,27 @@ * On 64-bit: 16 bytes per entry. */ #define IOV_ENTRY_SIZE 16 +/* Match the kernel's UIO_MAXIOV. The iov_scratch buffer is static (not + * stack-allocated) because in trap/rewrite mode dispatch runs in signal + * handler context where 16 KB on the stack risks overflow on threads with + * small stacks. The dispatcher is single-threaded (documented invariant), + * so a static buffer is safe. + */ #define IOV_MAX_COUNT 1024 +static uint8_t iov_scratch[IOV_MAX_COUNT * IOV_ENTRY_SIZE]; static struct kbox_dispatch forward_writev( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); - pid_t pid = notif->pid; - uint64_t remote_iov = notif->data.args[1]; - int64_t iovcnt_raw = to_c_long_arg(notif->data.args[2]); + pid_t pid = kbox_syscall_request_pid(req); + uint64_t remote_iov = kbox_syscall_request_arg(req, 1); + int64_t iovcnt_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); if (iovcnt_raw <= 0 || iovcnt_raw > IOV_MAX_COUNT) return kbox_dispatch_errno(EINVAL); @@ -3330,31 +4653,23 @@ static struct kbox_dispatch forward_writev( int iovcnt = (int) iovcnt_raw; size_t iov_bytes = (size_t) iovcnt * IOV_ENTRY_SIZE; - uint8_t *iov_buf = malloc(iov_bytes); - if (!iov_buf) - return kbox_dispatch_errno(ENOMEM); - int rrc = kbox_vm_read(pid, remote_iov, iov_buf, iov_bytes); + int rrc = guest_mem_read(ctx, pid, remote_iov, iov_scratch, iov_bytes); if (rrc < 0) { - free(iov_buf); return kbox_dispatch_errno(-rrc); } int mirror_host =
kbox_fd_table_mirror_tty(ctx->fd_table, fd); size_t total = 0; - uint8_t *scratch = malloc(KBOX_IO_CHUNK_LEN); - if (!scratch) { - free(iov_buf); - return kbox_dispatch_errno(ENOMEM); - } + uint8_t *scratch = dispatch_scratch; int err = 0; int i; for (i = 0; i < iovcnt; i++) { uint64_t base; uint64_t len; - memcpy(&base, &iov_buf[i * IOV_ENTRY_SIZE], 8); - memcpy(&len, &iov_buf[i * IOV_ENTRY_SIZE + 8], 8); + memcpy(&base, &iov_scratch[i * IOV_ENTRY_SIZE], 8); + memcpy(&len, &iov_scratch[i * IOV_ENTRY_SIZE + 8], 8); if (base == 0 || len == 0) continue; @@ -3365,7 +4680,7 @@ static struct kbox_dispatch forward_writev( if (chunk > len - seg_total) chunk = len - seg_total; - rrc = kbox_vm_read(pid, base + seg_total, scratch, chunk); + rrc = guest_mem_read(ctx, pid, base + seg_total, scratch, chunk); if (rrc < 0) { err = -rrc; goto done; @@ -3390,8 +4705,8 @@ static struct kbox_dispatch forward_writev( } done: - free(scratch); - free(iov_buf); + if (total > 0) + invalidate_path_shadow_cache(ctx); if (total == 0 && err) return kbox_dispatch_errno(err); return kbox_dispatch_value((int64_t) total); @@ -3400,17 +4715,17 @@ static struct kbox_dispatch forward_writev( /* forward_readv. 
*/ static struct kbox_dispatch forward_readv( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); - pid_t pid = notif->pid; - uint64_t remote_iov = notif->data.args[1]; - int64_t iovcnt_raw = to_c_long_arg(notif->data.args[2]); + pid_t pid = kbox_syscall_request_pid(req); + uint64_t remote_iov = kbox_syscall_request_arg(req, 1); + int64_t iovcnt_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); if (iovcnt_raw <= 0 || iovcnt_raw > IOV_MAX_COUNT) return kbox_dispatch_errno(EINVAL); @@ -3419,29 +4734,21 @@ static struct kbox_dispatch forward_readv( int iovcnt = (int) iovcnt_raw; size_t iov_bytes = (size_t) iovcnt * IOV_ENTRY_SIZE; - uint8_t *iov_buf = malloc(iov_bytes); - if (!iov_buf) - return kbox_dispatch_errno(ENOMEM); - int rrc = kbox_vm_read(pid, remote_iov, iov_buf, iov_bytes); + int rrc = guest_mem_read(ctx, pid, remote_iov, iov_scratch, iov_bytes); if (rrc < 0) { - free(iov_buf); return kbox_dispatch_errno(-rrc); } size_t total = 0; - uint8_t *scratch = malloc(KBOX_IO_CHUNK_LEN); - if (!scratch) { - free(iov_buf); - return kbox_dispatch_errno(ENOMEM); - } + uint8_t *scratch = dispatch_scratch; int i; for (i = 0; i < iovcnt; i++) { uint64_t base; uint64_t len; - memcpy(&base, &iov_buf[i * IOV_ENTRY_SIZE], 8); - memcpy(&len, &iov_buf[i * IOV_ENTRY_SIZE + 8], 8); + memcpy(&base, &iov_scratch[i * IOV_ENTRY_SIZE], 8); + memcpy(&len, &iov_scratch[i * IOV_ENTRY_SIZE + 8], 8); if (base == 0 || len == 0) continue; @@ -3456,8 +4763,6 @@ static struct kbox_dispatch forward_readv( kbox_lkl_read(ctx->sysnrs, lkl_fd, scratch, (long) chunk); if (ret < 0) { if (total == 0) { - free(scratch); - free(iov_buf); return kbox_dispatch_errno((int) (-ret)); } goto done_readv; @@ -3467,10 +4772,8 @@ 
static struct kbox_dispatch forward_readv( if (n == 0) goto done_readv; - int wrc = kbox_vm_write(pid, base + seg_total, scratch, n); + int wrc = guest_mem_write(ctx, pid, base + seg_total, scratch, n); if (wrc < 0) { - free(scratch); - free(iov_buf); return kbox_dispatch_errno(-wrc); } @@ -3482,62 +4785,70 @@ static struct kbox_dispatch forward_readv( } done_readv: - free(scratch); - free(iov_buf); return kbox_dispatch_value((int64_t) total); } /* forward_ftruncate. */ static struct kbox_dispatch forward_ftruncate( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); + if (entry && entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW) + return kbox_dispatch_continue(); - long length = to_c_long_arg(notif->data.args[1]); + long length = to_c_long_arg(kbox_syscall_request_arg(req, 1)); long ret = kbox_lkl_ftruncate(ctx->sysnrs, lkl_fd, length); + if (ret >= 0) + invalidate_path_shadow_cache(ctx); return kbox_dispatch_from_lkl(ret); } /* forward_fallocate. 
*/ static struct kbox_dispatch forward_fallocate( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); + if (entry && entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW) + return kbox_dispatch_continue(); - long mode = to_c_long_arg(notif->data.args[1]); - long offset = to_c_long_arg(notif->data.args[2]); - long len = to_c_long_arg(notif->data.args[3]); + long mode = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long offset = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + long len = to_c_long_arg(kbox_syscall_request_arg(req, 3)); long ret = kbox_lkl_fallocate(ctx->sysnrs, lkl_fd, mode, offset, len); if (ret == -ENOSYS) return kbox_dispatch_errno(ENOSYS); + if (ret >= 0) + invalidate_path_shadow_cache(ctx); return kbox_dispatch_from_lkl(ret); } /* forward_flock. */ static struct kbox_dispatch forward_flock( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); - long operation = to_c_long_arg(notif->data.args[1]); + long operation = to_c_long_arg(kbox_syscall_request_arg(req, 1)); long ret = kbox_lkl_flock(ctx->sysnrs, lkl_fd, operation); return kbox_dispatch_from_lkl(ret); } @@ -3545,14 +4856,21 @@ static struct kbox_dispatch forward_flock( /* forward_fsync. 
*/ static struct kbox_dispatch forward_fsync( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); + if (entry && entry->shadow_writeback) { + int rc = sync_shadow_writeback(ctx, entry); + if (rc < 0) + return kbox_dispatch_errno(-rc); + return kbox_dispatch_value(0); + } long ret = kbox_lkl_fsync(ctx->sysnrs, lkl_fd); return kbox_dispatch_from_lkl(ret); @@ -3561,14 +4879,21 @@ static struct kbox_dispatch forward_fsync( /* forward_fdatasync. */ static struct kbox_dispatch forward_fdatasync( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - long fd = to_c_long_arg(notif->data.args[0]); + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); if (lkl_fd < 0) return kbox_dispatch_continue(); + if (entry && entry->shadow_writeback) { + int rc = sync_shadow_writeback(ctx, entry); + if (rc < 0) + return kbox_dispatch_errno(-rc); + return kbox_dispatch_value(0); + } long ret = kbox_lkl_fdatasync(ctx->sysnrs, lkl_fd); return kbox_dispatch_from_lkl(ret); @@ -3576,10 +4901,10 @@ static struct kbox_dispatch forward_fdatasync( /* forward_sync. 
*/ -static struct kbox_dispatch forward_sync(const struct kbox_seccomp_notif *notif, +static struct kbox_dispatch forward_sync(const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - (void) notif; + (void) req; long ret = kbox_lkl_sync(ctx->sysnrs); return kbox_dispatch_from_lkl(ret); } @@ -3587,23 +4912,23 @@ static struct kbox_dispatch forward_sync(const struct kbox_seccomp_notif *notif, /* forward_symlinkat. */ static struct kbox_dispatch forward_symlinkat( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; + pid_t pid = kbox_syscall_request_pid(req); char targetbuf[KBOX_MAX_PATH]; char linkpathbuf[KBOX_MAX_PATH]; int rc; - rc = kbox_vm_read_string(pid, notif->data.args[0], targetbuf, - sizeof(targetbuf)); + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 0), + targetbuf, sizeof(targetbuf)); if (rc < 0) return kbox_dispatch_errno(-rc); - long newdirfd_raw = to_dirfd_arg(notif->data.args[1]); + long newdirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 1)); - rc = kbox_vm_read_string(pid, notif->data.args[2], linkpathbuf, - sizeof(linkpathbuf)); + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 2), + linkpathbuf, sizeof(linkpathbuf)); if (rc < 0) return kbox_dispatch_errno(-rc); @@ -3619,34 +4944,36 @@ static struct kbox_dispatch forward_symlinkat( /* Target is stored as-is (not translated). */ long ret = kbox_lkl_symlinkat(ctx->sysnrs, targetbuf, newdirfd, linktrans); + if (ret >= 0) + invalidate_path_shadow_cache(ctx); return kbox_dispatch_from_lkl(ret); } /* forward_linkat. 
*/ static struct kbox_dispatch forward_linkat( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = notif->pid; - long olddirfd_raw = to_dirfd_arg(notif->data.args[0]); + pid_t pid = kbox_syscall_request_pid(req); + long olddirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 0)); char oldpathbuf[KBOX_MAX_PATH]; int rc; - rc = kbox_vm_read_string(pid, notif->data.args[1], oldpathbuf, - sizeof(oldpathbuf)); + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 1), + oldpathbuf, sizeof(oldpathbuf)); if (rc < 0) return kbox_dispatch_errno(-rc); - long newdirfd_raw = to_dirfd_arg(notif->data.args[2]); + long newdirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 2)); char newpathbuf[KBOX_MAX_PATH]; - rc = kbox_vm_read_string(pid, notif->data.args[3], newpathbuf, - sizeof(newpathbuf)); + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 3), + newpathbuf, sizeof(newpathbuf)); if (rc < 0) return kbox_dispatch_errno(-rc); - long flags = to_c_long_arg(notif->data.args[4]); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 4)); char oldtrans[KBOX_MAX_PATH]; rc = kbox_translate_path_for_lkl(pid, oldpathbuf, ctx->host_root, oldtrans, @@ -3660,222 +4987,758 @@ static struct kbox_dispatch forward_linkat( if (rc < 0) return kbox_dispatch_errno(-rc); - long olddirfd = resolve_open_dirfd(oldtrans, olddirfd_raw, ctx->fd_table); - if (olddirfd < 0 && olddirfd != AT_FDCWD_LINUX) - return kbox_dispatch_continue(); + long olddirfd = resolve_open_dirfd(oldtrans, olddirfd_raw, ctx->fd_table); + if (olddirfd < 0 && olddirfd != AT_FDCWD_LINUX) + return kbox_dispatch_continue(); + + long newdirfd = resolve_open_dirfd(newtrans, newdirfd_raw, ctx->fd_table); + if (newdirfd < 0 && newdirfd != AT_FDCWD_LINUX) + return kbox_dispatch_continue(); + + long ret = kbox_lkl_linkat(ctx->sysnrs, olddirfd, oldtrans, newdirfd, + newtrans, flags); + if (ret >= 0) + 
invalidate_path_shadow_cache(ctx); + return kbox_dispatch_from_lkl(ret); +} + +/* forward_utimensat. */ + +/* struct timespec is 16 bytes on 64-bit: tv_sec(8) + tv_nsec(8). */ +#define TIMESPEC_SIZE 16 + +static struct kbox_dispatch forward_utimensat( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + pid_t pid = kbox_syscall_request_pid(req); + long dirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 0)); + + /* pathname can be NULL for utimensat (operates on dirfd itself). In + * that case args[1] == 0. + */ + char pathbuf[KBOX_MAX_PATH]; + const char *translated_path = NULL; + char translated[KBOX_MAX_PATH]; + long lkl_dirfd; + int rc; + + if (kbox_syscall_request_arg(req, 1) != 0) { + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 1), + pathbuf, sizeof(pathbuf)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + rc = kbox_translate_path_for_lkl(pid, pathbuf, ctx->host_root, + translated, sizeof(translated)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + translated_path = translated; + lkl_dirfd = resolve_open_dirfd(translated, dirfd_raw, ctx->fd_table); + if (lkl_dirfd < 0 && lkl_dirfd != AT_FDCWD_LINUX) + return kbox_dispatch_continue(); + } else { + translated_path = NULL; + /* dirfd must be a virtual FD when path is NULL. */ + lkl_dirfd = kbox_fd_table_get_lkl(ctx->fd_table, dirfd_raw); + if (lkl_dirfd < 0) + return kbox_dispatch_continue(); + } + + /* Read the times array (2 x struct timespec) if provided. 
*/ + uint8_t times_buf[TIMESPEC_SIZE * 2]; + const void *times = NULL; + if (kbox_syscall_request_arg(req, 2) != 0) { + rc = guest_mem_read(ctx, pid, kbox_syscall_request_arg(req, 2), + times_buf, sizeof(times_buf)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + times = times_buf; + } + + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 3)); + long ret = kbox_lkl_utimensat(ctx->sysnrs, lkl_dirfd, translated_path, + times, flags); + if (ret >= 0) + invalidate_path_shadow_cache(ctx); + return kbox_dispatch_from_lkl(ret); +} + +/* forward_ioctl. */ + +/* Terminal ioctl constants. */ +#ifndef TCGETS +#define TCGETS 0x5401 +#endif +#ifndef TCSETS +#define TCSETS 0x5402 +#endif +#ifndef TIOCGWINSZ +#define TIOCGWINSZ 0x5413 +#endif +#ifndef TIOCSWINSZ +#define TIOCSWINSZ 0x5414 +#endif +#ifndef TIOCGPGRP +#define TIOCGPGRP 0x540F +#endif +#ifndef TIOCSPGRP +#define TIOCSPGRP 0x5410 +#endif +#ifndef TIOCSCTTY +#define TIOCSCTTY 0x540E +#endif + +static struct kbox_dispatch forward_ioctl( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long cmd = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + + if (lkl_fd < 0) { + /* Host FD (stdin/stdout/stderr or pipe). Most ioctls pass through + * to the host kernel. However, job-control ioctls (TIOCSPGRP/ + * TIOCGPGRP) fail with EPERM under seccomp-unotify because the + * supervised child is not the session leader. Return ENOTTY so + * shells fall back to non-job-control mode instead of aborting. + */ + if (cmd == TIOCSPGRP || cmd == TIOCGPGRP || cmd == TIOCSCTTY) + return kbox_dispatch_errno(ENOTTY); + return kbox_dispatch_continue(); + } + + (void) lkl_fd; + + /* For virtual FDs backed by LKL, terminal ioctls return ENOTTY since + * LKL file-backed FDs are not terminals. Non-terminal ioctls also + * return ENOTTY, matching regular-file semantics. 
+ */ + return kbox_dispatch_errno(ENOTTY); +} + +/* forward_mmap. */ + +/* mmap dispatch: if the FD is a virtual FD with no host shadow, create + * the shadow on demand (lazy shadow) and inject it into the tracee at + * the same FD number, then CONTINUE so the host kernel mmaps the real fd. + * + * Lazy shadow creation avoids the memfd_create + file-copy cost at every + * open. The shadow is only materialized when the guest actually mmaps. + */ +static struct kbox_dispatch forward_mmap(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + /* W^X enforcement for mmap in trap/rewrite mode. */ + if (request_uses_trap_signals(req)) { + int prot = (int) kbox_syscall_request_arg(req, 2); + if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC)) { + if (ctx->verbose) + fprintf(stderr, + "kbox: mmap denied: W^X violation " + "(prot=0x%x, pid=%u)\n", + prot, kbox_syscall_request_pid(req)); + return kbox_dispatch_errno(EACCES); + } + } + + long fd = to_dirfd_arg(kbox_syscall_request_arg(req, 4)); + + if (fd == -1) + return kbox_dispatch_continue(); + + long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + if (lkl_fd >= 0) { + long host = kbox_fd_table_get_host_fd(ctx->fd_table, fd); + if (host == -1) { + /* Only create lazy shadows for read-only/private mappings. + * Writable MAP_SHARED mappings on LKL files cannot be + * supported via memfd (writes would go to the copy, not LKL). + */ + int mmap_flags = (int) kbox_syscall_request_arg(req, 3); + int mmap_prot = (int) kbox_syscall_request_arg(req, 2); + if ((mmap_flags & MAP_SHARED) && (mmap_prot & PROT_WRITE)) + return kbox_dispatch_errno(ENODEV); + + int memfd = kbox_shadow_create(ctx->sysnrs, lkl_fd); + if (memfd < 0) + return kbox_dispatch_errno(ENODEV); + int injected = request_addfd_at(ctx, req, memfd, (int) fd, 0); + if (injected < 0) { + close(memfd); + return kbox_dispatch_errno(ENODEV); + } + /* Mark that a shadow was injected so repeated mmaps don't + * re-create it. 
Use -2 as a sentinel: host_fd >= 0 means + * "supervisor-owned shadow fd" (closed on remove). host_fd + * == -2 means "tracee-owned shadow, don't close in supervisor." + * fd_table_remove only closes host_fd when host_fd >= 0 AND + * shadow_sp < 0, so -2 is safe. + */ + kbox_fd_table_set_host_fd(ctx->fd_table, fd, + KBOX_FD_HOST_SAME_FD_SHADOW); + { + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); + if (entry) + entry->shadow_sp = memfd; + } + } + } + + return kbox_dispatch_continue(); +} + +/* Identity dispatch helpers */ +/* */ +/* In host+root_identity mode, get* returns 0 and set* returns 0. */ +/* In host+override mode, get* returns the override value. */ +/* In host+neither mode, CONTINUE to host kernel. */ +/* In image mode, forward to LKL. */ + +static struct kbox_dispatch dispatch_get_uid( + long (*lkl_func)(const struct kbox_sysnrs *), + struct kbox_supervisor_ctx *ctx) +{ + if (ctx->host_root) { + if (ctx->root_identity) + return kbox_dispatch_value(0); + if (ctx->override_uid != (uid_t) -1) + return kbox_dispatch_value((int64_t) ctx->override_uid); + return kbox_dispatch_continue(); + } + return kbox_dispatch_from_lkl(lkl_func(ctx->sysnrs)); +} + +static struct kbox_dispatch dispatch_get_gid( + long (*lkl_func)(const struct kbox_sysnrs *), + struct kbox_supervisor_ctx *ctx) +{ + if (ctx->host_root) { + if (ctx->root_identity) + return kbox_dispatch_value(0); + if (ctx->override_gid != (gid_t) -1) + return kbox_dispatch_value((int64_t) ctx->override_gid); + return kbox_dispatch_continue(); + } + return kbox_dispatch_from_lkl(lkl_func(ctx->sysnrs)); +} + +static struct kbox_dispatch dispatch_set_id( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + struct kbox_dispatch (*lkl_forward)(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx)) +{ + if (ctx->host_root) { + if (ctx->root_identity) + return kbox_dispatch_value(0); + return kbox_dispatch_continue(); + } + return lkl_forward(req, 
ctx); +} + +/* forward_execve. */ + +/* AT_EMPTY_PATH flag for execveat: indicates fexecve() usage. Defined + * here to avoid pulling in the full linux/fcntl.h. + */ +#define KBOX_AT_EMPTY_PATH 0x1000 + +/* Load biases for the userspace ELF loader. Must match image.c + * prepare_userspace_launch. The loader places main and interpreter + * ELFs at these fixed virtual addresses, and the stack just below + * stack_top. + */ +#define KBOX_EXEC_MAIN_LOAD_BIAS 0x600000000000ULL +#define KBOX_EXEC_INTERP_LOAD_BIAS 0x610000000000ULL +#define KBOX_EXEC_STACK_TOP 0x700000010000ULL + +/* Alternate stack region for userspace re-exec. During re-exec the + * SIGSYS handler is running on the old guest stack, so we cannot + * unmap it until after transferring to the new binary. Place the + * new stack at a different address; the old stack region is reclaimed + * by the subsequent munmap in teardown_old_guest_mappings during the + * NEXT re-exec. + */ +#define KBOX_EXEC_REEXEC_STACK_TOP 0x6F0000010000ULL + +/* Maximum entries in argv or envp for userspace exec. */ +#define KBOX_EXEC_MAX_ARGS 4096 + +/* Track which stack region is in use by the current guest. The + * initial launch uses KBOX_EXEC_STACK_TOP; re-exec alternates + * between the two addresses. The signal handler runs on the + * current guest's stack, so we must not unmap it during re-exec. + */ +static uint64_t reexec_current_stack_top; + +/* Safely count a null-terminated pointer array in guest address space. + * Uses process_vm_readv to avoid SIGSEGV on bad guest pointers. + * Returns the count (not including the final NULL), or -EFAULT on bad memory. 
+ */ +static long count_user_ptrs_safe(uint64_t arr_addr, size_t max_count) +{ + size_t n = 0; + uint64_t ptr; + int rc; + + if (arr_addr == 0) + return -EFAULT; + + while (n < max_count) { + uint64_t offset, probe_addr; + if (__builtin_mul_overflow((uint64_t) n, sizeof(uint64_t), &offset) || + __builtin_add_overflow(arr_addr, offset, &probe_addr)) + return -EFAULT; + rc = kbox_current_read(probe_addr, &ptr, sizeof(ptr)); + if (rc < 0) + return -EFAULT; + if (ptr == 0) + return (long) n; + n++; + } + + return -E2BIG; +} + +/* Safely measure the length of a guest string. + * Returns the length (not including NUL), or -EFAULT on bad memory. + */ +static long strlen_user_safe(uint64_t str_addr) +{ + char buf[256]; + size_t total = 0; + + if (str_addr == 0) + return -EFAULT; + + for (;;) { + int rc = kbox_current_read(str_addr + total, buf, sizeof(buf)); + if (rc < 0) + return -EFAULT; + for (size_t i = 0; i < sizeof(buf); i++) { + if (buf[i] == '\0') + return (long) (total + i); + } + total += sizeof(buf); + if (total > (size_t) (256 * 1024)) + return -ENAMETOOLONG; + } +} + +/* Safely read a single guest pointer (8 bytes). */ +static int read_user_ptr(uint64_t addr, uint64_t *out) +{ + return kbox_current_read(addr, out, sizeof(*out)); +} + +/* Safely copy a guest string into a destination buffer. + * Returns the string length (not including NUL), or -EFAULT. + */ +static long copy_user_string(uint64_t str_addr, char *dst, size_t dst_size) +{ + return kbox_current_read_string(str_addr, dst, dst_size); +} + +/* Tear down old guest code/data mappings and the stale stack at the + * new stack address. The current guest stack (which the SIGSYS + * handler is running on) is at the OTHER address and left alone. + * It leaks one stack-sized region until the next re-exec cycle. + */ +static void teardown_old_guest_mappings(uint64_t new_stack_top) +{ + /* Main binary region: up to 256 MB from the load bias. 
*/ + munmap((void *) (uintptr_t) KBOX_EXEC_MAIN_LOAD_BIAS, 256UL * 1024 * 1024); + /* Interpreter region: up to 256 MB from the load bias. */ + munmap((void *) (uintptr_t) KBOX_EXEC_INTERP_LOAD_BIAS, + 256UL * 1024 * 1024); + /* Unmap any stale stack at the new stack address. On the first + * re-exec (new = REEXEC), this is a no-op (nothing mapped there). + * On the second re-exec (new = STACK_TOP), this unmaps the + * initial launch stack. Subsequent cycles alternate and reclaim. + */ + munmap((void *) (uintptr_t) (new_stack_top - 16UL * 1024 * 1024), + 16UL * 1024 * 1024 + 0x10000UL); +} + +/* Perform userspace exec for trap mode. Called from inside the SIGSYS + * handler when the guest calls execve/execveat. This replaces the + * current process image without a real exec syscall, preserving the + * SIGSYS handler and seccomp filter chain. + * + * The function is noreturn on success: it transfers control to the new + * binary's entry point. On failure, it returns a dispatch with errno. + */ +static struct kbox_dispatch trap_userspace_exec( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + int exec_memfd, + const char *pathname, + int is_execveat) +{ + unsigned char *elf_buf = NULL; + size_t elf_buf_len = 0; + char interp_path[256]; + int interp_memfd = -1; + int ilen = 0; + struct kbox_loader_launch_spec spec; + struct kbox_loader_launch launch = {0}; + struct kbox_syscall_trap_ip_range ranges[KBOX_LOADER_MAX_MAPPINGS]; + struct kbox_loader_exec_range exec_ranges[KBOX_LOADER_MAX_MAPPINGS]; + size_t exec_count = 0; + size_t range_count = 0; + unsigned char random_bytes[KBOX_LOADER_RANDOM_SIZE]; + + /* execve(path, argv, envp): argv=args[1], envp=args[2] + * execveat(dirfd, path, argv, envp, flags): argv=args[2], envp=args[3] + * + * In trap mode these are guest pointers in our address space, but still + * guest-controlled. 
All accesses must use safe reads (process_vm_readv) + * to return EFAULT on bad pointers instead of crashing the SIGSYS handler. + */ + uint64_t argv_addr = kbox_syscall_request_arg(req, is_execveat ? 2 : 1); + uint64_t envp_addr = kbox_syscall_request_arg(req, is_execveat ? 3 : 2); + long argc_long = count_user_ptrs_safe(argv_addr, KBOX_EXEC_MAX_ARGS); + long envc_long = count_user_ptrs_safe(envp_addr, KBOX_EXEC_MAX_ARGS); + size_t argc, envc; + + if (argc_long < 0) { + close(exec_memfd); + return kbox_dispatch_errno(argc_long == -E2BIG ? EINVAL : EFAULT); + } + if (envc_long < 0) { + close(exec_memfd); + return kbox_dispatch_errno(envc_long == -E2BIG ? EINVAL : EFAULT); + } + argc = (size_t) argc_long; + envc = (size_t) envc_long; + if (argc == 0) { + close(exec_memfd); + return kbox_dispatch_errno(EINVAL); + } + + /* Deep-copy argv and envp into a single mmap'd arena. Using mmap + * instead of malloc/strdup because we are inside the SIGSYS handler + * and glibc's allocator is not async-signal-safe. + * + * Two passes: first measure total size (via safe string length reads), + * then copy. All guest pointer reads use process_vm_readv. 
+ */ + size_t arena_size = (argc + envc) * sizeof(char *); + for (size_t i = 0; i < argc; i++) { + uint64_t str_addr; + long slen; + if (read_user_ptr(argv_addr + i * sizeof(uint64_t), &str_addr) < 0) { + close(exec_memfd); + return kbox_dispatch_errno(EFAULT); + } + slen = strlen_user_safe(str_addr); + if (slen < 0) { + close(exec_memfd); + return kbox_dispatch_errno(EFAULT); + } + arena_size += (size_t) slen + 1; + } + for (size_t i = 0; i < envc; i++) { + uint64_t str_addr; + long slen; + if (read_user_ptr(envp_addr + i * sizeof(uint64_t), &str_addr) < 0) { + close(exec_memfd); + return kbox_dispatch_errno(EFAULT); + } + slen = strlen_user_safe(str_addr); + if (slen < 0) { + close(exec_memfd); + return kbox_dispatch_errno(EFAULT); + } + arena_size += (size_t) slen + 1; + } + arena_size = (arena_size + 4095) & ~(size_t) 4095; + + char *arena = mmap(NULL, arena_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (arena == MAP_FAILED) { + close(exec_memfd); + return kbox_dispatch_errno(ENOMEM); + } + size_t arena_used = 0; + char **argv_copy = (char **) (arena + arena_used); + arena_used += argc * sizeof(char *); + char **envp_copy = (char **) (arena + arena_used); + arena_used += envc * sizeof(char *); + for (size_t i = 0; i < argc; i++) { + uint64_t str_addr; + long slen; + if (read_user_ptr(argv_addr + i * sizeof(uint64_t), &str_addr) < 0) + goto fail_arena; + slen = copy_user_string(str_addr, arena + arena_used, + arena_size - arena_used); + if (slen < 0) + goto fail_arena; + argv_copy[i] = arena + arena_used; + arena_used += (size_t) slen + 1; + } + for (size_t i = 0; i < envc; i++) { + uint64_t str_addr; + long slen; + if (read_user_ptr(envp_addr + i * sizeof(uint64_t), &str_addr) < 0) + goto fail_arena; + slen = copy_user_string(str_addr, arena + arena_used, + arena_size - arena_used); + if (slen < 0) + goto fail_arena; + envp_copy[i] = arena + arena_used; + arena_used += (size_t) slen + 1; + } - long newdirfd = 
resolve_open_dirfd(newtrans, newdirfd_raw, ctx->fd_table); - if (newdirfd < 0 && newdirfd != AT_FDCWD_LINUX) - return kbox_dispatch_continue(); + /* Check for PT_INTERP (dynamic binary needing an interpreter). */ + if (kbox_read_elf_header_window_fd(exec_memfd, &elf_buf, &elf_buf_len) == + 0) { + uint64_t pt_offset, pt_filesz; - long ret = kbox_lkl_linkat(ctx->sysnrs, olddirfd, oldtrans, newdirfd, - newtrans, flags); - return kbox_dispatch_from_lkl(ret); -} + ilen = kbox_find_elf_interp_loc(elf_buf, elf_buf_len, interp_path, + sizeof(interp_path), &pt_offset, + &pt_filesz); + munmap(elf_buf, elf_buf_len); + elf_buf = NULL; -/* forward_utimensat. */ + if (ilen < 0) { + ilen = -ENOEXEC; + goto fail_early; + } -/* struct timespec is 16 bytes on 64-bit: tv_sec(8) + tv_nsec(8). */ -#define TIMESPEC_SIZE 16 + if (ilen > 0) { + long interp_lkl = kbox_lkl_openat(ctx->sysnrs, AT_FDCWD_LINUX, + interp_path, O_RDONLY, 0); + if (interp_lkl < 0) { + if (ctx->verbose) + fprintf(stderr, + "kbox: trap exec %s: cannot open " + "interpreter %s: %s\n", + pathname, interp_path, kbox_err_text(interp_lkl)); + ilen = (int) interp_lkl; + goto fail_early; + } -static struct kbox_dispatch forward_utimensat( - const struct kbox_seccomp_notif *notif, - struct kbox_supervisor_ctx *ctx) -{ - pid_t pid = notif->pid; - long dirfd_raw = to_dirfd_arg(notif->data.args[0]); + interp_memfd = kbox_shadow_create(ctx->sysnrs, interp_lkl); + kbox_lkl_close(ctx->sysnrs, interp_lkl); - /* pathname can be NULL for utimensat (operates on dirfd itself). In - * that case args[1] == 0. + if (interp_memfd < 0) { + ilen = interp_memfd; + goto fail_early; + } + } + } + /* else: kbox_read_elf_header_window_fd failed, elf_buf is still NULL. + * Nothing to unmap. Treat as static binary (no interpreter). 
*/ - char pathbuf[KBOX_MAX_PATH]; - const char *translated_path = NULL; - char translated[KBOX_MAX_PATH]; - long lkl_dirfd; - int rc; - if (notif->data.args[1] != 0) { - rc = kbox_vm_read_string(pid, notif->data.args[1], pathbuf, - sizeof(pathbuf)); - if (rc < 0) - return kbox_dispatch_errno(-rc); + /* Generate random bytes for AT_RANDOM auxv entry. Use the raw + * syscall to avoid depending on sys/random.h availability. + */ + memset(random_bytes, 0x42, sizeof(random_bytes)); +#ifdef __NR_getrandom + { + long gr = + syscall(__NR_getrandom, random_bytes, sizeof(random_bytes), 0); + (void) gr; + } +#endif - rc = kbox_translate_path_for_lkl(pid, pathbuf, ctx->host_root, - translated, sizeof(translated)); - if (rc < 0) - return kbox_dispatch_errno(-rc); + /* Pick a stack address that does not collide with the old guest + * stack (which we are currently running on from inside the SIGSYS + * handler). Alternate between two stack tops so the old one + * survives until the next re-exec reclaims it. + */ + uint64_t new_stack_top = + (reexec_current_stack_top == KBOX_EXEC_REEXEC_STACK_TOP) + ? KBOX_EXEC_STACK_TOP + : KBOX_EXEC_REEXEC_STACK_TOP; - translated_path = translated; - lkl_dirfd = resolve_open_dirfd(translated, dirfd_raw, ctx->fd_table); - if (lkl_dirfd < 0 && lkl_dirfd != AT_FDCWD_LINUX) - return kbox_dispatch_continue(); - } else { - translated_path = NULL; - /* dirfd must be a virtual FD when path is NULL. */ - lkl_dirfd = kbox_fd_table_get_lkl(ctx->fd_table, dirfd_raw); - if (lkl_dirfd < 0) - return kbox_dispatch_continue(); - } + /* Build the loader launch spec. Use the same load biases as the + * initial launch so the address space layout is consistent. 
+ */ + memset(&spec, 0, sizeof(spec)); + spec.exec_fd = exec_memfd; + spec.interp_fd = interp_memfd; + spec.argv = (const char *const *) argv_copy; + spec.argc = argc; + spec.envp = (const char *const *) envp_copy; + spec.envc = envc; + spec.execfn = pathname; + spec.random_bytes = random_bytes; + spec.page_size = (uint64_t) sysconf(_SC_PAGESIZE); + spec.stack_top = new_stack_top; + spec.main_load_bias = KBOX_EXEC_MAIN_LOAD_BIAS; + spec.interp_load_bias = KBOX_EXEC_INTERP_LOAD_BIAS; + spec.uid = ctx->root_identity ? 0 : (uint32_t) getuid(); + spec.euid = ctx->root_identity ? 0 : (uint32_t) getuid(); + spec.gid = ctx->root_identity ? 0 : (uint32_t) getgid(); + spec.egid = ctx->root_identity ? 0 : (uint32_t) getgid(); + spec.secure = 0; + + /* Tear down old guest code/data mappings BEFORE materializing new + * ones (MAP_FIXED_NOREPLACE requires the addresses to be free). + * But do NOT teardown before reading the memfds; the reads use + * pread which doesn't depend on the old mappings. + */ + teardown_old_guest_mappings(new_stack_top); - /* Read the times array (2 x struct timespec) if provided. */ - uint8_t times_buf[TIMESPEC_SIZE * 2]; - const void *times = NULL; - if (notif->data.args[2] != 0) { - rc = kbox_vm_read(pid, notif->data.args[2], times_buf, - sizeof(times_buf)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - times = times_buf; + { + int launch_rc = kbox_loader_prepare_launch(&spec, &launch); + if (launch_rc < 0) { + const char msg[] = "kbox: trap exec: loader prepare failed\n"; + (void) write(STDERR_FILENO, msg, sizeof(msg) - 1); + _exit(127); + } } - long flags = to_c_long_arg(notif->data.args[3]); - long ret = kbox_lkl_utimensat(ctx->sysnrs, lkl_dirfd, translated_path, - times, flags); - return kbox_dispatch_from_lkl(ret); -} + /* The memfds have been read into launch buffers; close them. */ + close(exec_memfd); + if (interp_memfd >= 0) + close(interp_memfd); -/* forward_ioctl. 
*/ + /* Collect executable ranges from the new layout for the BPF + * filter. The new filter is appended to the filter chain; the + * old filter is harmless (matches unmapped addresses). + */ + if (kbox_loader_collect_exec_ranges( + &launch, exec_ranges, KBOX_LOADER_MAX_MAPPINGS, &exec_count) < 0) { + if (ctx->verbose) + fprintf(stderr, "kbox: trap exec %s: cannot collect exec ranges\n", + pathname); + kbox_loader_launch_reset(&launch); + _exit(127); + } + for (size_t i = 0; i < exec_count; i++) { + ranges[i].start = (uintptr_t) exec_ranges[i].start; + ranges[i].end = (uintptr_t) exec_ranges[i].end; + } + range_count = exec_count; -/* Terminal ioctl constants. */ -#ifndef TCGETS -#define TCGETS 0x5401 -#endif -#ifndef TCSETS -#define TCSETS 0x5402 -#endif -#ifndef TIOCGWINSZ -#define TIOCGWINSZ 0x5413 -#endif -#ifndef TIOCSWINSZ -#define TIOCSWINSZ 0x5414 -#endif -#ifndef TIOCGPGRP -#define TIOCGPGRP 0x540F -#endif -#ifndef TIOCSPGRP -#define TIOCSPGRP 0x5410 -#endif -#ifndef TIOCSCTTY -#define TIOCSCTTY 0x540E -#endif + /* Install a new BPF RET_TRAP filter covering the new binary's + * executable ranges. seccomp filters form a chain; calling + * seccomp(SET_MODE_FILTER) adds to it rather than replacing. + */ + if (kbox_install_seccomp_trap_ranges(ctx->host_nrs, ranges, range_count) < + 0) { + if (ctx->verbose) + fprintf(stderr, + "kbox: trap exec %s: cannot install new BPF filter\n", + pathname); + kbox_loader_launch_reset(&launch); + _exit(127); + } -static struct kbox_dispatch forward_ioctl( - const struct kbox_seccomp_notif *notif, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(notif->data.args[0]); - long cmd = to_c_long_arg(notif->data.args[1]); - long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + /* Clean up CLOEXEC entries from the FD table, matching what a + * real exec would do. + */ + kbox_fd_table_close_cloexec(ctx->fd_table, ctx->sysnrs); - if (lkl_fd < 0) { - /* Host FD (stdin/stdout/stderr or pipe). 
Most ioctls pass through - * to the host kernel. However, job-control ioctls (TIOCSPGRP/ - * TIOCGPGRP) fail with EPERM under seccomp-unotify because the - * supervised child is not the session leader. Return ENOTTY so - * shells fall back to non-job-control mode instead of aborting. + /* If the original launch used rewrite mode, re-apply binary rewriting + * to the new binary. This patches syscall instructions in the newly + * loaded executable segments and sets up trampoline regions, promoting + * the new binary from Tier 1 (SIGSYS ~3us) to Tier 2 (~41ns) for + * rewritten sites. + * + * If rewrite installation fails (e.g., trampoline allocation), the + * binary still works correctly via the SIGSYS handler (Tier 1). + */ + if (req->source == KBOX_SYSCALL_SOURCE_REWRITE) { + /* Static: the runtime is stored globally via + * store_active_rewrite_runtime and must survive past the noreturn + * transfer_to_guest. Single-threaded trap mode guarantees no concurrent + * re-exec. */ - if (cmd == TIOCSPGRP || cmd == TIOCGPGRP || cmd == TIOCSCTTY) - return kbox_dispatch_errno(ENOTTY); - return kbox_dispatch_continue(); + static struct kbox_rewrite_runtime rewrite_rt; + kbox_rewrite_runtime_reset(&rewrite_rt); + if (kbox_rewrite_runtime_install(&rewrite_rt, ctx, &launch) == 0) { + if (ctx->verbose) + fprintf(stderr, + "kbox: trap exec %s: rewrite installed " + "(%zu trampoline regions)\n", + pathname, rewrite_rt.trampoline_region_count); + } else { + if (ctx->verbose) + fprintf(stderr, + "kbox: trap exec %s: rewrite failed, " + "falling back to SIGSYS\n", + pathname); + } } - (void) lkl_fd; - - /* For virtual FDs backed by LKL, terminal ioctls return ENOTTY since - * LKL file-backed FDs are not terminals. Non-terminal ioctls also - * return ENOTTY, matching regular-file semantics. +#if defined(__x86_64__) + /* Reset the guest FS base to the host (kbox) FS base. We are + * inside the SIGSYS handler where FS already points to kbox's + * TLS. 
The new binary starts with no TLS set up; it will call + * arch_prctl(ARCH_SET_FS) during libc init to establish its own. + * Until then, SIGSYS handler entry should see FS == host FS and + * the save/restore becomes a no-op, which is correct. */ - return kbox_dispatch_errno(ENOTTY); -} + { + uint64_t host_fs = 0; -/* forward_mmap. */ + kbox_syscall_trap_host_arch_prctl_get_fs(&host_fs); + kbox_syscall_trap_set_guest_fs(host_fs); + } +#endif -/* mmap dispatch: the only case we intercept is a virtual FD with no host - * shadow. Everything else (MAP_ANONYMOUS, shadow FDs, host FDs) passes - * through to the host kernel via CONTINUE. - */ -static struct kbox_dispatch forward_mmap(const struct kbox_seccomp_notif *notif, - struct kbox_supervisor_ctx *ctx) -{ - /* mmap fd is a 32-bit int. In seccomp_data.args[] it is zero-extended - * to uint64_t, so -1 appears as 0xffffffff. Use to_dirfd_arg to - * properly sign-extend from 32 bits. + if (ctx->verbose) + fprintf(stderr, + "kbox: trap exec %s: transferring to new image " + "pc=0x%llx sp=0x%llx\n", + pathname, (unsigned long long) launch.transfer.pc, + (unsigned long long) launch.transfer.sp); + + /* Record which stack the new guest is using. The next re-exec + * will pick the other address and reclaim this one. */ - long fd = to_dirfd_arg(notif->data.args[4]); - - /* MAP_ANONYMOUS: fd is -1, no FD involved. */ - if (fd == -1) - return kbox_dispatch_continue(); + reexec_current_stack_top = new_stack_top; - /* If fd is a virtual FD (tracked in our table) and has no host shadow, - * the host kernel cannot resolve it. Return ENODEV. This covers both - * high-range (>= KBOX_FD_BASE) and low-range (dup2 redirects) virtual - * FDs. + /* Free staging buffers before transferring. The image regions + * (mmap'd guest code/data/stack) must survive. 
*/ - if (kbox_fd_table_get_lkl(ctx->fd_table, fd) >= 0) { - long host = kbox_fd_table_get_host_fd(ctx->fd_table, fd); - if (host < 0) - return kbox_dispatch_errno(ENODEV); + munmap(arena, arena_size); + if (launch.main_elf && launch.main_elf_len > 0) + munmap(launch.main_elf, launch.main_elf_len); + launch.main_elf = NULL; + if (launch.interp_elf && launch.interp_elf_len > 0) + munmap(launch.interp_elf, launch.interp_elf_len); + launch.interp_elf = NULL; + kbox_loader_stack_image_reset(&launch.layout.stack); + + /* Unblock SIGSYS before transferring. We are inside the SIGSYS + * handler, which runs with SIGSYS blocked (SA_SIGINFO default). + * Since we jump to the new entry point instead of returning from + * the handler, the kernel never restores the pre-handler signal + * mask. The new binary needs SIGSYS unblocked so the BPF RET_TRAP + * filter can deliver it. + */ + { + uint64_t mask[2] = {0, 0}; + unsigned int signo = SIGSYS - 1; + mask[signo / 64] = 1ULL << (signo % 64); + kbox_syscall_trap_host_rt_sigprocmask_unblock(mask, + 8 /* kernel sigset_t */); } - /* fd is a real host FD (shadow or native): let the kernel handle it. */ - return kbox_dispatch_continue(); -} - -/* Identity dispatch helpers */ -/* */ -/* In host+root_identity mode, get* returns 0 and set* returns 0. */ -/* In host+override mode, get* returns the override value. */ -/* In host+neither mode, CONTINUE to host kernel. */ -/* In image mode, forward to LKL. */ - -/* Macro to reduce repetition in the identity dispatch. For a given identity - * syscall, check the mode and route accordingly. - * - * GET_ID: host+root -> override(0), host+!root+override -> - * override(uid/gid), host+!root+!override -> CONTINUE, - * image -> forward to LKL. 
- */ -#define DISPATCH_GET_UID(notif, ctx, override_val, lkl_func) \ - do { \ - if (ctx->host_root) { \ - if (ctx->root_identity) \ - return kbox_dispatch_value(0); \ - if (ctx->override_uid != (uid_t) - 1) \ - return kbox_dispatch_value((int64_t) (override_val)); \ - return kbox_dispatch_continue(); \ - } \ - return kbox_dispatch_from_lkl(lkl_func(ctx->sysnrs)); \ - } while (0) - -#define DISPATCH_GET_GID(notif, ctx, override_val, lkl_func) \ - do { \ - if (ctx->host_root) { \ - if (ctx->root_identity) \ - return kbox_dispatch_value(0); \ - if (ctx->override_gid != (gid_t) - 1) \ - return kbox_dispatch_value((int64_t) (override_val)); \ - return kbox_dispatch_continue(); \ - } \ - return kbox_dispatch_from_lkl(lkl_func(ctx->sysnrs)); \ - } while (0) - -#define DISPATCH_SET_ID(notif, ctx, lkl_forward) \ - do { \ - if (ctx->host_root) { \ - if (ctx->root_identity) \ - return kbox_dispatch_value(0); \ - return kbox_dispatch_continue(); \ - } \ - return lkl_forward(notif, ctx); \ - } while (0) + /* Transfer control to the new binary. This is noreturn. */ + kbox_loader_transfer_to_guest(&launch.transfer); -/* forward_execve. */ +fail_arena: + munmap(arena, arena_size); + close(exec_memfd); + return kbox_dispatch_errno(EFAULT); -/* AT_EMPTY_PATH flag for execveat: indicates fexecve() usage. Defined - * here to avoid pulling in the full linux/fcntl.h. - */ -#define KBOX_AT_EMPTY_PATH 0x1000 +fail_early: + munmap(arena, arena_size); + close(exec_memfd); + if (interp_memfd >= 0) + close(interp_memfd); + return kbox_dispatch_errno((int) (-ilen)); +} /* Handle execve/execveat from inside the image. * @@ -3897,27 +5760,28 @@ static struct kbox_dispatch forward_mmap(const struct kbox_seccomp_notif *notif, * the seccomp check), so the overwrite is race-free. 
*/ static struct kbox_dispatch forward_execve( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx, int is_execveat) { - pid_t pid = notif->pid; + pid_t pid = kbox_syscall_request_pid(req); /* Detect fexecve: execveat(fd, "", argv, envp, AT_EMPTY_PATH). This * is the initial exec from image.c on the host memfd. Let the kernel * handle it directly. */ if (is_execveat) { - long flags = to_c_long_arg(notif->data.args[4]); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 4)); if (flags & KBOX_AT_EMPTY_PATH) return kbox_dispatch_continue(); } /* Read pathname from tracee memory. */ - uint64_t path_addr = - is_execveat ? notif->data.args[1] : notif->data.args[0]; + uint64_t path_addr = is_execveat ? kbox_syscall_request_arg(req, 1) + : kbox_syscall_request_arg(req, 0); char pathbuf[KBOX_MAX_PATH]; - int rc = kbox_vm_read_string(pid, path_addr, pathbuf, sizeof(pathbuf)); + int rc = + guest_mem_read_string(ctx, pid, path_addr, pathbuf, sizeof(pathbuf)); if (rc < 0) return kbox_dispatch_errno(-rc); @@ -3945,20 +5809,35 @@ static struct kbox_dispatch forward_execve( if (exec_memfd < 0) return kbox_dispatch_errno(-exec_memfd); - /* Check for PT_INTERP (dynamic binary). Read the first 4 KB; enough - * for any reasonable ELF header plus the full program header table. + /* Trap mode: the SIGSYS handler and BPF filter do not survive a + * real exec, so perform a userspace exec instead. This replaces + * the process image in-place (unmap old, map new, jump to entry) + * without invoking the kernel's execve. On success the function + * does not return. */ + if (request_uses_trap_signals(req)) + return trap_userspace_exec(req, ctx, exec_memfd, pathbuf, is_execveat); + + /* Check for PT_INTERP (dynamic binary). 
*/ { - unsigned char elf_buf[4096]; - ssize_t nr_read = pread(exec_memfd, elf_buf, sizeof(elf_buf), 0); + unsigned char *elf_buf = NULL; + size_t elf_buf_len = 0; - if (nr_read > 0) { + if (kbox_read_elf_header_window_fd(exec_memfd, &elf_buf, + &elf_buf_len) == 0) { char interp_path[256]; uint64_t pt_offset, pt_filesz; int ilen = kbox_find_elf_interp_loc( - elf_buf, (size_t) nr_read, interp_path, sizeof(interp_path), + elf_buf, elf_buf_len, interp_path, sizeof(interp_path), &pt_offset, &pt_filesz); + munmap(elf_buf, elf_buf_len); + + if (ilen < 0) { + close(exec_memfd); + return kbox_dispatch_errno(ENOEXEC); + } + if (ilen > 0) { /* Dynamic binary. Extract the interpreter from LKL and * inject it into the tracee. @@ -3990,8 +5869,8 @@ static struct kbox_dispatch forward_execve( * open_exec() before begin_new_exec() closes CLOEXEC * descriptors. */ - int tracee_interp_fd = kbox_notify_addfd( - ctx->listener_fd, notif->id, interp_memfd, O_CLOEXEC); + int tracee_interp_fd = + request_addfd(ctx, req, interp_memfd, O_CLOEXEC); close(interp_memfd); if (tracee_interp_fd < 0) { @@ -4030,14 +5909,15 @@ static struct kbox_dispatch forward_execve( "-> /proc/self/fd/%d\n", pathbuf, interp_path, tracee_interp_fd); } + } else { + munmap(elf_buf, elf_buf_len); } } /* Inject the exec memfd into the tracee. O_CLOEXEC keeps the tracee's * FD table clean after exec succeeds. */ - int tracee_exec_fd = - kbox_notify_addfd(ctx->listener_fd, notif->id, exec_memfd, O_CLOEXEC); + int tracee_exec_fd = request_addfd(ctx, req, exec_memfd, O_CLOEXEC); close(exec_memfd); if (tracee_exec_fd < 0) @@ -4065,13 +5945,13 @@ static struct kbox_dispatch forward_execve( /* Check if argv[0] is aliased with the pathname. argv pointer is args[1] * for execve, args[2] for execveat. */ - uint64_t argv_addr = - is_execveat ? notif->data.args[2] : notif->data.args[1]; + uint64_t argv_addr = is_execveat ? 
kbox_syscall_request_arg(req, 2) + : kbox_syscall_request_arg(req, 1); uint64_t argv0_ptr = 0; int argv0_aliased = 0; if (argv_addr != 0) { - rc = kbox_vm_read(pid, argv_addr, &argv0_ptr, sizeof(argv0_ptr)); + rc = guest_mem_read(ctx, pid, argv_addr, &argv0_ptr, sizeof(argv0_ptr)); if (rc == 0 && argv0_ptr == path_addr) argv0_aliased = 1; } @@ -4093,9 +5973,9 @@ static struct kbox_dispatch forward_execve( if (argv0_aliased) memcpy(write_buf + new_path_len + 1, pathbuf, orig_len + 1); - rc = kbox_vm_write(pid, path_addr, write_buf, total_write); + rc = guest_mem_write(ctx, pid, path_addr, write_buf, total_write); if (rc < 0) { - rc = kbox_vm_write_force(pid, path_addr, write_buf, total_write); + rc = guest_mem_write_force(ctx, pid, path_addr, write_buf, total_write); if (rc < 0) { if (ctx->verbose) fprintf(stderr, @@ -4111,9 +5991,11 @@ static struct kbox_dispatch forward_execve( */ if (argv0_aliased) { uint64_t new_argv0 = path_addr + (uint64_t) (new_path_len + 1); - rc = kbox_vm_write(pid, argv_addr, &new_argv0, sizeof(new_argv0)); + rc = + guest_mem_write(ctx, pid, argv_addr, &new_argv0, sizeof(new_argv0)); if (rc < 0) - kbox_vm_write_force(pid, argv_addr, &new_argv0, sizeof(new_argv0)); + guest_mem_write_force(ctx, pid, argv_addr, &new_argv0, + sizeof(new_argv0)); } if (ctx->verbose) @@ -4140,6 +6022,16 @@ static struct kbox_dispatch forward_execve( */ kbox_fd_table_close_cloexec(ctx->fd_table, ctx->sysnrs); + /* Invalidate the cached /proc/pid/mem FD. After exec, the kernel + * may revoke access to the old FD even though the PID is the same + * (credential check against the new binary). Forcing a reopen on + * the next write ensures we have valid access. + */ + if (ctx->proc_mem_fd >= 0) { + close(ctx->proc_mem_fd); + ctx->proc_mem_fd = -1; + } + return kbox_dispatch_continue(); } @@ -4148,21 +6040,84 @@ static struct kbox_dispatch forward_execve( /* CLONE_NEW* flags that clone3 can smuggle in via clone_args.flags. 
The BPF * deny-list blocks unshare/setns, but clone3 bypasses it unless we check here. */ +#ifndef CLONE_NEWNS #define CLONE_NEWNS 0x00020000ULL +#endif +#ifndef CLONE_NEWTIME #define CLONE_NEWTIME 0x00000080ULL +#endif +#ifndef CLONE_NEWCGROUP #define CLONE_NEWCGROUP 0x02000000ULL +#endif +#ifndef CLONE_NEWUTS #define CLONE_NEWUTS 0x04000000ULL +#endif +#ifndef CLONE_NEWIPC #define CLONE_NEWIPC 0x08000000ULL +#endif +#ifndef CLONE_NEWUSER #define CLONE_NEWUSER 0x10000000ULL +#endif +#ifndef CLONE_NEWPID #define CLONE_NEWPID 0x20000000ULL +#endif +#ifndef CLONE_NEWNET #define CLONE_NEWNET 0x40000000ULL +#endif +#ifndef CLONE_THREAD +#define CLONE_THREAD 0x00010000ULL +#endif #define CLONE_NEW_MASK \ (CLONE_NEWNS | CLONE_NEWTIME | CLONE_NEWCGROUP | CLONE_NEWUTS | \ CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET) +/* W^X enforcement for mprotect in trap/rewrite mode. + * + * Reject simultaneous PROT_WRITE|PROT_EXEC to prevent JIT spray attacks. + * On none->X transitions, scan the page for syscall/sysenter/SVC instructions + * and add them to the origin map for rewrite-mode caller validation. + * + * In seccomp mode, this is a no-op: CONTINUE lets the host kernel handle it. + */ +static struct kbox_dispatch forward_mprotect( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + uint64_t addr = kbox_syscall_request_arg(req, 0); + uint64_t len = kbox_syscall_request_arg(req, 1); + int prot = (int) kbox_syscall_request_arg(req, 2); + + /* In seccomp mode (supervisor), just pass through. */ + if (!request_uses_trap_signals(req)) + return kbox_dispatch_continue(); + + /* W^X enforcement: reject PROT_WRITE | PROT_EXEC. 
*/ + if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC)) { + if (ctx->verbose) + fprintf(stderr, + "kbox: mprotect denied: W^X violation at 0x%llx len=%llu " + "(pid=%u)\n", + (unsigned long long) addr, (unsigned long long) len, + kbox_syscall_request_pid(req)); + return kbox_dispatch_errno(EACCES); + } + + /* Allow the mprotect to proceed via host kernel. If the page transitions + * to PROT_EXEC, JIT code on it will take the Tier 1 (RET_TRAP) slow path + * because it won't be in the BPF allow ranges. This is safe: un-rewritten + * syscall instructions in JIT pages are caught by the SIGSYS handler. + * + * Full scan-on-X (rewriting JIT pages at mprotect time) is a future + * optimization: it would promote JIT pages from Tier 1 (~3us) to Tier 2 + * (~41ns) but requires synchronous instruction scanning while the page + * is still writable, which adds latency to the mprotect call. + */ + return kbox_dispatch_continue(); +} + static struct kbox_dispatch forward_clone3( - const struct kbox_seccomp_notif *notif, + const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { uint64_t flags; @@ -4171,7 +6126,9 @@ static struct kbox_dispatch forward_clone3( /* clone3(struct clone_args *args, size_t size). flags is the first uint64_t * field in clone_args. We only need to read the first 8 bytes. */ - rc = kbox_vm_read(notif->pid, notif->data.args[0], &flags, sizeof(flags)); + rc = + guest_mem_read(ctx, kbox_syscall_request_pid(req), + kbox_syscall_request_arg(req, 0), &flags, sizeof(flags)); if (rc < 0) { /* Can't read tracee memory; fail closed with EPERM. 
* @@ -4185,7 +6142,7 @@ static struct kbox_dispatch forward_clone3( fprintf(stderr, "kbox: clone3 denied: cannot read clone_args " "(pid=%u, rc=%d)\n", - notif->pid, rc); + kbox_syscall_request_pid(req), rc); return kbox_dispatch_errno(EPERM); } @@ -4194,7 +6151,20 @@ static struct kbox_dispatch forward_clone3( fprintf(stderr, "kbox: clone3 denied: namespace flags 0x%llx " "(pid=%u)\n", - (unsigned long long) (flags & CLONE_NEW_MASK), notif->pid); + (unsigned long long) (flags & CLONE_NEW_MASK), + kbox_syscall_request_pid(req)); + return kbox_dispatch_errno(EPERM); + } + + /* In trap/rewrite mode, block thread creation (CLONE_THREAD). + * Multi-threaded guests require --syscall-mode=seccomp. + */ + if ((flags & CLONE_THREAD) && request_uses_trap_signals(req)) { + if (ctx->verbose) + fprintf(stderr, + "kbox: clone3 denied: CLONE_THREAD in trap/rewrite mode " + "(pid=%u, use --syscall-mode=seccomp)\n", + kbox_syscall_request_pid(req)); return kbox_dispatch_errno(EPERM); } @@ -4203,126 +6173,130 @@ static struct kbox_dispatch forward_clone3( /* Main dispatch function. */ -struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, - const void *notif_ptr) +struct kbox_dispatch kbox_dispatch_request( + struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req) { - const struct kbox_seccomp_notif *notif = notif_ptr; const struct kbox_host_nrs *h = ctx->host_nrs; - int nr = notif->data.nr; + int nr; + + if (!ctx || !req) + return kbox_dispatch_errno(EINVAL); + + kbox_dispatch_prepare_request_ctx(ctx, req); + nr = req->nr; if (ctx->verbose) { const char *name = syscall_name_from_nr(h, nr); - fprintf(stderr, "seccomp notify: pid=%u nr=%d (%s)\n", notif->pid, nr, - name ? name : "unknown"); + fprintf(stderr, "%s syscall: pid=%u nr=%d (%s)\n", + req->source == KBOX_SYSCALL_SOURCE_SECCOMP ? "seccomp notify" + : "in-process", + kbox_syscall_request_pid(req), nr, name ? name : "unknown"); } /* Legacy x86_64 syscalls. 
*/ if (nr == h->stat) - return forward_stat_legacy(notif, ctx, 0); + return forward_stat_legacy(req, ctx, 0); if (nr == h->lstat) - return forward_stat_legacy(notif, ctx, 1); + return forward_stat_legacy(req, ctx, 1); if (nr == h->access) - return forward_access_legacy(notif, ctx); + return forward_access_legacy(req, ctx); if (nr == h->mkdir) - return forward_mkdir_legacy(notif, ctx); + return forward_mkdir_legacy(req, ctx); if (nr == h->rmdir) - return forward_rmdir_legacy(notif, ctx); + return forward_rmdir_legacy(req, ctx); if (nr == h->unlink) - return forward_unlink_legacy(notif, ctx); + return forward_unlink_legacy(req, ctx); if (nr == h->rename) - return forward_rename_legacy(notif, ctx); + return forward_rename_legacy(req, ctx); if (nr == h->chmod) - return forward_chmod_legacy(notif, ctx); + return forward_chmod_legacy(req, ctx); if (nr == h->chown) - return forward_chown_legacy(notif, ctx); + return forward_chown_legacy(req, ctx); if (nr == h->open) - return forward_open_legacy(notif, ctx); + return forward_open_legacy(req, ctx); /* File open/create. */ if (nr == h->openat) - return forward_openat(notif, ctx); + return forward_openat(req, ctx); if (nr == h->openat2) - return forward_openat2(notif, ctx); + return forward_openat2(req, ctx); /* Metadata. */ if (nr == h->fstat) - return forward_fstat(notif, ctx); + return forward_fstat(req, ctx); if (nr == h->newfstatat) - return forward_newfstatat(notif, ctx); + return forward_newfstatat(req, ctx); if (nr == h->statx) - return forward_statx(notif, ctx); + return forward_statx(req, ctx); if (nr == h->faccessat && h->faccessat > 0) - return forward_faccessat(notif, ctx); + return forward_faccessat(req, ctx); if (nr == h->faccessat2) - return forward_faccessat2(notif, ctx); + return forward_faccessat2(req, ctx); /* Directories. 
*/ if (nr == h->getdents64) - return forward_getdents64(notif, ctx); + return forward_getdents64(req, ctx); if (nr == h->getdents) - return forward_getdents(notif, ctx); + return forward_getdents(req, ctx); if (nr == h->mkdirat) - return forward_mkdirat(notif, ctx); + return forward_mkdirat(req, ctx); if (nr == h->unlinkat) - return forward_unlinkat(notif, ctx); + return forward_unlinkat(req, ctx); if (nr == h->renameat && h->renameat > 0) - return forward_renameat(notif, ctx); + return forward_renameat(req, ctx); if (nr == h->renameat2) - return forward_renameat2(notif, ctx); + return forward_renameat2(req, ctx); if (nr == h->fchmodat) - return forward_fchmodat(notif, ctx); + return forward_fchmodat(req, ctx); if (nr == h->fchownat) - return forward_fchownat(notif, ctx); + return forward_fchownat(req, ctx); /* Navigation. */ if (nr == h->chdir) - return forward_chdir(notif, ctx); + return forward_chdir(req, ctx); if (nr == h->fchdir) - return forward_fchdir(notif, ctx); + return forward_fchdir(req, ctx); if (nr == h->getcwd) - return forward_getcwd(notif, ctx); + return forward_getcwd(req, ctx); /* Identity: UID. */ - if (nr == h->getuid) { - DISPATCH_GET_UID(notif, ctx, ctx->override_uid, kbox_lkl_getuid); - } - if (nr == h->geteuid) { - DISPATCH_GET_UID(notif, ctx, ctx->override_uid, kbox_lkl_geteuid); - } + if (nr == h->getuid) + return dispatch_get_uid(kbox_lkl_getuid, ctx); + if (nr == h->geteuid) + return dispatch_get_uid(kbox_lkl_geteuid, ctx); if (nr == h->getresuid) { if (ctx->host_root) { if (ctx->root_identity) - return forward_getresuid_override(notif, 0); + return forward_getresuid_override(req, ctx, 0); if (ctx->override_uid != (uid_t) -1) - return forward_getresuid_override(notif, ctx->override_uid); + return forward_getresuid_override(req, ctx, ctx->override_uid); return kbox_dispatch_continue(); } - return forward_getresuid(notif, ctx); + return forward_getresuid(req, ctx); } /* Identity: GID. 
*/ - if (nr == h->getgid) { - DISPATCH_GET_GID(notif, ctx, ctx->override_gid, kbox_lkl_getgid); - } - if (nr == h->getegid) { - DISPATCH_GET_GID(notif, ctx, ctx->override_gid, kbox_lkl_getegid); - } + if (nr == h->getgid) + return dispatch_get_gid(kbox_lkl_getgid, ctx); + if (nr == h->getegid) + return dispatch_get_gid(kbox_lkl_getegid, ctx); if (nr == h->getresgid) { if (ctx->host_root) { if (ctx->root_identity) - return forward_getresgid_override(notif, 0); + return forward_getresgid_override(req, ctx, 0); if (ctx->override_gid != (gid_t) -1) - return forward_getresgid_override(notif, ctx->override_gid); + return forward_getresgid_override(req, ctx, ctx->override_gid); return kbox_dispatch_continue(); } - return forward_getresgid(notif, ctx); + return forward_getresgid(req, ctx); } /* Identity: groups. */ @@ -4330,127 +6304,127 @@ struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, if (nr == h->getgroups) { if (ctx->host_root) { if (ctx->root_identity) - return forward_getgroups_override(notif, 0); + return forward_getgroups_override(req, ctx, 0); if (ctx->override_gid != (gid_t) -1) - return forward_getgroups_override(notif, ctx->override_gid); + return forward_getgroups_override(req, ctx, ctx->override_gid); return kbox_dispatch_continue(); } - return forward_getgroups(notif, ctx); + return forward_getgroups(req, ctx); } /* Identity: set*. 
*/ if (nr == h->setuid) - DISPATCH_SET_ID(notif, ctx, forward_setuid); + return dispatch_set_id(req, ctx, forward_setuid); if (nr == h->setreuid) - DISPATCH_SET_ID(notif, ctx, forward_setreuid); + return dispatch_set_id(req, ctx, forward_setreuid); if (nr == h->setresuid) - DISPATCH_SET_ID(notif, ctx, forward_setresuid); + return dispatch_set_id(req, ctx, forward_setresuid); if (nr == h->setgid) - DISPATCH_SET_ID(notif, ctx, forward_setgid); + return dispatch_set_id(req, ctx, forward_setgid); if (nr == h->setregid) - DISPATCH_SET_ID(notif, ctx, forward_setregid); + return dispatch_set_id(req, ctx, forward_setregid); if (nr == h->setresgid) - DISPATCH_SET_ID(notif, ctx, forward_setresgid); + return dispatch_set_id(req, ctx, forward_setresgid); if (nr == h->setgroups) - DISPATCH_SET_ID(notif, ctx, forward_setgroups); + return dispatch_set_id(req, ctx, forward_setgroups); if (nr == h->setfsgid) - DISPATCH_SET_ID(notif, ctx, forward_setfsgid); + return dispatch_set_id(req, ctx, forward_setfsgid); /* Mount. */ if (nr == h->mount) - return forward_mount(notif, ctx); + return forward_mount(req, ctx); if (nr == h->umount2) - return forward_umount2(notif, ctx); + return forward_umount2(req, ctx); /* FD operations. */ if (nr == h->close) - return forward_close(notif, ctx); + return forward_close(req, ctx); if (nr == h->fcntl) - return forward_fcntl(notif, ctx); + return forward_fcntl(req, ctx); if (nr == h->dup) - return forward_dup(notif, ctx); + return forward_dup(req, ctx); if (nr == h->dup2) - return forward_dup2(notif, ctx); + return forward_dup2(req, ctx); if (nr == h->dup3) - return forward_dup3(notif, ctx); + return forward_dup3(req, ctx); /* I/O. 
*/ if (nr == h->read) - return forward_read_like(notif, ctx, 0); + return forward_read_like(req, ctx, 0); if (nr == h->pread64) - return forward_read_like(notif, ctx, 1); + return forward_read_like(req, ctx, 1); if (nr == h->write) - return forward_write(notif, ctx); + return forward_write(req, ctx); if (nr == h->lseek) - return forward_lseek(notif, ctx); + return forward_lseek(req, ctx); /* Networking. */ if (nr == h->socket) - return forward_socket(notif, ctx); + return forward_socket(req, ctx); if (nr == h->bind) - return forward_bind(notif, ctx); + return forward_bind(req, ctx); if (nr == h->connect) - return forward_connect(notif, ctx); + return forward_connect(req, ctx); if (nr == h->sendto) - return forward_sendto(notif, ctx); + return forward_sendto(req, ctx); if (nr == h->recvfrom) - return forward_recvfrom(notif, ctx); + return forward_recvfrom(req, ctx); /* sendmsg: BPF allow-listed (SCM_RIGHTS), never reaches here. * Shadow socket callers should use sendto for addressed datagrams. */ if (nr == h->recvmsg) - return forward_recvmsg(notif, ctx); + return forward_recvmsg(req, ctx); if (nr == h->getsockopt) - return forward_getsockopt(notif, ctx); + return forward_getsockopt(req, ctx); if (nr == h->setsockopt) - return forward_setsockopt(notif, ctx); + return forward_setsockopt(req, ctx); if (nr == h->getsockname) - return forward_getsockname(notif, ctx); + return forward_getsockname(req, ctx); if (nr == h->getpeername) - return forward_getpeername(notif, ctx); + return forward_getpeername(req, ctx); if (nr == h->shutdown) - return forward_shutdown(notif, ctx); + return forward_shutdown(req, ctx); /* I/O extended. 
*/ if (nr == h->pwrite64) - return forward_pwrite64(notif, ctx); + return forward_pwrite64(req, ctx); if (nr == h->writev) - return forward_writev(notif, ctx); + return forward_writev(req, ctx); if (nr == h->readv) - return forward_readv(notif, ctx); + return forward_readv(req, ctx); if (nr == h->ftruncate) - return forward_ftruncate(notif, ctx); + return forward_ftruncate(req, ctx); if (nr == h->fallocate) - return forward_fallocate(notif, ctx); + return forward_fallocate(req, ctx); if (nr == h->flock) - return forward_flock(notif, ctx); + return forward_flock(req, ctx); if (nr == h->fsync) - return forward_fsync(notif, ctx); + return forward_fsync(req, ctx); if (nr == h->fdatasync) - return forward_fdatasync(notif, ctx); + return forward_fdatasync(req, ctx); if (nr == h->sync) - return forward_sync(notif, ctx); + return forward_sync(req, ctx); if (nr == h->ioctl) - return forward_ioctl(notif, ctx); + return forward_ioctl(req, ctx); /* File operations. */ if (nr == h->readlinkat) - return forward_readlinkat(notif, ctx); + return forward_readlinkat(req, ctx); if (nr == h->pipe2) - return forward_pipe2(notif, ctx); + return forward_pipe2(req, ctx); if (nr == h->pipe) { /* Legacy pipe(2) has only one arg: pipefd. Create host pipe and inject * via ADDFD, same as the pipe2 path. 
*/ - pid_t ppid = notif->pid; - uint64_t remote_pfd = notif->data.args[0]; + pid_t ppid = kbox_syscall_request_pid(req); + uint64_t remote_pfd = kbox_syscall_request_arg(req, 0); if (remote_pfd == 0) return kbox_dispatch_errno(EFAULT); @@ -4458,15 +6432,13 @@ struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, if (pipe(host_pfds) < 0) return kbox_dispatch_errno(errno); - int tfd0 = - kbox_notify_addfd(ctx->listener_fd, notif->id, host_pfds[0], 0); + int tfd0 = request_addfd(ctx, req, host_pfds[0], 0); if (tfd0 < 0) { close(host_pfds[0]); close(host_pfds[1]); return kbox_dispatch_errno(-tfd0); } - int tfd1 = - kbox_notify_addfd(ctx->listener_fd, notif->id, host_pfds[1], 0); + int tfd1 = request_addfd(ctx, req, host_pfds[1], 0); if (tfd1 < 0) { close(host_pfds[0]); close(host_pfds[1]); @@ -4476,19 +6448,19 @@ struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, close(host_pfds[1]); int gfds[2] = {tfd0, tfd1}; - int pwrc = kbox_vm_write(ppid, remote_pfd, gfds, sizeof(gfds)); + int pwrc = guest_mem_write(ctx, ppid, remote_pfd, gfds, sizeof(gfds)); if (pwrc < 0) return kbox_dispatch_errno(-pwrc); return kbox_dispatch_value(0); } if (nr == h->symlinkat) - return forward_symlinkat(notif, ctx); + return forward_symlinkat(req, ctx); if (nr == h->linkat) - return forward_linkat(notif, ctx); + return forward_linkat(req, ctx); if (nr == h->utimensat) - return forward_utimensat(notif, ctx); + return forward_utimensat(req, ctx); if (nr == h->sendfile) - return forward_sendfile(notif, ctx); + return forward_sendfile(req, ctx); if (nr == h->copy_file_range) return kbox_dispatch_errno(ENOSYS); @@ -4512,42 +6484,79 @@ struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, /* Time. 
*/ if (nr == h->clock_gettime) - return forward_clock_gettime(notif, ctx); + return forward_clock_gettime(req, ctx); if (nr == h->clock_getres) - return forward_clock_getres(notif, ctx); + return forward_clock_getres(req, ctx); if (nr == h->gettimeofday) - return forward_gettimeofday(notif, ctx); + return forward_gettimeofday(req, ctx); /* Process lifecycle. */ if (nr == h->umask) - return forward_umask(notif, ctx); + return forward_umask(req, ctx); if (nr == h->uname) - return forward_uname(notif, ctx); + return forward_uname(req, ctx); if (nr == h->brk) return kbox_dispatch_continue(); if (nr == h->getrandom) - return forward_getrandom(notif, ctx); + return forward_getrandom(req, ctx); if (nr == h->syslog) - return forward_syslog(notif, ctx); + return forward_syslog(req, ctx); if (nr == h->prctl) - return forward_prctl(notif, ctx); + return forward_prctl(req, ctx); if (nr == h->wait4) return kbox_dispatch_continue(); if (nr == h->waitid) return kbox_dispatch_continue(); + if (nr == h->exit) + return kbox_dispatch_continue(); + if (nr == h->exit_group) + return kbox_dispatch_continue(); /* Signals (CONTINUE). */ /* Signal disposition and masking are per-process host kernel state. 
*/ - if (nr == h->rt_sigaction) + if (nr == h->rt_sigaction) { + if (request_uses_trap_signals(req) && + kbox_syscall_trap_signal_is_reserved( + (int) to_c_long_arg(kbox_syscall_request_arg(req, 0)))) { + if (ctx->verbose) { + fprintf(stderr, + "kbox: reserved SIGSYS handler change denied " + "(pid=%u source=%d)\n", + kbox_syscall_request_pid(req), req->source); + } + return kbox_dispatch_errno(EPERM); + } return kbox_dispatch_continue(); /* signal handler registration */ - if (nr == h->rt_sigprocmask) + } + if (nr == h->rt_sigprocmask) { + if (request_uses_trap_signals(req)) { + long how = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + int blocks_reserved = request_blocks_reserved_sigsys(req, ctx); + + if (blocks_reserved < 0) + return kbox_dispatch_errno(-blocks_reserved); + if (how != SIG_UNBLOCK && blocks_reserved) { + if (ctx->verbose) { + fprintf(stderr, + "kbox: reserved SIGSYS mask change denied " + "(pid=%u source=%d how=%ld)\n", + kbox_syscall_request_pid(req), req->source, how); + } + return kbox_dispatch_errno(EPERM); + } + return emulate_trap_rt_sigprocmask(req, ctx); + } return kbox_dispatch_continue(); /* signal mask manipulation */ + } if (nr == h->rt_sigreturn) return kbox_dispatch_continue(); /* return from signal handler */ - if (nr == h->rt_sigpending) + if (nr == h->rt_sigpending) { + if (request_uses_trap_signals(req)) + return emulate_trap_rt_sigpending(req, ctx); return kbox_dispatch_continue(); /* pending signal query */ + } if (nr == h->rt_sigaltstack) return kbox_dispatch_continue(); /* alternate signal stack */ if (nr == h->setitimer) @@ -4569,20 +6578,20 @@ struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, * tracee's actual host PID from the seccomp notification). 
*/ #define IS_GUEST_PID(p) \ - ((p) == ctx->child_pid || (p) == (pid_t) notif->pid || (p) == 1) + ((p) == ctx->child_pid || (p) == kbox_syscall_request_pid(req) || (p) == 1) if (nr == h->kill) { - pid_t target = (pid_t) notif->data.args[0]; - int sig = (int) notif->data.args[1]; + pid_t target = (pid_t) kbox_syscall_request_arg(req, 0); + int sig = (int) kbox_syscall_request_arg(req, 1); if (!IS_GUEST_PID(target) && target != 0) { if (ctx->verbose) fprintf(stderr, "kbox: kill(%d) denied: not guest PID\n", target); return kbox_dispatch_errno(EPERM); } - /* Emulate kill() from the supervisor. Virtual PID 1 and process-group - * 0 must target the real child PID, not the host's PID 1 or the - * supervisor's process group. + /* Translate virtual PID to real PID. In both seccomp and trap + * mode, the guest sees itself as PID 1. Route kill(1, sig) and + * kill(0, sig) to the real child PID. */ { pid_t real_target = ctx->child_pid; @@ -4591,38 +6600,42 @@ struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, long ret = syscall(SYS_kill, real_target, sig); if (ret < 0) return kbox_dispatch_errno(errno); + if (request_uses_trap_signals(req) && + real_target == ctx->child_pid && + trap_sigmask_contains_signal(sig)) + (void) kbox_syscall_trap_add_pending_signal(sig); return kbox_dispatch_value(0); } } if (nr == h->tgkill) { - pid_t tgid = (pid_t) notif->data.args[0]; - pid_t tid = (pid_t) notif->data.args[1]; - int sig = (int) notif->data.args[2]; + pid_t tgid = (pid_t) kbox_syscall_request_arg(req, 0); + pid_t tid = (pid_t) kbox_syscall_request_arg(req, 1); + int sig = (int) kbox_syscall_request_arg(req, 2); if (!IS_GUEST_PID(tgid)) { if (ctx->verbose) fprintf(stderr, "kbox: tgkill(%d) denied: not guest PID\n", tgid); return kbox_dispatch_errno(EPERM); } - - /* The guest passes its virtual PID (1) but the host kernel needs the - * real PID. 
We can't modify syscall args via seccomp-unotify, so - * emulate: call tgkill with the real PID from the supervisor. The - * tracee's tid is its real host tid (gettid returns virtual 1, but the - * seccomp notification contains the real tid in notif->pid). + /* Translate virtual PID/TID to real. Both seccomp and trap modes + * must emulate tgkill because the guest uses virtual PID 1. */ { pid_t real_tgid = ctx->child_pid; - pid_t real_tid = (tid == 1) ? (pid_t) notif->pid : tid; + pid_t real_tid = (tid == 1) ? kbox_syscall_request_pid(req) : tid; long ret = syscall(SYS_tgkill, real_tgid, real_tid, sig); if (ret < 0) return kbox_dispatch_errno(errno); + if (request_uses_trap_signals(req) && real_tgid == ctx->child_pid && + real_tid == kbox_syscall_request_pid(req) && + trap_sigmask_contains_signal(sig)) + (void) kbox_syscall_trap_add_pending_signal(sig); return kbox_dispatch_value(0); } } if (nr == h->tkill) { - pid_t target = (pid_t) notif->data.args[0]; - int sig = (int) notif->data.args[1]; + pid_t target = (pid_t) kbox_syscall_request_arg(req, 0); + int sig = (int) kbox_syscall_request_arg(req, 1); if (!IS_GUEST_PID(target)) { if (ctx->verbose) fprintf(stderr, "kbox: tkill(%d) denied: not guest PID\n", @@ -4630,10 +6643,15 @@ struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, return kbox_dispatch_errno(EPERM); } { - pid_t real_tid = (target == 1) ? (pid_t) notif->pid : target; + pid_t real_tid = + (target == 1) ? 
kbox_syscall_request_pid(req) : target; long ret = syscall(SYS_tkill, real_tid, sig); if (ret < 0) return kbox_dispatch_errno(errno); + if (request_uses_trap_signals(req) && + real_tid == kbox_syscall_request_pid(req) && + trap_sigmask_contains_signal(sig)) + (void) kbox_syscall_trap_add_pending_signal(sig); return kbox_dispatch_value(0); } } @@ -4653,21 +6671,60 @@ struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, if (nr == h->futex) return kbox_dispatch_continue(); /* fast userspace mutex */ if (nr == h->clone3) - return forward_clone3(notif, ctx); /* sanitize namespace flags */ - if (nr == h->arch_prctl) - return kbox_dispatch_continue(); /* x86_64 FS/GS base */ + return forward_clone3(req, ctx); /* sanitize namespace flags */ + if (nr == h->arch_prctl) { + /* In trap/rewrite mode, arch_prctl(SET_FS) must be intercepted + * to avoid overwriting kbox's TLS. The SIGSYS handler swaps + * FS on entry/exit; SET_FS updates the guest's saved FS base + * so it takes effect when the handler returns. GET_FS returns + * the guest's saved FS base. In seccomp mode, CONTINUE is fine + * because the supervisor runs in a separate process. 
+ */ + if (request_uses_trap_signals(req)) { + long subcmd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + if (subcmd == 0x1002 /* ARCH_SET_FS */) { + kbox_syscall_trap_set_guest_fs( + kbox_syscall_request_arg(req, 1)); + return kbox_dispatch_value(0); + } + if (subcmd == 0x1003 /* ARCH_GET_FS */) { + uint64_t out_ptr = kbox_syscall_request_arg(req, 1); + uint64_t fs = kbox_syscall_trap_get_guest_fs(); + if (out_ptr == 0) + return kbox_dispatch_errno(EFAULT); + int wrc = guest_mem_write(ctx, kbox_syscall_request_pid(req), + out_ptr, &fs, sizeof(fs)); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + return kbox_dispatch_value(0); + } + } + return kbox_dispatch_continue(); /* GS or seccomp mode */ + } if (nr == h->rseq) return kbox_dispatch_continue(); /* restartable sequences */ if (nr == h->clone) { /* Legacy clone: flags are in args[0] directly (not a struct). */ - uint64_t cflags = notif->data.args[0]; + uint64_t cflags = kbox_syscall_request_arg(req, 0); if (cflags & CLONE_NEW_MASK) { if (ctx->verbose) fprintf(stderr, "kbox: clone denied: namespace flags 0x%llx " "(pid=%u)\n", (unsigned long long) (cflags & CLONE_NEW_MASK), - notif->pid); + kbox_syscall_request_pid(req)); + return kbox_dispatch_errno(EPERM); + } + /* In trap/rewrite mode, block thread creation (CLONE_THREAD). + * The SIGSYS handler and shared LKL state are not thread-safe; + * multi-threaded guests must use --syscall-mode=seccomp. + */ + if ((cflags & CLONE_THREAD) && request_uses_trap_signals(req)) { + if (ctx->verbose) + fprintf(stderr, + "kbox: clone denied: CLONE_THREAD in trap/rewrite mode " + "(pid=%u, use --syscall-mode=seccomp)\n", + kbox_syscall_request_pid(req)); return kbox_dispatch_errno(EPERM); } return kbox_dispatch_continue(); @@ -4679,14 +6736,22 @@ struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, /* Memory mapping. 
*/ - if (nr == h->mmap) - return forward_mmap(notif, ctx); - if (nr == h->munmap) + if (nr == h->mmap) { + invalidate_translated_path_cache(ctx); + return forward_mmap(req, ctx); + } + if (nr == h->munmap) { + invalidate_translated_path_cache(ctx); return kbox_dispatch_continue(); /* unmap pages */ - if (nr == h->mprotect) - return kbox_dispatch_continue(); /* change page protections */ - if (nr == h->mremap) + } + if (nr == h->mprotect) { + invalidate_translated_path_cache(ctx); + return forward_mprotect(req, ctx); /* W^X enforcement + CONTINUE */ + } + if (nr == h->mremap) { + invalidate_translated_path_cache(ctx); return kbox_dispatch_continue(); /* remap pages */ + } if (nr == h->membarrier) return kbox_dispatch_continue(); /* memory barrier (musl threads) */ @@ -4719,11 +6784,11 @@ struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, * guest from escaping resource limits. */ if (nr == h->prlimit64) { - uint64_t new_limit_ptr = notif->data.args[2]; + uint64_t new_limit_ptr = kbox_syscall_request_arg(req, 2); if (new_limit_ptr == 0) return kbox_dispatch_continue(); /* GET only */ /* SET operation: check which resource. */ - int resource = (int) notif->data.args[1]; + int resource = (int) kbox_syscall_request_arg(req, 1); /* Allow safe resources: RLIMIT_CORE(4), RLIMIT_AS(9), etc. 
*/ if (resource == 4 /* RLIMIT_CORE */ || resource == 9 /* RLIMIT_AS */) return kbox_dispatch_continue(); @@ -4789,11 +6854,12 @@ struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, */ if (nr == h->readlink) { char path[4096]; - int ret = kbox_vm_read_string(notif->pid, notif->data.args[0], path, - sizeof(path)); + int ret = guest_mem_read_string(ctx, kbox_syscall_request_pid(req), + kbox_syscall_request_arg(req, 0), path, + sizeof(path)); if (ret < 0) return kbox_dispatch_errno(-ret); - long bufsiz = (long) notif->data.args[2]; + long bufsiz = (long) kbox_syscall_request_arg(req, 2); char buf[4096]; if (bufsiz > (long) sizeof(buf)) bufsiz = (long) sizeof(buf); @@ -4801,8 +6867,9 @@ struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, kbox_lkl_readlinkat(ctx->sysnrs, AT_FDCWD_LINUX, path, buf, bufsiz); if (lret < 0) return kbox_dispatch_from_lkl(lret); - ret = - kbox_vm_write(notif->pid, notif->data.args[1], buf, (size_t) lret); + ret = guest_mem_write(ctx, kbox_syscall_request_pid(req), + kbox_syscall_request_arg(req, 1), buf, + (size_t) lret); if (ret < 0) return kbox_dispatch_errno(-ret); return kbox_dispatch_value(lret); @@ -4811,13 +6878,23 @@ struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, /* Exec (in-image binary extraction + pathname rewrite). */ if (nr == h->execve) - return forward_execve(notif, ctx, 0); + return forward_execve(req, ctx, 0); if (nr == h->execveat) - return forward_execve(notif, ctx, 1); + return forward_execve(req, ctx, 1); /* Default: deny unknown syscalls. 
*/ if (ctx->verbose) fprintf(stderr, "kbox: DENY unknown syscall nr=%d (pid=%u)\n", nr, - notif->pid); + kbox_syscall_request_pid(req)); return kbox_dispatch_errno(ENOSYS); } + +struct kbox_dispatch kbox_dispatch_syscall(struct kbox_supervisor_ctx *ctx, + const void *notif_ptr) +{ + struct kbox_syscall_request req; + + if (kbox_syscall_request_from_notif(notif_ptr, &req) < 0) + return kbox_dispatch_errno(EINVAL); + return kbox_dispatch_request(ctx, &req); +} diff --git a/src/seccomp-supervisor.c b/src/seccomp-supervisor.c index 2dde262..87365ae 100644 --- a/src/seccomp-supervisor.c +++ b/src/seccomp-supervisor.c @@ -229,27 +229,30 @@ static int supervise_loop(struct kbox_supervisor_ctx *ctx) { struct kbox_seccomp_notif notif; struct kbox_seccomp_notif_resp resp; + struct kbox_syscall_request req; struct kbox_dispatch d; struct pollfd pfd; int exit_code; int ret; + int poll_timeout; - for (;;) { - /* 1. Check if child already exited. */ - ret = check_child(ctx->child_pid, &exit_code); - if (ret < 0) { - fprintf(stderr, "waitpid: %s\n", strerror(errno)); - return -1; - } - if (ret == 1) - return exit_code; + poll_timeout = -1; +#ifdef KBOX_HAS_WEB + if (ctx->web) + poll_timeout = 100; +#endif - /* 2. Poll for a seccomp notification. */ + for (;;) { + /* Poll for the next seccomp notification. In the normal non-web + * steady state we block here instead of doing a non-blocking + * waitpid() on every iteration; that extra wait syscall shows up + * directly in the seccomp fast path on syscall-heavy workloads. + */ pfd.fd = ctx->listener_fd; pfd.events = POLLIN; pfd.revents = 0; - ret = poll(&pfd, 1, 100); + ret = poll(&pfd, 1, poll_timeout); if (ret < 0) { if (errno == EINTR) continue; @@ -268,15 +271,19 @@ static int supervise_loop(struct kbox_supervisor_ctx *ctx) continue; } - /* 3. POLLHUP / POLLERR => recheck child. */ + /* POLLHUP / POLLERR => recheck child. 
*/ if (pfd.revents & (POLLHUP | POLLERR | POLLNVAL)) { ret = check_child(ctx->child_pid, &exit_code); + if (ret < 0) { + fprintf(stderr, "waitpid: %s\n", strerror(errno)); + return -1; + } if (ret == 1) return exit_code; continue; } - /* 4. Receive notification. */ + /* Receive notification. */ ret = kbox_notify_recv(ctx->listener_fd, ¬if); if (ret < 0) { int e = -ret; @@ -285,6 +292,15 @@ static int supervise_loop(struct kbox_supervisor_ctx *ctx) if (e == ENOENT && ctx->web) kbox_web_counters(ctx->web)->recv_enoent++; #endif + if (e == ENOENT) { + ret = check_child(ctx->child_pid, &exit_code); + if (ret < 0) { + fprintf(stderr, "waitpid: %s\n", strerror(errno)); + return -1; + } + if (ret == 1) + return exit_code; + } continue; } fprintf(stderr, "kbox_notify_recv: %s\n", strerror(e)); @@ -297,7 +313,11 @@ static int supervise_loop(struct kbox_supervisor_ctx *ctx) if (ctx->web) t_dispatch_start = kbox_clock_ns(); #endif - d = kbox_dispatch_syscall(ctx, ¬if); + if (kbox_syscall_request_from_notif(¬if, &req) < 0) { + fprintf(stderr, "kbox: failed to decode seccomp notification\n"); + return -1; + } + d = kbox_dispatch_request(ctx, &req); /* 6. Build and send response. 
*/ build_response(&resp, notif.id, &d); @@ -316,12 +336,10 @@ static int supervise_loop(struct kbox_supervisor_ctx *ctx) else disp = KBOX_DISP_RETURN; - const char *sname = - syscall_name_from_nr(ctx->host_nrs, notif.data.nr); + const char *sname = syscall_name_from_nr(ctx->host_nrs, req.nr); - kbox_web_record_syscall(ctx->web, notif.pid, notif.data.nr, sname, - notif.data.args, disp, d.val, d.error, - latency); + kbox_web_record_syscall(ctx->web, (uint32_t) req.pid, req.nr, sname, + req.args, disp, d.val, d.error, latency); } #endif @@ -338,6 +356,13 @@ static int supervise_loop(struct kbox_supervisor_ctx *ctx) if (e == ENOENT && ctx->web) kbox_web_counters(ctx->web)->send_enoent++; #endif + ret = check_child(ctx->child_pid, &exit_code); + if (ret < 0) { + fprintf(stderr, "waitpid: %s\n", strerror(errno)); + return -1; + } + if (ret == 1) + return exit_code; continue; } fprintf(stderr, "kbox_notify_send: %s\n", strerror(e)); @@ -507,6 +532,8 @@ int kbox_run_supervisor(const struct kbox_sysnrs *sysnrs, ctx.host_nrs = host_nrs; ctx.fd_table = &fd_table; ctx.listener_fd = listener_fd; + ctx.proc_self_fd_dirfd = -1; + ctx.proc_mem_fd = -1; ctx.child_pid = pid; ctx.host_root = host_root; ctx.verbose = verbose; @@ -514,11 +541,17 @@ int kbox_run_supervisor(const struct kbox_sysnrs *sysnrs, ctx.override_uid = (uid_t) -1; ctx.override_gid = (gid_t) -1; ctx.normalize = normalize; + ctx.guest_mem_ops = &kbox_process_vm_guest_mem_ops; + ctx.fd_inject_ops = NULL; ctx.web = web; /* 4c. Enter supervisor loop. 
*/ exit_code = supervise_loop(&ctx); + if (ctx.proc_mem_fd >= 0) + close(ctx.proc_mem_fd); + if (ctx.proc_self_fd_dirfd >= 0) + close(ctx.proc_self_fd_dirfd); close(listener_fd); if (exit_code < 0) diff --git a/src/seccomp.h b/src/seccomp.h index 1d756ce..969db64 100644 --- a/src/seccomp.h +++ b/src/seccomp.h @@ -3,11 +3,15 @@ #define KBOX_SECCOMP_H #include +#include #include #include "fd-table.h" +#include "kbox/path.h" +#include "procmem.h" #include "seccomp-defs.h" #include "syscall-nr.h" +#include "syscall-trap-signal.h" struct kbox_dispatch { enum { @@ -19,12 +23,82 @@ struct kbox_dispatch { }; struct kbox_web_ctx; +struct kbox_fd_inject_ops; + +#define KBOX_PATH_SHADOW_CACHE_MAX 8 +#define KBOX_TRANSLATED_PATH_CACHE_MAX 8 +#define KBOX_LITERAL_PATH_CACHE_MAX 8 + +struct kbox_path_shadow_cache_entry { + int valid; + int memfd; + char path[KBOX_MAX_PATH]; + struct stat host_stat; +}; + +struct kbox_translated_path_cache_entry { + int valid; + unsigned generation; + char guest_path[KBOX_MAX_PATH]; + char translated[KBOX_MAX_PATH]; +}; + +struct kbox_literal_path_cache_entry { + int valid; + unsigned generation; + pid_t pid; + uint64_t guest_addr; + char translated[KBOX_MAX_PATH]; +}; + +enum kbox_syscall_source { + KBOX_SYSCALL_SOURCE_SECCOMP = 0, + KBOX_SYSCALL_SOURCE_TRAP, + KBOX_SYSCALL_SOURCE_REWRITE, +}; + +struct kbox_syscall_regs { + int nr; + uint64_t instruction_pointer; + uint64_t args[6]; +}; + +struct kbox_syscall_request { + enum kbox_syscall_source source; + pid_t pid; + uint64_t cookie; + int nr; + uint64_t instruction_pointer; + uint64_t args[6]; + struct kbox_guest_mem guest_mem; +}; + +static inline pid_t kbox_syscall_request_pid( + const struct kbox_syscall_request *req) +{ + return req->pid; +} + +static inline uint64_t kbox_syscall_request_cookie( + const struct kbox_syscall_request *req) +{ + return req->cookie; +} + +static inline uint64_t kbox_syscall_request_arg( + const struct kbox_syscall_request *req, + size_t idx) +{ + return idx < 
6 ? req->args[idx] : 0; +} struct kbox_supervisor_ctx { const struct kbox_sysnrs *sysnrs; const struct kbox_host_nrs *host_nrs; struct kbox_fd_table *fd_table; int listener_fd; + int proc_self_fd_dirfd; + int proc_mem_fd; pid_t child_pid; const char *host_root; int verbose; @@ -32,10 +106,30 @@ struct kbox_supervisor_ctx { uid_t override_uid; gid_t override_gid; int normalize; + const struct kbox_guest_mem_ops *guest_mem_ops; + struct kbox_guest_mem active_guest_mem; + const struct kbox_fd_inject_ops *fd_inject_ops; struct kbox_web_ctx *web; + unsigned active_writeback_shadows; + unsigned path_translation_generation; + struct kbox_path_shadow_cache_entry + path_shadow_cache[KBOX_PATH_SHADOW_CACHE_MAX]; + struct kbox_translated_path_cache_entry + translated_path_cache[KBOX_TRANSLATED_PATH_CACHE_MAX]; + struct kbox_literal_path_cache_entry + literal_path_cache[KBOX_LITERAL_PATH_CACHE_MAX]; }; int kbox_install_seccomp_listener(const struct kbox_host_nrs *h); +int kbox_install_seccomp_trap(const struct kbox_host_nrs *h); +int kbox_install_seccomp_trap_ranges( + const struct kbox_host_nrs *h, + const struct kbox_syscall_trap_ip_range *trap_ranges, + size_t trap_range_count); +int kbox_install_seccomp_rewrite_ranges( + const struct kbox_host_nrs *h, + const struct kbox_syscall_trap_ip_range *trap_ranges, + size_t trap_range_count); int kbox_notify_recv(int listener_fd, void *notif); int kbox_notify_send(int listener_fd, const void *resp); int kbox_notify_addfd(int listener_fd, @@ -47,8 +141,25 @@ int kbox_notify_addfd_at(int listener_fd, int srcfd, int target_fd, uint32_t newfd_flags); +int kbox_syscall_request_init_from_regs(struct kbox_syscall_request *out, + enum kbox_syscall_source source, + pid_t pid, + uint64_t cookie, + const struct kbox_syscall_regs *regs, + const struct kbox_guest_mem *guest_mem); +void kbox_dispatch_prepare_request_ctx(struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req); struct kbox_dispatch kbox_dispatch_syscall(struct 
kbox_supervisor_ctx *ctx, const void *notif); +int kbox_syscall_request_from_notif(const void *notif, + struct kbox_syscall_request *out); +struct kbox_dispatch kbox_dispatch_request( + struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req); +int kbox_dispatch_try_rewrite_wrapper_fast_path( + struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + struct kbox_dispatch *out); struct kbox_dispatch kbox_dispatch_continue(void); struct kbox_dispatch kbox_dispatch_errno(int err); struct kbox_dispatch kbox_dispatch_value(int64_t val); diff --git a/src/shadow-fd.c b/src/shadow-fd.c index e0a4ce4..8a62f3f 100644 --- a/src/shadow-fd.c +++ b/src/shadow-fd.c @@ -37,9 +37,6 @@ int kbox_shadow_create(const struct kbox_sysnrs *s, long lkl_fd) if (!S_ISREG(kst.st_mode)) return -ENODEV; - if (kst.st_size <= 0) - return -ENODEV; - if (kst.st_size > KBOX_SHADOW_MAX_SIZE) return -EFBIG; @@ -53,6 +50,9 @@ int kbox_shadow_create(const struct kbox_sysnrs *s, long lkl_fd) return -e; } + if (kst.st_size == 0) + return memfd; + /* Read from LKL in chunks via pread64 (position-independent) * and write to the memfd. 
*/ diff --git a/src/syscall-nr.c b/src/syscall-nr.c index a053530..3368637 100644 --- a/src/syscall-nr.c +++ b/src/syscall-nr.c @@ -436,7 +436,7 @@ const struct kbox_host_nrs HOST_NRS_AARCH64 = { .setsid = 157, .clock_gettime = 113, .clock_getres = 114, - .gettimeofday = -1, + .gettimeofday = 169, .readlinkat = 78, .pipe2 = 59, .pipe = -1, diff --git a/src/syscall-request.c b/src/syscall-request.c new file mode 100644 index 0000000..c157691 --- /dev/null +++ b/src/syscall-request.c @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: MIT */ + +#include + +#include "seccomp.h" + +int kbox_syscall_request_init_from_regs(struct kbox_syscall_request *out, + enum kbox_syscall_source source, + pid_t pid, + uint64_t cookie, + const struct kbox_syscall_regs *regs, + const struct kbox_guest_mem *guest_mem) +{ + if (!out || !regs) + return -1; + + memset(out, 0, sizeof(*out)); + out->source = source; + out->pid = pid; + out->cookie = cookie; + out->nr = regs->nr; + out->instruction_pointer = regs->instruction_pointer; + memcpy(out->args, regs->args, sizeof(out->args)); + + if (guest_mem) { + out->guest_mem = *guest_mem; + } else if (source == KBOX_SYSCALL_SOURCE_SECCOMP) { + out->guest_mem.ops = &kbox_process_vm_guest_mem_ops; + out->guest_mem.opaque = (uintptr_t) pid; + } + + return 0; +} + +int kbox_syscall_request_from_notif(const void *notif_ptr, + struct kbox_syscall_request *out) +{ + const struct kbox_seccomp_notif *notif = notif_ptr; + struct kbox_syscall_regs regs; + + if (!notif || !out) + return -1; + + regs.nr = notif->data.nr; + regs.instruction_pointer = notif->data.instruction_pointer; + memcpy(regs.args, notif->data.args, sizeof(regs.args)); + return kbox_syscall_request_init_from_regs(out, KBOX_SYSCALL_SOURCE_SECCOMP, + (pid_t) notif->pid, notif->id, + ®s, NULL); +} diff --git a/src/syscall-trap-signal.h b/src/syscall-trap-signal.h new file mode 100644 index 0000000..8cdff94 --- /dev/null +++ b/src/syscall-trap-signal.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: 
MIT */ +#ifndef KBOX_SYSCALL_TRAP_SIGNAL_H +#define KBOX_SYSCALL_TRAP_SIGNAL_H + +#include +#include + +struct kbox_syscall_trap_ip_range { + uintptr_t start; + uintptr_t end; +}; + +int kbox_syscall_trap_reserved_signal(void); +int kbox_syscall_trap_signal_is_reserved(int signum); +int kbox_syscall_trap_sigset_blocks_reserved(const void *mask, size_t len); +uintptr_t kbox_syscall_trap_host_syscall_ip(void); +int kbox_syscall_trap_host_syscall_range( + struct kbox_syscall_trap_ip_range *out); +int kbox_syscall_trap_internal_ip_ranges(struct kbox_syscall_trap_ip_range *out, + size_t cap, + size_t *count); +int64_t kbox_syscall_trap_host_syscall6(long nr, + uint64_t a0, + uint64_t a1, + uint64_t a2, + uint64_t a3, + uint64_t a4, + uint64_t a5); +int64_t kbox_syscall_trap_host_futex_wait_private(int *addr, int expected); +int64_t kbox_syscall_trap_host_futex_wake_private(int *addr, int count); +int64_t kbox_syscall_trap_host_exit_group_now(int status); +int64_t kbox_syscall_trap_host_execve_now(const char *pathname, + char *const argv[], + char *const envp[]); +int64_t kbox_syscall_trap_host_execveat_now(int dirfd, + const char *pathname, + char *const argv[], + char *const envp[], + int flags); +int64_t kbox_syscall_trap_host_clone_now(uint64_t a0, + uint64_t a1, + uint64_t a2, + uint64_t a3, + uint64_t a4); +int64_t kbox_syscall_trap_host_clone3_now(const void *uargs, size_t size); +#if defined(__x86_64__) +int64_t kbox_syscall_trap_host_fork_now(void); +int64_t kbox_syscall_trap_host_vfork_now(void); +#endif +#if defined(__x86_64__) +int64_t kbox_syscall_trap_host_arch_prctl_get_fs(uint64_t *out); +int64_t kbox_syscall_trap_host_arch_prctl_set_fs(uint64_t val); +#endif +int64_t kbox_syscall_trap_host_rt_sigprocmask_unblock(const uint64_t *mask, + size_t sigset_size); + +#endif /* KBOX_SYSCALL_TRAP_SIGNAL_H */ diff --git a/src/syscall-trap.c b/src/syscall-trap.c new file mode 100644 index 0000000..b2bed46 --- /dev/null +++ b/src/syscall-trap.c @@ -0,0 +1,1549 @@ +/* 
SPDX-License-Identifier: MIT */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "syscall-trap.h" + +static struct kbox_syscall_trap_runtime *active_trap_runtime; + +#ifndef FUTEX_WAIT_PRIVATE +#define FUTEX_WAIT_PRIVATE 128 +#endif + +#ifndef FUTEX_WAKE_PRIVATE +#define FUTEX_WAKE_PRIVATE 129 +#endif + +static inline struct kbox_syscall_trap_runtime *load_active_trap_runtime(void) +{ + return __atomic_load_n(&active_trap_runtime, __ATOMIC_ACQUIRE); +} + +static inline void store_active_trap_runtime( + struct kbox_syscall_trap_runtime *runtime) +{ + __atomic_store_n(&active_trap_runtime, runtime, __ATOMIC_RELEASE); +} + +#define STR2(x) #x +#define XSTR(x) STR2(x) + +static int wait_for_pending_dispatch(struct kbox_syscall_trap_runtime *runtime); + +#if defined(__x86_64__) +#define KBOX_ARCH_SET_FS 0x1002 +#define KBOX_ARCH_GET_FS 0x1003 + +/* FSGSBASE detection. On kernels 5.9+ with CR4.FSGSBASE set, rdfsbase + * and wrfsbase are available in userspace. These take ~2ns vs ~1.5us + * for arch_prctl via the host trampoline. Probed once at install time. 
+ */ +static int have_fsgsbase = -1; +static sigjmp_buf fsgsbase_probe_jmpbuf; + +static void fsgsbase_probe_sigill(int signo) +{ + (void) signo; + siglongjmp(fsgsbase_probe_jmpbuf, 1); +} + +static void probe_fsgsbase(void) +{ + struct sigaction sa, old; + uint64_t val; + + memset(&sa, 0, sizeof(sa)); + sigemptyset(&sa.sa_mask); + sa.sa_handler = fsgsbase_probe_sigill; + sa.sa_flags = 0; + + if (sigaction(SIGILL, &sa, &old) < 0) { + have_fsgsbase = 0; + return; + } + + if (sigsetjmp(fsgsbase_probe_jmpbuf, 1) == 0) { + __asm__ volatile("rdfsbase %0" : "=r"(val)); + have_fsgsbase = 1; + } else { + have_fsgsbase = 0; + } + + sigaction(SIGILL, &old, NULL); +} + +static uint64_t read_host_fs_base(void) +{ + uint64_t val; + + if (have_fsgsbase) { + __asm__ volatile("rdfsbase %0" : "=r"(val)); + return val; + } + val = 0; + kbox_syscall_trap_host_arch_prctl_get_fs(&val); + return val; +} + +static void write_host_fs_base(uint64_t val) +{ + if (have_fsgsbase) { + __asm__ volatile("wrfsbase %0" : : "r"(val) : "memory"); + return; + } + kbox_syscall_trap_host_arch_prctl_set_fs(val); +} +#endif + +#if defined(__x86_64__) +extern char kbox_syscall_trap_host_syscall_start[]; +extern char kbox_syscall_trap_host_syscall_ip_label[]; +extern char kbox_syscall_trap_host_syscall_end[]; +extern char kbox_syscall_trap_host_futex_wait_start[]; +extern char kbox_syscall_trap_host_futex_wait_end[]; +extern char kbox_syscall_trap_host_futex_wake_start[]; +extern char kbox_syscall_trap_host_futex_wake_end[]; +extern char kbox_syscall_trap_host_exit_group_start[]; +extern char kbox_syscall_trap_host_exit_group_end[]; +extern char kbox_syscall_trap_host_execve_start[]; +extern char kbox_syscall_trap_host_execve_end[]; +extern char kbox_syscall_trap_host_execveat_start[]; +extern char kbox_syscall_trap_host_execveat_end[]; +extern char kbox_syscall_trap_host_clone_start[]; +extern char kbox_syscall_trap_host_clone_end[]; +extern char kbox_syscall_trap_host_clone3_start[]; +extern char 
kbox_syscall_trap_host_clone3_end[]; +extern char kbox_syscall_trap_host_fork_start[]; +extern char kbox_syscall_trap_host_fork_end[]; +extern char kbox_syscall_trap_host_vfork_start[]; +extern char kbox_syscall_trap_host_vfork_end[]; +extern char kbox_syscall_trap_host_arch_prctl_get_fs_start[]; +extern char kbox_syscall_trap_host_arch_prctl_get_fs_end[]; +extern char kbox_syscall_trap_host_arch_prctl_set_fs_start[]; +extern char kbox_syscall_trap_host_arch_prctl_set_fs_end[]; +extern char kbox_syscall_trap_host_rt_sigprocmask_unblock_start[]; +extern char kbox_syscall_trap_host_rt_sigprocmask_unblock_end[]; + +__asm__( + ".text\n" + ".globl kbox_syscall_trap_host_syscall6\n" + ".type kbox_syscall_trap_host_syscall6,@function\n" + ".globl kbox_syscall_trap_host_syscall_start\n" + "kbox_syscall_trap_host_syscall_start:\n" + "kbox_syscall_trap_host_syscall6:\n" + "mov %rdi, %rax\n" + "mov %rsi, %rdi\n" + "mov %rdx, %rsi\n" + "mov %rcx, %rdx\n" + "mov %r8, %r10\n" + "mov %r9, %r8\n" + "mov 8(%rsp), %r9\n" + ".globl kbox_syscall_trap_host_syscall_ip_label\n" + "kbox_syscall_trap_host_syscall_ip_label:\n" + "syscall\n" + ".globl kbox_syscall_trap_host_syscall_end\n" + "kbox_syscall_trap_host_syscall_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_syscall6, " + ".-kbox_syscall_trap_host_syscall6\n" + + ".globl kbox_syscall_trap_host_futex_wait_private\n" + ".type kbox_syscall_trap_host_futex_wait_private,@function\n" + ".globl kbox_syscall_trap_host_futex_wait_start\n" + "kbox_syscall_trap_host_futex_wait_start:\n" + "kbox_syscall_trap_host_futex_wait_private:\n" + "mov %rsi, %rdx\n" + "mov $" XSTR(FUTEX_WAIT_PRIVATE) ", %esi\n" + "xor %r10d, %r10d\n" + "xor %r8d, %r8d\n" + "xor %r9d, %r9d\n" + "mov $" XSTR(__NR_futex) ", %eax\n" + "syscall\n" + ".globl kbox_syscall_trap_host_futex_wait_end\n" + "kbox_syscall_trap_host_futex_wait_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_futex_wait_private, " + ".-kbox_syscall_trap_host_futex_wait_private\n" + + ".globl 
kbox_syscall_trap_host_futex_wake_private\n" + ".type kbox_syscall_trap_host_futex_wake_private,@function\n" + ".globl kbox_syscall_trap_host_futex_wake_start\n" + "kbox_syscall_trap_host_futex_wake_start:\n" + "kbox_syscall_trap_host_futex_wake_private:\n" + "mov %rsi, %rdx\n" + "mov $" XSTR(FUTEX_WAKE_PRIVATE) ", %esi\n" + "xor %r10d, %r10d\n" + "xor %r8d, %r8d\n" + "xor %r9d, %r9d\n" + "mov $" XSTR(__NR_futex) ", %eax\n" + "syscall\n" + ".globl kbox_syscall_trap_host_futex_wake_end\n" + "kbox_syscall_trap_host_futex_wake_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_futex_wake_private, " + ".-kbox_syscall_trap_host_futex_wake_private\n" + + ".globl kbox_syscall_trap_host_exit_group_now\n" + ".type kbox_syscall_trap_host_exit_group_now,@function\n" + ".globl kbox_syscall_trap_host_exit_group_start\n" + "kbox_syscall_trap_host_exit_group_start:\n" + "kbox_syscall_trap_host_exit_group_now:\n" + "mov $" XSTR(__NR_exit_group) ", %eax\n" + "syscall\n" + ".globl kbox_syscall_trap_host_exit_group_end\n" + "kbox_syscall_trap_host_exit_group_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_exit_group_now, " + ".-kbox_syscall_trap_host_exit_group_now\n" + + ".globl kbox_syscall_trap_host_execve_now\n" + ".type kbox_syscall_trap_host_execve_now,@function\n" + ".globl kbox_syscall_trap_host_execve_start\n" + "kbox_syscall_trap_host_execve_start:\n" + "kbox_syscall_trap_host_execve_now:\n" + "xor %r10d, %r10d\n" + "xor %r8d, %r8d\n" + "xor %r9d, %r9d\n" + "mov $" XSTR(__NR_execve) ", %eax\n" + "syscall\n" + ".globl kbox_syscall_trap_host_execve_end\n" + "kbox_syscall_trap_host_execve_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_execve_now, " + ".-kbox_syscall_trap_host_execve_now\n" + + ".globl kbox_syscall_trap_host_execveat_now\n" + ".type kbox_syscall_trap_host_execveat_now,@function\n" + ".globl kbox_syscall_trap_host_execveat_start\n" + "kbox_syscall_trap_host_execveat_start:\n" + "kbox_syscall_trap_host_execveat_now:\n" + "mov %rcx, %r10\n" + "xor %r9d, 
%r9d\n" + "mov $" XSTR(__NR_execveat) ", %eax\n" + "syscall\n" + ".globl kbox_syscall_trap_host_execveat_end\n" + "kbox_syscall_trap_host_execveat_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_execveat_now, " + ".-kbox_syscall_trap_host_execveat_now\n" + + ".globl kbox_syscall_trap_host_clone_now\n" + ".type kbox_syscall_trap_host_clone_now,@function\n" + ".globl kbox_syscall_trap_host_clone_start\n" + "kbox_syscall_trap_host_clone_start:\n" + "kbox_syscall_trap_host_clone_now:\n" + "mov %rcx, %r10\n" + "xor %r9d, %r9d\n" + "mov $" XSTR(__NR_clone) ", %eax\n" + "syscall\n" + ".globl kbox_syscall_trap_host_clone_end\n" + "kbox_syscall_trap_host_clone_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_clone_now, " + ".-kbox_syscall_trap_host_clone_now\n" + + ".globl kbox_syscall_trap_host_clone3_now\n" + ".type kbox_syscall_trap_host_clone3_now,@function\n" + ".globl kbox_syscall_trap_host_clone3_start\n" + "kbox_syscall_trap_host_clone3_start:\n" + "kbox_syscall_trap_host_clone3_now:\n" + "xor %edx, %edx\n" + "xor %r10d, %r10d\n" + "xor %r8d, %r8d\n" + "xor %r9d, %r9d\n" + "mov $" XSTR(__NR_clone3) ", %eax\n" + "syscall\n" + ".globl kbox_syscall_trap_host_clone3_end\n" + "kbox_syscall_trap_host_clone3_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_clone3_now, " + ".-kbox_syscall_trap_host_clone3_now\n" + + ".globl kbox_syscall_trap_host_fork_now\n" + ".type kbox_syscall_trap_host_fork_now,@function\n" + ".globl kbox_syscall_trap_host_fork_start\n" + "kbox_syscall_trap_host_fork_start:\n" + "kbox_syscall_trap_host_fork_now:\n" + "mov $" XSTR(__NR_fork) ", %eax\n" + "syscall\n" + ".globl kbox_syscall_trap_host_fork_end\n" + "kbox_syscall_trap_host_fork_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_fork_now, " + ".-kbox_syscall_trap_host_fork_now\n" + + ".globl kbox_syscall_trap_host_vfork_now\n" + ".type kbox_syscall_trap_host_vfork_now,@function\n" + ".globl kbox_syscall_trap_host_vfork_start\n" + "kbox_syscall_trap_host_vfork_start:\n" + 
"kbox_syscall_trap_host_vfork_now:\n" + "mov $" XSTR(__NR_vfork) ", %eax\n" + "syscall\n" + ".globl kbox_syscall_trap_host_vfork_end\n" + "kbox_syscall_trap_host_vfork_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_vfork_now, " + ".-kbox_syscall_trap_host_vfork_now\n" + + ".globl kbox_syscall_trap_host_arch_prctl_get_fs\n" + ".type kbox_syscall_trap_host_arch_prctl_get_fs,@function\n" + ".globl kbox_syscall_trap_host_arch_prctl_get_fs_start\n" + "kbox_syscall_trap_host_arch_prctl_get_fs_start:\n" + "kbox_syscall_trap_host_arch_prctl_get_fs:\n" + "mov %rdi, %rsi\n" + "mov $" XSTR(KBOX_ARCH_GET_FS) ", %edi\n" + "xor %edx, %edx\n" + "xor %r10d, %r10d\n" + "xor %r8d, %r8d\n" + "xor %r9d, %r9d\n" + "mov $" XSTR(__NR_arch_prctl) ", %eax\n" + "syscall\n" + ".globl kbox_syscall_trap_host_arch_prctl_get_fs_end\n" + "kbox_syscall_trap_host_arch_prctl_get_fs_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_arch_prctl_get_fs, " + ".-kbox_syscall_trap_host_arch_prctl_get_fs\n" + + ".globl kbox_syscall_trap_host_arch_prctl_set_fs\n" + ".type kbox_syscall_trap_host_arch_prctl_set_fs,@function\n" + ".globl kbox_syscall_trap_host_arch_prctl_set_fs_start\n" + "kbox_syscall_trap_host_arch_prctl_set_fs_start:\n" + "kbox_syscall_trap_host_arch_prctl_set_fs:\n" + "mov %rdi, %rsi\n" + "mov $" XSTR(KBOX_ARCH_SET_FS) ", %edi\n" + "xor %edx, %edx\n" + "xor %r10d, %r10d\n" + "xor %r8d, %r8d\n" + "xor %r9d, %r9d\n" + "mov $" XSTR(__NR_arch_prctl) ", %eax\n" + "syscall\n" + ".globl kbox_syscall_trap_host_arch_prctl_set_fs_end\n" + "kbox_syscall_trap_host_arch_prctl_set_fs_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_arch_prctl_set_fs, " + ".-kbox_syscall_trap_host_arch_prctl_set_fs\n" + + ".globl kbox_syscall_trap_host_rt_sigprocmask_unblock\n" + ".type kbox_syscall_trap_host_rt_sigprocmask_unblock,@function\n" + ".globl kbox_syscall_trap_host_rt_sigprocmask_unblock_start\n" + "kbox_syscall_trap_host_rt_sigprocmask_unblock_start:\n" + "kbox_syscall_trap_host_rt_sigprocmask_unblock:\n" 
+ "mov %rsi, %r10\n" + "mov %rdi, %rsi\n" + "xor %edx, %edx\n" + "xor %r8d, %r8d\n" + "xor %r9d, %r9d\n" + "mov $" XSTR(SIG_UNBLOCK) ", %edi\n" + "mov $" XSTR(__NR_rt_sigprocmask) ", %eax\n" + "syscall\n" + ".globl kbox_syscall_trap_host_rt_sigprocmask_unblock_end\n" + "kbox_syscall_trap_host_rt_sigprocmask_unblock_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_rt_sigprocmask_unblock, " + ".-kbox_syscall_trap_host_rt_sigprocmask_unblock\n"); + +extern int64_t kbox_syscall_trap_host_syscall6(long nr, + uint64_t a0, + uint64_t a1, + uint64_t a2, + uint64_t a3, + uint64_t a4, + uint64_t a5); +extern int64_t kbox_syscall_trap_host_futex_wait_private(int *addr, + int expected); +extern int64_t kbox_syscall_trap_host_futex_wake_private(int *addr, int count); +extern int64_t kbox_syscall_trap_host_exit_group_now(int status); +extern int64_t kbox_syscall_trap_host_execve_now(const char *pathname, + char *const argv[], + char *const envp[]); +extern int64_t kbox_syscall_trap_host_execveat_now(int dirfd, + const char *pathname, + char *const argv[], + char *const envp[], + int flags); +extern int64_t kbox_syscall_trap_host_clone_now(uint64_t a0, + uint64_t a1, + uint64_t a2, + uint64_t a3, + uint64_t a4); +extern int64_t kbox_syscall_trap_host_clone3_now(const void *uargs, + size_t size); +extern int64_t kbox_syscall_trap_host_fork_now(void); +extern int64_t kbox_syscall_trap_host_vfork_now(void); +extern int64_t kbox_syscall_trap_host_arch_prctl_get_fs(uint64_t *out); +extern int64_t kbox_syscall_trap_host_arch_prctl_set_fs(uint64_t val); +extern int64_t kbox_syscall_trap_host_rt_sigprocmask_unblock( + const uint64_t *mask, + size_t sigset_size); +#elif defined(__aarch64__) +extern char kbox_syscall_trap_host_syscall_start[]; +extern char kbox_syscall_trap_host_syscall_ip_label[]; +extern char kbox_syscall_trap_host_syscall_end[]; +extern char kbox_syscall_trap_host_futex_wait_start[]; +extern char kbox_syscall_trap_host_futex_wait_end[]; +extern char 
kbox_syscall_trap_host_futex_wake_start[]; +extern char kbox_syscall_trap_host_futex_wake_end[]; +extern char kbox_syscall_trap_host_exit_group_start[]; +extern char kbox_syscall_trap_host_exit_group_end[]; +extern char kbox_syscall_trap_host_execve_start[]; +extern char kbox_syscall_trap_host_execve_end[]; +extern char kbox_syscall_trap_host_execveat_start[]; +extern char kbox_syscall_trap_host_execveat_end[]; +extern char kbox_syscall_trap_host_clone_start[]; +extern char kbox_syscall_trap_host_clone_end[]; +extern char kbox_syscall_trap_host_clone3_start[]; +extern char kbox_syscall_trap_host_clone3_end[]; +extern char kbox_syscall_trap_host_rt_sigprocmask_unblock_start[]; +extern char kbox_syscall_trap_host_rt_sigprocmask_unblock_end[]; + +__asm__( + ".text\n" + ".globl kbox_syscall_trap_host_syscall6\n" + ".type kbox_syscall_trap_host_syscall6,%function\n" + ".globl kbox_syscall_trap_host_syscall_start\n" + "kbox_syscall_trap_host_syscall_start:\n" + "kbox_syscall_trap_host_syscall6:\n" + "mov x8, x0\n" + "mov x0, x1\n" + "mov x1, x2\n" + "mov x2, x3\n" + "mov x3, x4\n" + "mov x4, x5\n" + "mov x5, x6\n" + ".globl kbox_syscall_trap_host_syscall_ip_label\n" + "kbox_syscall_trap_host_syscall_ip_label:\n" + "svc #0\n" + ".globl kbox_syscall_trap_host_syscall_end\n" + "kbox_syscall_trap_host_syscall_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_syscall6, " + ".-kbox_syscall_trap_host_syscall6\n" + + ".globl kbox_syscall_trap_host_futex_wait_private\n" + ".type kbox_syscall_trap_host_futex_wait_private,%function\n" + ".globl kbox_syscall_trap_host_futex_wait_start\n" + "kbox_syscall_trap_host_futex_wait_start:\n" + "kbox_syscall_trap_host_futex_wait_private:\n" + "mov x8, #" XSTR(__NR_futex) "\n" + "mov x2, x1\n" + "mov x1, #" XSTR(FUTEX_WAIT_PRIVATE) "\n" + "mov x3, xzr\n" + "mov x4, xzr\n" + "mov x5, xzr\n" + "svc #0\n" + ".globl kbox_syscall_trap_host_futex_wait_end\n" + "kbox_syscall_trap_host_futex_wait_end:\n" + "ret\n" + ".size 
kbox_syscall_trap_host_futex_wait_private, " + ".-kbox_syscall_trap_host_futex_wait_private\n" + + ".globl kbox_syscall_trap_host_futex_wake_private\n" + ".type kbox_syscall_trap_host_futex_wake_private,%function\n" + ".globl kbox_syscall_trap_host_futex_wake_start\n" + "kbox_syscall_trap_host_futex_wake_start:\n" + "kbox_syscall_trap_host_futex_wake_private:\n" + "mov x8, #" XSTR(__NR_futex) "\n" + "mov x2, x1\n" + "mov x1, #" XSTR(FUTEX_WAKE_PRIVATE) "\n" + "mov x3, xzr\n" + "mov x4, xzr\n" + "mov x5, xzr\n" + "svc #0\n" + ".globl kbox_syscall_trap_host_futex_wake_end\n" + "kbox_syscall_trap_host_futex_wake_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_futex_wake_private, " + ".-kbox_syscall_trap_host_futex_wake_private\n" + + ".globl kbox_syscall_trap_host_exit_group_now\n" + ".type kbox_syscall_trap_host_exit_group_now,%function\n" + ".globl kbox_syscall_trap_host_exit_group_start\n" + "kbox_syscall_trap_host_exit_group_start:\n" + "kbox_syscall_trap_host_exit_group_now:\n" + "mov x8, #" XSTR(__NR_exit_group) "\n" + "svc #0\n" + ".globl kbox_syscall_trap_host_exit_group_end\n" + "kbox_syscall_trap_host_exit_group_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_exit_group_now, " + ".-kbox_syscall_trap_host_exit_group_now\n" + + ".globl kbox_syscall_trap_host_execve_now\n" + ".type kbox_syscall_trap_host_execve_now,%function\n" + ".globl kbox_syscall_trap_host_execve_start\n" + "kbox_syscall_trap_host_execve_start:\n" + "kbox_syscall_trap_host_execve_now:\n" + "mov x8, #" XSTR(__NR_execve) "\n" + "svc #0\n" + ".globl kbox_syscall_trap_host_execve_end\n" + "kbox_syscall_trap_host_execve_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_execve_now, " + ".-kbox_syscall_trap_host_execve_now\n" + + ".globl kbox_syscall_trap_host_execveat_now\n" + ".type kbox_syscall_trap_host_execveat_now,%function\n" + ".globl kbox_syscall_trap_host_execveat_start\n" + "kbox_syscall_trap_host_execveat_start:\n" + "kbox_syscall_trap_host_execveat_now:\n" + "mov x8, #" 
XSTR(__NR_execveat) "\n" + "svc #0\n" + ".globl kbox_syscall_trap_host_execveat_end\n" + "kbox_syscall_trap_host_execveat_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_execveat_now, " + ".-kbox_syscall_trap_host_execveat_now\n" + + ".globl kbox_syscall_trap_host_clone_now\n" + ".type kbox_syscall_trap_host_clone_now,%function\n" + ".globl kbox_syscall_trap_host_clone_start\n" + "kbox_syscall_trap_host_clone_start:\n" + "kbox_syscall_trap_host_clone_now:\n" + "mov x8, #" XSTR(__NR_clone) "\n" + "svc #0\n" + ".globl kbox_syscall_trap_host_clone_end\n" + "kbox_syscall_trap_host_clone_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_clone_now, " + ".-kbox_syscall_trap_host_clone_now\n" + + ".globl kbox_syscall_trap_host_clone3_now\n" + ".type kbox_syscall_trap_host_clone3_now,%function\n" + ".globl kbox_syscall_trap_host_clone3_start\n" + "kbox_syscall_trap_host_clone3_start:\n" + "kbox_syscall_trap_host_clone3_now:\n" + "mov x8, #" XSTR(__NR_clone3) "\n" + "svc #0\n" + ".globl kbox_syscall_trap_host_clone3_end\n" + "kbox_syscall_trap_host_clone3_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_clone3_now, " + ".-kbox_syscall_trap_host_clone3_now\n" + + ".globl kbox_syscall_trap_host_rt_sigprocmask_unblock\n" + ".type kbox_syscall_trap_host_rt_sigprocmask_unblock,%function\n" + ".globl kbox_syscall_trap_host_rt_sigprocmask_unblock_start\n" + "kbox_syscall_trap_host_rt_sigprocmask_unblock_start:\n" + "kbox_syscall_trap_host_rt_sigprocmask_unblock:\n" + "mov x8, #" XSTR(__NR_rt_sigprocmask) "\n" + "mov x3, x1\n" + "mov x1, x0\n" + "mov x0, #" XSTR(SIG_UNBLOCK) "\n" + "mov x2, xzr\n" + "mov x4, xzr\n" + "mov x5, xzr\n" + "svc #0\n" + ".globl kbox_syscall_trap_host_rt_sigprocmask_unblock_end\n" + "kbox_syscall_trap_host_rt_sigprocmask_unblock_end:\n" + "ret\n" + ".size kbox_syscall_trap_host_rt_sigprocmask_unblock, " + ".-kbox_syscall_trap_host_rt_sigprocmask_unblock\n"); + +extern int64_t kbox_syscall_trap_host_syscall6(long nr, + uint64_t a0, + uint64_t a1, + 
uint64_t a2,
                                               uint64_t a3,
                                               uint64_t a4,
                                               uint64_t a5);
extern int64_t kbox_syscall_trap_host_futex_wait_private(int *addr,
                                                         int expected);
extern int64_t kbox_syscall_trap_host_futex_wake_private(int *addr, int count);
extern int64_t kbox_syscall_trap_host_exit_group_now(int status);
extern int64_t kbox_syscall_trap_host_execve_now(const char *pathname,
                                                 char *const argv[],
                                                 char *const envp[]);
extern int64_t kbox_syscall_trap_host_execveat_now(int dirfd,
                                                   const char *pathname,
                                                   char *const argv[],
                                                   char *const envp[],
                                                   int flags);
extern int64_t kbox_syscall_trap_host_clone_now(uint64_t a0,
                                                uint64_t a1,
                                                uint64_t a2,
                                                uint64_t a3,
                                                uint64_t a4);
extern int64_t kbox_syscall_trap_host_clone3_now(const void *uargs,
                                                 size_t size);
extern int64_t kbox_syscall_trap_host_rt_sigprocmask_unblock(
    const uint64_t *mask,
    size_t sigset_size);
#endif

/* Default execute op: with the service thread running, hand the request to
 * it and block on the dispatch futex; otherwise dispatch synchronously on
 * the calling thread.
 */
static int direct_trap_execute(struct kbox_syscall_trap_runtime *runtime,
                               const struct kbox_syscall_request *req,
                               struct kbox_dispatch *out)
{
    if (!runtime || !runtime->ctx || !req || !out)
        return -1;

    if (runtime->service_running) {
        if (kbox_syscall_trap_runtime_capture(runtime, req) < 0)
            return -1;
        if (wait_for_pending_dispatch(runtime) < 0)
            return -1;
        return kbox_syscall_trap_runtime_take_dispatch(runtime, out);
    }

    *out = kbox_dispatch_request(runtime->ctx, req);
    return 0;
}

static const struct kbox_syscall_trap_ops direct_trap_ops = {
    .execute = direct_trap_execute,
};

/* write(2) through the raw host trampoline; its syscall instruction sits in
 * the seccomp ALLOW IP range, so it is safe from inside the SIGSYS handler.
 * Returns the raw kernel result (negative errno on failure).
 */
static ssize_t trap_host_write(int fd, const void *buf, size_t len)
{
    return (ssize_t) kbox_syscall_trap_host_syscall6(__NR_write, (uint64_t) fd,
                                                     (uint64_t) (uintptr_t) buf,
                                                     (uint64_t) len, 0, 0, 0);
}

/* FUTEX_WAIT_PRIVATE via the host trampoline; retries on EINTR.
 * -EAGAIN (value already changed) counts as success.
 */
static int trap_host_futex_wait(int *addr, int expected)
{
    int64_t rc;

    do {
        rc = kbox_syscall_trap_host_futex_wait_private(addr, expected);
    } while (rc == -EINTR);

    if (rc == 0 || rc == -EAGAIN)
        return 0;
    return -1;
}

/* FUTEX_WAKE_PRIVATE (all waiters) via the host trampoline. */
static int trap_host_futex_wake(int *addr)
{
    int64_t rc;

    do {
        rc = kbox_syscall_trap_host_futex_wake_private(addr, INT_MAX);
    } while (rc == -EINTR);

    return rc < 0 ? -1 : 0;
}

/* FUTEX_WAKE_PRIVATE (one waiter) via libc syscall(); used from supervisor
 * context where the trampoline path is not required.
 */
static int trap_futex_wake(int *addr)
{
    long rc;

    do {
        rc = syscall(SYS_futex, addr, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
    } while (rc < 0 && errno == EINTR);

    return rc < 0 ? -1 : 0;
}

/* Block until the service thread publishes a dispatch result
 * (has_pending_dispatch becomes non-zero).
 */
static int wait_for_pending_dispatch(struct kbox_syscall_trap_runtime *runtime)
{
    if (!runtime)
        return -1;

    for (;;) {
        if (__atomic_load_n(&runtime->has_pending_dispatch, __ATOMIC_ACQUIRE))
            return 0;
        if (trap_host_futex_wait(&runtime->has_pending_dispatch, 0) < 0)
            return -1;
    }
}

/* exit_group(2) via the host trampoline; never returns. */
__attribute__((noreturn)) static void trap_host_exit_group(int status)
{
    (void) kbox_syscall_trap_host_exit_group_now(status);
    __builtin_unreachable();
}

/* The signal number reserved for trap-mode interception. */
int kbox_syscall_trap_reserved_signal(void)
{
    return SIGSYS;
}

int kbox_syscall_trap_signal_is_reserved(int signum)
{
    return signum == kbox_syscall_trap_reserved_signal();
}

/* Test whether a raw sigset blob (len bytes) has the reserved signal's bit
 * set. Signal numbering is 1-based, hence the -1 before the bit split.
 */
int kbox_syscall_trap_sigset_blocks_reserved(const void *mask, size_t len)
{
    const unsigned char *bytes = mask;
    unsigned int signo = (unsigned int) kbox_syscall_trap_reserved_signal();
    unsigned int bit = signo - 1U;
    unsigned int byte_index = bit / 8U;
    unsigned int bit_index = bit % 8U;

    if (!mask || len <= byte_index)
        return 0;
    return (bytes[byte_index] & (1U << bit_index)) != 0;
}

uintptr_t kbox_syscall_trap_host_syscall_ip(void)
{
#if defined(__x86_64__) || defined(__aarch64__)
    return (uintptr_t) kbox_syscall_trap_host_syscall_ip_label;
#else
    return 0;
#endif
}

/* Report the [start, end) IP range of the generic host-syscall trampoline,
 * with a small tail window (see comment below).
 */
int kbox_syscall_trap_host_syscall_range(struct kbox_syscall_trap_ip_range *out)
{
#if defined(__x86_64__) || defined(__aarch64__)
    uintptr_t start = (uintptr_t) kbox_syscall_trap_host_syscall_start;
    uintptr_t end = (uintptr_t) kbox_syscall_trap_host_syscall_end;

    if (!out || start >= end)
        return -1;
    out->start = start;
    /* seccomp reports a post-syscall instruction
pointer. On some x86_64
     * builds that can land slightly past the raw `syscall` instruction,
     * so leave a small tail window after the trampoline body instead of
     * assuming the first byte of `ret` is always enough.
     */
    out->end = end + 16;
    return 0;
#else
    (void) out;
    return -1;
#endif
}

/* Append one trampoline [start, end) range to out[], applying the same
 * 16-byte tail window as kbox_syscall_trap_host_syscall_range.
 */
static int append_ip_range(struct kbox_syscall_trap_ip_range *out,
                           size_t cap,
                           size_t *count,
                           uintptr_t start,
                           uintptr_t end)
{
    if (!out || !count || *count >= cap || start >= end)
        return -1;
    out[*count].start = start;
    out[*count].end = end + 16;
    (*count)++;
    return 0;
}

/* Collect the IP ranges of every internal host trampoline for this
 * architecture so the seccomp filter can whitelist them.
 */
int kbox_syscall_trap_internal_ip_ranges(struct kbox_syscall_trap_ip_range *out,
                                         size_t cap,
                                         size_t *count)
{
    size_t n = 0;

    if (!out || !count)
        return -1;

#if defined(__x86_64__)
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_futex_wait_start,
                        (uintptr_t) kbox_syscall_trap_host_futex_wait_end) < 0)
        return -1;
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_futex_wake_start,
                        (uintptr_t) kbox_syscall_trap_host_futex_wake_end) < 0)
        return -1;
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_exit_group_start,
                        (uintptr_t) kbox_syscall_trap_host_exit_group_end) < 0)
        return -1;
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_execve_start,
                        (uintptr_t) kbox_syscall_trap_host_execve_end) < 0)
        return -1;
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_execveat_start,
                        (uintptr_t) kbox_syscall_trap_host_execveat_end) < 0)
        return -1;
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_clone_start,
                        (uintptr_t) kbox_syscall_trap_host_clone_end) < 0)
        return -1;
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_clone3_start,
                        (uintptr_t) kbox_syscall_trap_host_clone3_end) < 0)
        return -1;
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_fork_start,
                        (uintptr_t) kbox_syscall_trap_host_fork_end) < 0)
        return -1;
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_vfork_start,
                        (uintptr_t) kbox_syscall_trap_host_vfork_end) < 0)
        return -1;
    if (append_ip_range(
            out, cap, &n,
            (uintptr_t) kbox_syscall_trap_host_arch_prctl_get_fs_start,
            (uintptr_t) kbox_syscall_trap_host_arch_prctl_get_fs_end) < 0)
        return -1;
    if (append_ip_range(
            out, cap, &n,
            (uintptr_t) kbox_syscall_trap_host_arch_prctl_set_fs_start,
            (uintptr_t) kbox_syscall_trap_host_arch_prctl_set_fs_end) < 0)
        return -1;
    if (append_ip_range(
            out, cap, &n,
            (uintptr_t) kbox_syscall_trap_host_rt_sigprocmask_unblock_start,
            (uintptr_t) kbox_syscall_trap_host_rt_sigprocmask_unblock_end) < 0)
        return -1;
#elif defined(__aarch64__)
    /* aarch64 has no fork/vfork/arch_prctl trampolines; otherwise mirrors
     * the x86_64 list above.
     */
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_futex_wait_start,
                        (uintptr_t) kbox_syscall_trap_host_futex_wait_end) < 0)
        return -1;
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_futex_wake_start,
                        (uintptr_t) kbox_syscall_trap_host_futex_wake_end) < 0)
        return -1;
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_exit_group_start,
                        (uintptr_t) kbox_syscall_trap_host_exit_group_end) < 0)
        return -1;
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_execve_start,
                        (uintptr_t) kbox_syscall_trap_host_execve_end) < 0)
        return -1;
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_execveat_start,
                        (uintptr_t) kbox_syscall_trap_host_execveat_end) < 0)
        return -1;
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_clone_start,
                        (uintptr_t) kbox_syscall_trap_host_clone_end) < 0)
        return -1;
    if (append_ip_range(out, cap, &n,
                        (uintptr_t) kbox_syscall_trap_host_clone3_start,
                        (uintptr_t) kbox_syscall_trap_host_clone3_end) < 0)
        return -1;
    if (append_ip_range(
            out, cap, &n,
            (uintptr_t) kbox_syscall_trap_host_rt_sigprocmask_unblock_start,
+ (uintptr_t) kbox_syscall_trap_host_rt_sigprocmask_unblock_end) < 0) + return -1; +#else + return -1; +#endif + + *count = n; + return 0; +} + +int kbox_syscall_regs_from_sigsys(const siginfo_t *info, + const void *ucontext_ptr, + struct kbox_syscall_regs *out) +{ + const ucontext_t *uc = ucontext_ptr; + + if (!info || !ucontext_ptr || !out) + return -1; + if (info->si_signo != SIGSYS) + return -1; + + memset(out, 0, sizeof(*out)); + +#if defined(__x86_64__) + out->nr = (info->si_syscall != 0) ? info->si_syscall + : (int) uc->uc_mcontext.gregs[REG_RAX]; + out->instruction_pointer = (uint64_t) uc->uc_mcontext.gregs[REG_RIP]; + out->args[0] = (uint64_t) uc->uc_mcontext.gregs[REG_RDI]; + out->args[1] = (uint64_t) uc->uc_mcontext.gregs[REG_RSI]; + out->args[2] = (uint64_t) uc->uc_mcontext.gregs[REG_RDX]; + out->args[3] = (uint64_t) uc->uc_mcontext.gregs[REG_R10]; + out->args[4] = (uint64_t) uc->uc_mcontext.gregs[REG_R8]; + out->args[5] = (uint64_t) uc->uc_mcontext.gregs[REG_R9]; + return 0; +#elif defined(__aarch64__) + out->nr = (info->si_syscall != 0) ? 
info->si_syscall + : (int) uc->uc_mcontext.regs[8]; + out->instruction_pointer = (uint64_t) uc->uc_mcontext.pc; + out->args[0] = (uint64_t) uc->uc_mcontext.regs[0]; + out->args[1] = (uint64_t) uc->uc_mcontext.regs[1]; + out->args[2] = (uint64_t) uc->uc_mcontext.regs[2]; + out->args[3] = (uint64_t) uc->uc_mcontext.regs[3]; + out->args[4] = (uint64_t) uc->uc_mcontext.regs[4]; + out->args[5] = (uint64_t) uc->uc_mcontext.regs[5]; + return 0; +#else + (void) uc; + return -1; +#endif +} + +int kbox_syscall_request_from_sigsys(struct kbox_syscall_request *out, + pid_t pid, + const siginfo_t *info, + const void *ucontext_ptr, + const struct kbox_guest_mem *guest_mem) +{ + struct kbox_syscall_regs regs; + struct kbox_guest_mem current_guest_mem; + + if (kbox_syscall_regs_from_sigsys(info, ucontext_ptr, ®s) < 0) + return -1; + if (!guest_mem) { + current_guest_mem.ops = &kbox_current_guest_mem_ops; + current_guest_mem.opaque = 0; + guest_mem = ¤t_guest_mem; + } + return kbox_syscall_request_init_from_regs(out, KBOX_SYSCALL_SOURCE_TRAP, + pid, 0, ®s, guest_mem); +} + +int kbox_syscall_dispatch_sigsys(struct kbox_supervisor_ctx *ctx, + pid_t pid, + const siginfo_t *info, + void *ucontext_ptr) +{ + struct kbox_syscall_trap_runtime runtime; + + if (kbox_syscall_trap_runtime_init(&runtime, ctx, NULL) < 0) + return -1; + runtime.pid = pid; + return kbox_syscall_trap_handle(&runtime, info, ucontext_ptr); +} + +int kbox_syscall_trap_runtime_init(struct kbox_syscall_trap_runtime *runtime, + struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_trap_ops *ops) +{ + if (!runtime || !ctx) + return -1; + + memset(runtime, 0, sizeof(*runtime)); + runtime->ctx = ctx; + runtime->ops = ops ? 
ops : &direct_trap_ops;
    runtime->pid = getpid();
    runtime->wake_fd = -1;
    return 0;
}

/* Execute the request as a raw host syscall via libc syscall(); returns the
 * result or -errno on failure.
 */
static int64_t host_syscall_from_request(const struct kbox_syscall_request *req)
{
    long ret;

    if (!req)
        return -EINVAL;

    errno = 0;
    ret = syscall((long) req->nr, (unsigned long) req->args[0],
                  (unsigned long) req->args[1], (unsigned long) req->args[2],
                  (unsigned long) req->args[3], (unsigned long) req->args[4],
                  (unsigned long) req->args[5]);
    if (ret < 0)
        return -errno;
    return (int64_t) ret;
}

/* Syscalls that must run on the guest thread itself (process-image or
 * signal-state changing) rather than on the service thread. Unknown state
 * conservatively answers "yes".
 */
static int host_syscall_requires_guest_thread(
    const struct kbox_supervisor_ctx *ctx,
    const struct kbox_syscall_request *req)
{
    const struct kbox_host_nrs *h;
    int nr;

    if (!ctx || !ctx->host_nrs || !req)
        return 1;

    h = ctx->host_nrs;
    nr = req->nr;

    if (nr == h->execve || nr == h->execveat || nr == h->exit ||
        nr == h->exit_group || nr == h->rt_sigprocmask ||
        nr == h->rt_sigaltstack || nr == h->clone3 || nr == h->clone ||
        nr == h->fork || nr == h->vfork) {
        return 1;
    }

    return 0;
}

/* Attach a caller-owned wake fd (e.g. an eventfd — assumption, TODO confirm
 * with callers) used to rouse the service thread.
 */
void kbox_syscall_trap_runtime_set_wake_fd(
    struct kbox_syscall_trap_runtime *runtime,
    int wake_fd)
{
    if (!runtime)
        return;
    runtime->wake_fd = wake_fd;
    runtime->owns_wake_fd = 0;
}

/* Publish *req as the pending request and wake the service thread, either
 * through wake_fd or the has_pending_request futex. When service_running,
 * only trampoline-path syscalls are used (the caller is inside the SIGSYS
 * handler); note trap_host_write returns -errno while write() sets errno,
 * hence the two EAGAIN checks.
 */
int kbox_syscall_trap_runtime_capture(struct kbox_syscall_trap_runtime *runtime,
                                      const struct kbox_syscall_request *req)
{
    if (!runtime || !req)
        return -1;
    if (__atomic_load_n(&runtime->has_pending_request, __ATOMIC_ACQUIRE))
        return -1;

    runtime->pending_request = *req;
    __atomic_store_n(&runtime->has_pending_request, 1, __ATOMIC_RELEASE);

    if (runtime->wake_fd >= 0) {
        uint64_t wake_value = 1;
        ssize_t wr;

        if (runtime->service_running) {
            wr = trap_host_write(runtime->wake_fd, &wake_value,
                                 sizeof(wake_value));
        } else {
            wr = write(runtime->wake_fd, &wake_value, sizeof(wake_value));
        }
        if (wr >= 0)
            return 0;
        if (runtime->service_running) {
            if (wr == -EAGAIN || wr == -EWOULDBLOCK)
                return 0;
        } else {
            if (errno == EAGAIN || errno == EWOULDBLOCK)
                return 0;
        }
        return -1;
    }

    return runtime->service_running
               ? trap_host_futex_wake(&runtime->has_pending_request)
               : 0;
}

/* Consume the pending request into *out and clear the flag. */
int kbox_syscall_trap_runtime_take_pending(
    struct kbox_syscall_trap_runtime *runtime,
    struct kbox_syscall_request *out)
{
    if (!runtime || !out ||
        !__atomic_load_n(&runtime->has_pending_request, __ATOMIC_ACQUIRE))
        return -1;

    *out = runtime->pending_request;
    __atomic_store_n(&runtime->has_pending_request, 0, __ATOMIC_RELEASE);
    return 0;
}

/* Publish a dispatch result and wake the waiter blocked in
 * wait_for_pending_dispatch.
 */
int kbox_syscall_trap_runtime_complete(
    struct kbox_syscall_trap_runtime *runtime,
    const struct kbox_dispatch *dispatch)
{
    if (!runtime || !dispatch)
        return -1;

    runtime->pending_dispatch = *dispatch;
    __atomic_store_n(&runtime->has_pending_dispatch, 1, __ATOMIC_RELEASE);
    if (trap_futex_wake(&runtime->has_pending_dispatch) < 0)
        return -1;
    runtime->last_dispatch = *dispatch;
    runtime->has_last_dispatch = 1;
    return 0;
}

/* Consume the published dispatch result into *out and clear the flag. */
int kbox_syscall_trap_runtime_take_dispatch(
    struct kbox_syscall_trap_runtime *runtime,
    struct kbox_dispatch *out)
{
    if (!runtime || !out ||
        !__atomic_load_n(&runtime->has_pending_dispatch, __ATOMIC_ACQUIRE))
        return -1;

    *out = runtime->pending_dispatch;
    __atomic_store_n(&runtime->has_pending_dispatch, 0, __ATOMIC_RELEASE);
    return 0;
}

/* Round-trip a request through the globally registered runtime's service
 * thread; fails unless a runtime with a running service is active.
 */
int kbox_syscall_trap_active_dispatch(const struct kbox_syscall_request *req,
                                      struct kbox_dispatch *out)
{
    struct kbox_syscall_trap_runtime *runtime = load_active_trap_runtime();

    if (!runtime || !req || !out || !runtime->ctx || !runtime->service_running)
        return -1;
    if (kbox_syscall_trap_runtime_capture(runtime, req) < 0)
        return -1;
    if (wait_for_pending_dispatch(runtime) < 0)
        return -1;
    return kbox_syscall_trap_runtime_take_dispatch(runtime, out);
}

/* PID recorded in the active runtime, or -1 if none is registered. */
pid_t kbox_syscall_trap_active_pid(void)
{
    struct kbox_syscall_trap_runtime *runtime = load_active_trap_runtime();

    if (!runtime)
        return (pid_t) -1;
    return
runtime->pid;
}

/* Take the pending request, dispatch it, and publish the result. A CONTINUE
 * verdict for a syscall that does not need the guest thread is executed
 * right here on the service thread and converted to a value/errno dispatch.
 */
int kbox_syscall_trap_runtime_dispatch_pending(
    struct kbox_syscall_trap_runtime *runtime,
    struct kbox_dispatch *out)
{
    struct kbox_syscall_request req;
    struct kbox_dispatch dispatch;

    if (!runtime || !runtime->ctx)
        return -1;
    if (kbox_syscall_trap_runtime_take_pending(runtime, &req) < 0)
        return -1;

    dispatch = kbox_dispatch_request(runtime->ctx, &req);
    if (dispatch.kind == KBOX_DISPATCH_CONTINUE &&
        !host_syscall_requires_guest_thread(runtime->ctx, &req)) {
        int64_t ret = host_syscall_from_request(&req);

        if (ret < 0)
            dispatch = kbox_dispatch_errno((int) -ret);
        else
            dispatch = kbox_dispatch_value(ret);
    }
    if (kbox_syscall_trap_runtime_complete(runtime, &dispatch) < 0)
        return -1;
    if (out)
        *out = dispatch;
    return 0;
}

/* Service thread body: sleep on wake_fd (if set) or on the request futex,
 * then drain every pending request until told to stop.
 */
static void *trap_service_thread_main(void *opaque)
{
    struct kbox_syscall_trap_runtime *runtime = opaque;

    while (runtime &&
           !__atomic_load_n(&runtime->service_stop, __ATOMIC_ACQUIRE)) {
        if (runtime->wake_fd >= 0) {
            uint64_t wake_value = 0;
            ssize_t rd;

            rd = read(runtime->wake_fd, &wake_value, sizeof(wake_value));
            if (rd < 0) {
                if (errno == EINTR)
                    continue;
                break;
            }
            if (rd == 0)
                break;
        } else {
            /* Futex path: wait while neither stop nor a request is flagged.
             * EINTR/EAGAIN just re-check the flags.
             */
            while (!__atomic_load_n(&runtime->service_stop, __ATOMIC_ACQUIRE) &&
                   !__atomic_load_n(&runtime->has_pending_request,
                                    __ATOMIC_ACQUIRE)) {
                if (syscall(SYS_futex, &runtime->has_pending_request,
                            FUTEX_WAIT_PRIVATE, 0, NULL, NULL, 0) < 0 &&
                    errno != EINTR && errno != EAGAIN)
                    break;
            }
            if (__atomic_load_n(&runtime->service_stop, __ATOMIC_ACQUIRE))
                break;
        }

        while (
            __atomic_load_n(&runtime->has_pending_request, __ATOMIC_ACQUIRE)) {
            if (kbox_syscall_trap_runtime_dispatch_pending(runtime, NULL) < 0)
                break;
        }
    }

    return NULL;
}

/* Start the service thread; idempotent when already running. */
int kbox_syscall_trap_runtime_service_start(
    struct kbox_syscall_trap_runtime *runtime)
{
    if (!runtime)
        return -1;
    if (runtime->service_running)
        return 0;

    __atomic_store_n(&runtime->service_stop, 0, __ATOMIC_RELEASE);
    if (pthread_create(&runtime->service_thread, NULL, trap_service_thread_main,
                       runtime) != 0) {
        return -1;
    }

    runtime->service_running = 1;
    return 0;
}

/* Signal the service thread to stop, wake it, join it, and release an
 * owned wake_fd. Idempotent when not running.
 */
int kbox_syscall_trap_runtime_service_stop(
    struct kbox_syscall_trap_runtime *runtime)
{
    if (!runtime)
        return -1;
    if (!runtime->service_running)
        return 0;

    __atomic_store_n(&runtime->service_stop, 1, __ATOMIC_RELEASE);
    if (runtime->wake_fd >= 0) {
        uint64_t wake_value = 1;
        ssize_t wr = write(runtime->wake_fd, &wake_value, sizeof(wake_value));
        (void) wr;
    } else {
        (void) trap_futex_wake(&runtime->has_pending_request);
    }
    if (pthread_join(runtime->service_thread, NULL) != 0)
        return -1;

    runtime->service_running = 0;
    if (runtime->owns_wake_fd && runtime->wake_fd >= 0) {
        close(runtime->wake_fd);
        runtime->wake_fd = -1;
        runtime->owns_wake_fd = 0;
    }
    return 0;
}

/* Core SIGSYS handling: decode the trapped syscall, run it through the
 * runtime's execute op, record it, and write the result into the ucontext.
 */
int kbox_syscall_trap_handle(struct kbox_syscall_trap_runtime *runtime,
                             const siginfo_t *info,
                             void *ucontext_ptr)
{
    struct kbox_syscall_request req;
    struct kbox_dispatch dispatch;

    if (!runtime || !runtime->ctx || !runtime->ops || !runtime->ops->execute) {
        return -1;
    }
    if (kbox_syscall_request_from_sigsys(&req, runtime->pid, info, ucontext_ptr,
                                         NULL) < 0) {
        return -1;
    }

    runtime->last_request = req;
    runtime->has_last_request = 1;
    runtime->active_ucontext = ucontext_ptr;
    if (runtime->ops->execute(runtime, &req, &dispatch) < 0)
        return -1;
    runtime->last_dispatch = dispatch;
    runtime->has_last_dispatch = 1;
    /* The kernel clobbers RAX with -ENOSYS when delivering SIGSYS for
     * SECCOMP_RET_TRAP. Restore the original syscall number so that
     * host_syscall() (called for CONTINUE dispatches) reads the correct nr.
 */
#if defined(__x86_64__)
    ((ucontext_t *) ucontext_ptr)->uc_mcontext.gregs[REG_RAX] = (greg_t) req.nr;
#endif
    if (kbox_syscall_result_to_sigsys(ucontext_ptr, &dispatch) < 0)
        return -1;
    runtime->active_ucontext = NULL;
    return 0;
}

/* Execute a raw host syscall on behalf of the guest. Used for CONTINUE in
 * trap mode: the kernel already blocked the original syscall via RET_TRAP,
 * so we must issue it from the handler. The hook's own syscall instruction
 * is in the ALLOW IP range and will not trigger seccomp again.
 *
 * Process-image / control-flow syscalls (exit, exec, clone, fork) go through
 * their dedicated trampolines; everything else through the generic 6-arg one.
 */
static int64_t host_syscall(const ucontext_t *uc)
{
#if defined(__x86_64__)
    long nr = uc->uc_mcontext.gregs[REG_RAX];

    if (nr == __NR_exit || nr == __NR_exit_group)
        return kbox_syscall_trap_host_exit_group_now(
            (int) uc->uc_mcontext.gregs[REG_RDI]);
    if (nr == __NR_execve)
        return kbox_syscall_trap_host_execve_now(
            (const char *) (uintptr_t) uc->uc_mcontext.gregs[REG_RDI],
            (char *const *) (uintptr_t) uc->uc_mcontext.gregs[REG_RSI],
            (char *const *) (uintptr_t) uc->uc_mcontext.gregs[REG_RDX]);
    if (nr == __NR_execveat)
        return kbox_syscall_trap_host_execveat_now(
            (int) uc->uc_mcontext.gregs[REG_RDI],
            (const char *) (uintptr_t) uc->uc_mcontext.gregs[REG_RSI],
            (char *const *) (uintptr_t) uc->uc_mcontext.gregs[REG_RDX],
            (char *const *) (uintptr_t) uc->uc_mcontext.gregs[REG_R10],
            (int) uc->uc_mcontext.gregs[REG_R8]);
    if (nr == __NR_clone)
        return kbox_syscall_trap_host_clone_now(
            (uint64_t) uc->uc_mcontext.gregs[REG_RDI],
            (uint64_t) uc->uc_mcontext.gregs[REG_RSI],
            (uint64_t) uc->uc_mcontext.gregs[REG_RDX],
            (uint64_t) uc->uc_mcontext.gregs[REG_R10],
            (uint64_t) uc->uc_mcontext.gregs[REG_R8]);
    if (nr == __NR_clone3)
        return kbox_syscall_trap_host_clone3_now(
            (const void *) (uintptr_t) uc->uc_mcontext.gregs[REG_RDI],
            (size_t) uc->uc_mcontext.gregs[REG_RSI]);
    if (nr == __NR_fork)
        return kbox_syscall_trap_host_fork_now();
    if (nr == __NR_vfork)
        return kbox_syscall_trap_host_vfork_now();

    return kbox_syscall_trap_host_syscall6(
        nr, (uint64_t) uc->uc_mcontext.gregs[REG_RDI],
        (uint64_t) uc->uc_mcontext.gregs[REG_RSI],
        (uint64_t) uc->uc_mcontext.gregs[REG_RDX],
        (uint64_t) uc->uc_mcontext.gregs[REG_R10],
        (uint64_t) uc->uc_mcontext.gregs[REG_R8],
        (uint64_t) uc->uc_mcontext.gregs[REG_R9]);
#elif defined(__aarch64__)
    long nr = (long) uc->uc_mcontext.regs[8];

    if (nr == __NR_exit || nr == __NR_exit_group)
        return kbox_syscall_trap_host_exit_group_now(
            (int) uc->uc_mcontext.regs[0]);
    if (nr == __NR_execve)
        return kbox_syscall_trap_host_execve_now(
            (const char *) (uintptr_t) uc->uc_mcontext.regs[0],
            (char *const *) (uintptr_t) uc->uc_mcontext.regs[1],
            (char *const *) (uintptr_t) uc->uc_mcontext.regs[2]);
    if (nr == __NR_execveat)
        return kbox_syscall_trap_host_execveat_now(
            (int) uc->uc_mcontext.regs[0],
            (const char *) (uintptr_t) uc->uc_mcontext.regs[1],
            (char *const *) (uintptr_t) uc->uc_mcontext.regs[2],
            (char *const *) (uintptr_t) uc->uc_mcontext.regs[3],
            (int) uc->uc_mcontext.regs[4]);
    if (nr == __NR_clone)
        return kbox_syscall_trap_host_clone_now(
            (uint64_t) uc->uc_mcontext.regs[0],
            (uint64_t) uc->uc_mcontext.regs[1],
            (uint64_t) uc->uc_mcontext.regs[2],
            (uint64_t) uc->uc_mcontext.regs[3],
            (uint64_t) uc->uc_mcontext.regs[4]);
    if (nr == __NR_clone3)
        return kbox_syscall_trap_host_clone3_now(
            (const void *) (uintptr_t) uc->uc_mcontext.regs[0],
            (size_t) uc->uc_mcontext.regs[1]);

    return kbox_syscall_trap_host_syscall6(
        nr, (uint64_t) uc->uc_mcontext.regs[0],
        (uint64_t) uc->uc_mcontext.regs[1], (uint64_t) uc->uc_mcontext.regs[2],
        (uint64_t) uc->uc_mcontext.regs[3], (uint64_t) uc->uc_mcontext.regs[4],
        (uint64_t) uc->uc_mcontext.regs[5]);
#else
    (void) uc;
    return -ENOSYS;
#endif
}

/* Write a dispatch result into the interrupted context's return register. */
int kbox_syscall_result_to_sigsys(void *ucontext_ptr,
                                  const struct kbox_dispatch *dispatch)
{
    ucontext_t *uc = ucontext_ptr;
    int64_t ret;

    if (!uc ||
!dispatch)
        return -1;

    if (dispatch->kind == KBOX_DISPATCH_CONTINUE) {
        /* In trap mode, CONTINUE means "let the host kernel handle
         * this syscall." RET_TRAP blocks the original syscall, so
         * returning from the handler does NOT re-execute it. We must
         * issue the syscall ourselves from the handler and write the
         * host kernel's return value into the ucontext.
         */
        ret = host_syscall(uc);
    } else if (dispatch->error != 0) {
        ret = -(int64_t) dispatch->error;
    } else {
        ret = dispatch->val;
    }

#if defined(__x86_64__)
    uc->uc_mcontext.gregs[REG_RAX] = (greg_t) ret;
    return 0;
#elif defined(__aarch64__)
    uc->uc_mcontext.regs[0] = (uint64_t) ret;
    return 0;
#else
    return -1;
#endif
}

/* The SIGSYS handler must not have stack-protector instrumentation.
 * When the guest has set its own FS base via arch_prctl(SET_FS), the
 * signal handler is entered with FS pointing to guest TLS. The stack
 * canary lives at %fs:0x28; if FS points to guest TLS, the canary
 * value is wrong and the function aborts on return.
 *
 * By disabling the stack protector for this one function, we can
 * safely swap FS to kbox's TLS before calling any C dispatch code
 * (which does have canaries and will work correctly with kbox's FS).
 */
__attribute__((no_stack_protector)) static void
trap_sigsys_handler(int signo, siginfo_t *info, void *ucontext_ptr)
{
    struct kbox_syscall_trap_runtime *runtime = load_active_trap_runtime();

    /* No registered runtime (or wrong signal) is unrecoverable: the
     * original syscall was already blocked by RET_TRAP.
     */
    if (signo != SIGSYS || !runtime || !runtime->ctx) {
        trap_host_exit_group(127);
    }

#if defined(__x86_64__)
    /* Save guest FS base and restore kbox's FS base so the dispatcher
     * (which uses stack canaries, errno, etc.) runs with correct TLS.
     */
    runtime->guest_fs_base = read_host_fs_base();
    write_host_fs_base(runtime->host_fs_base);
#endif

    if (kbox_syscall_trap_handle(runtime, info, ucontext_ptr) < 0) {
#if defined(__x86_64__)
        write_host_fs_base(runtime->guest_fs_base);
#endif
        trap_host_exit_group(127);
    }

#if defined(__x86_64__)
    /* Restore guest FS base. If the dispatched syscall was
     * arch_prctl(SET_FS), guest_fs_base was updated by the
     * interceptor in seccomp-dispatch.c.
     */
    write_host_fs_base(runtime->guest_fs_base);
#endif
}

/* Install the SIGSYS handler, register the runtime globally, and start the
 * service thread; on any failure the partial setup is rolled back.
 */
int kbox_syscall_trap_runtime_install(struct kbox_syscall_trap_runtime *runtime,
                                      struct kbox_supervisor_ctx *ctx)
{
    struct sigaction sa;

    if (!runtime || !ctx)
        return -1;

    if (kbox_syscall_trap_runtime_init(runtime, ctx, NULL) < 0)
        return -1;
    memset(&sa, 0, sizeof(sa));
    sigemptyset(&sa.sa_mask);
    sa.sa_sigaction = trap_sigsys_handler;
    sa.sa_flags = SA_SIGINFO;

    store_active_trap_runtime(runtime);

#if defined(__x86_64__)
    /* Probe for FSGSBASE support before reading the FS base.
     * rdfsbase/wrfsbase are ~750x faster than the arch_prctl
     * trampoline (~2ns vs ~1.5us). Only probed once.
     */
    if (have_fsgsbase < 0)
        probe_fsgsbase();

    /* Save the host FS base before installing the handler. This is
     * kbox's own TLS pointer. The handler will restore it on entry
     * and swap to the guest's FS base on exit.
+ */ + runtime->host_fs_base = read_host_fs_base(); + runtime->guest_fs_base = runtime->host_fs_base; +#endif + + if (kbox_syscall_trap_runtime_service_start(runtime) < 0) { + store_active_trap_runtime(NULL); + return -1; + } + + if (sigaction(SIGSYS, &sa, &runtime->old_sigsys) < 0) { + (void) kbox_syscall_trap_runtime_service_stop(runtime); + store_active_trap_runtime(NULL); + return -1; + } + + runtime->installed = 1; + return 0; +} + +void kbox_syscall_trap_runtime_uninstall( + struct kbox_syscall_trap_runtime *runtime) +{ + if (!runtime || !runtime->installed) + return; + + (void) kbox_syscall_trap_runtime_service_stop(runtime); + sigaction(SIGSYS, &runtime->old_sigsys, NULL); + if (load_active_trap_runtime() == runtime) + store_active_trap_runtime(NULL); + runtime->installed = 0; +} + +uint64_t kbox_syscall_trap_get_guest_fs(void) +{ +#if defined(__x86_64__) + struct kbox_syscall_trap_runtime *runtime = load_active_trap_runtime(); + + if (runtime) + return runtime->guest_fs_base; +#endif + return 0; +} + +void kbox_syscall_trap_set_guest_fs(uint64_t val) +{ +#if defined(__x86_64__) + struct kbox_syscall_trap_runtime *runtime = load_active_trap_runtime(); + + if (runtime) + runtime->guest_fs_base = val; +#endif + (void) val; +} + +int kbox_syscall_trap_get_sigmask(void *out, size_t len) +{ + ucontext_t *uc; + struct kbox_syscall_trap_runtime *runtime = load_active_trap_runtime(); + + if (!runtime || !runtime->active_ucontext || !out) + return -1; + + uc = runtime->active_ucontext; + if (len > sizeof(uc->uc_sigmask)) + len = sizeof(uc->uc_sigmask); + memcpy(out, &uc->uc_sigmask, len); + return 0; +} + +int kbox_syscall_trap_set_sigmask(const void *mask, size_t len) +{ + ucontext_t *uc; + struct kbox_syscall_trap_runtime *runtime = load_active_trap_runtime(); + + if (!runtime || !runtime->active_ucontext || !mask) + return -1; + + uc = runtime->active_ucontext; + if (len > sizeof(uc->uc_sigmask)) + len = sizeof(uc->uc_sigmask); + memcpy(&uc->uc_sigmask, mask, 
len); + return 0; +} + +int kbox_syscall_trap_get_pending(void *out, size_t len) +{ + struct kbox_syscall_trap_runtime *runtime = load_active_trap_runtime(); + + if (!runtime || !out) + return -1; + + if (len > sizeof(runtime->emulated_pending)) + len = sizeof(runtime->emulated_pending); + memcpy(out, &runtime->emulated_pending, len); + return 0; +} + +int kbox_syscall_trap_set_pending(const void *mask, size_t len) +{ + struct kbox_syscall_trap_runtime *runtime = load_active_trap_runtime(); + + if (!runtime || !mask) + return -1; + + if (len > sizeof(runtime->emulated_pending)) + len = sizeof(runtime->emulated_pending); + memcpy(&runtime->emulated_pending, mask, len); + return 0; +} + +int kbox_syscall_trap_add_pending_signal(int signo) +{ + sigset_t next; + struct kbox_syscall_trap_runtime *runtime = load_active_trap_runtime(); + + if (!runtime || signo <= 0) + return -1; + + next = runtime->emulated_pending; + if (sigaddset(&next, signo) < 0) + return -1; + runtime->emulated_pending = next; + return 0; +} diff --git a/src/syscall-trap.h b/src/syscall-trap.h new file mode 100644 index 0000000..a22999a --- /dev/null +++ b/src/syscall-trap.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef KBOX_SYSCALL_TRAP_H +#define KBOX_SYSCALL_TRAP_H + +#include +#include +#include +#include + +#include "seccomp.h" +#include "syscall-trap-signal.h" + +struct kbox_syscall_trap_runtime; + +struct kbox_syscall_trap_ops { + int (*execute)(struct kbox_syscall_trap_runtime *runtime, + const struct kbox_syscall_request *req, + struct kbox_dispatch *out); +}; + +struct kbox_syscall_trap_runtime { + struct kbox_supervisor_ctx *ctx; + const struct kbox_syscall_trap_ops *ops; + struct sigaction old_sigsys; + struct kbox_syscall_request last_request; + struct kbox_dispatch last_dispatch; + struct kbox_syscall_request pending_request; + struct kbox_dispatch pending_dispatch; + pid_t pid; + int wake_fd; + int owns_wake_fd; + int service_stop; + int service_running; + pthread_t 
service_thread;
    int has_last_request;
    int has_last_dispatch;
    int has_pending_request;
    int has_pending_dispatch;
    void *active_ucontext;
    sigset_t emulated_pending;
    int installed;
#if defined(__x86_64__)
    uint64_t host_fs_base;
    uint64_t guest_fs_base;
#endif
};

int kbox_syscall_regs_from_sigsys(const siginfo_t *info,
                                  const void *ucontext_ptr,
                                  struct kbox_syscall_regs *out);
int kbox_syscall_request_from_sigsys(struct kbox_syscall_request *out,
                                     pid_t pid,
                                     const siginfo_t *info,
                                     const void *ucontext_ptr,
                                     const struct kbox_guest_mem *guest_mem);
int kbox_syscall_trap_runtime_init(struct kbox_syscall_trap_runtime *runtime,
                                   struct kbox_supervisor_ctx *ctx,
                                   const struct kbox_syscall_trap_ops *ops);
void kbox_syscall_trap_runtime_set_wake_fd(
    struct kbox_syscall_trap_runtime *runtime,
    int wake_fd);
int kbox_syscall_trap_runtime_capture(struct kbox_syscall_trap_runtime *runtime,
                                      const struct kbox_syscall_request *req);
int kbox_syscall_trap_runtime_take_pending(
    struct kbox_syscall_trap_runtime *runtime,
    struct kbox_syscall_request *out);
int kbox_syscall_trap_runtime_complete(
    struct kbox_syscall_trap_runtime *runtime,
    const struct kbox_dispatch *dispatch);
int kbox_syscall_trap_runtime_take_dispatch(
    struct kbox_syscall_trap_runtime *runtime,
    struct kbox_dispatch *out);
int kbox_syscall_trap_active_dispatch(const struct kbox_syscall_request *req,
                                      struct kbox_dispatch *out);
pid_t kbox_syscall_trap_active_pid(void);
int kbox_syscall_trap_runtime_dispatch_pending(
    struct kbox_syscall_trap_runtime *runtime,
    struct kbox_dispatch *out);
int kbox_syscall_trap_runtime_service_start(
    struct kbox_syscall_trap_runtime *runtime);
int kbox_syscall_trap_runtime_service_stop(
    struct kbox_syscall_trap_runtime *runtime);
int kbox_syscall_trap_handle(struct kbox_syscall_trap_runtime *runtime,
                             const siginfo_t *info,
                             void *ucontext_ptr);
int kbox_syscall_dispatch_sigsys(struct kbox_supervisor_ctx *ctx,
                                 pid_t pid,
                                 const siginfo_t *info,
                                 void *ucontext_ptr);
int kbox_syscall_result_to_sigsys(void *ucontext_ptr,
                                  const struct kbox_dispatch *dispatch);
int kbox_syscall_trap_runtime_install(struct kbox_syscall_trap_runtime *runtime,
                                      struct kbox_supervisor_ctx *ctx);
void kbox_syscall_trap_runtime_uninstall(
    struct kbox_syscall_trap_runtime *runtime);
uint64_t kbox_syscall_trap_get_guest_fs(void);
void kbox_syscall_trap_set_guest_fs(uint64_t val);
int kbox_syscall_trap_get_sigmask(void *out, size_t len);
int kbox_syscall_trap_set_sigmask(const void *mask, size_t len);
int kbox_syscall_trap_get_pending(void *out, size_t len);
int kbox_syscall_trap_set_pending(const void *mask, size_t len);
int kbox_syscall_trap_add_pending_signal(int signo);

#endif /* KBOX_SYSCALL_TRAP_H */
diff --git a/src/web-events.c b/src/web-events.c
index c089eba..c582422 100644
--- a/src/web-events.c
+++ b/src/web-events.c
@@ -127,37 +127,6 @@ void kbox_event_push_syscall(struct kbox_event_ring *ring,
 	ring_push_routine(ring, &e);
 }
-
-/* JSON string escaping. */
-
-/* Escape a string for safe JSON embedding.
- * Handles: " \ and control characters (< 0x20).
- * Returns bytes written (not including NUL).
- */
-static int json_escape(const char *src, char *dst, int dstsz)
-{
-	int pos = 0;
-	if (!src) {
-		if (dstsz > 0)
-			dst[0] = '\0';
-		return 0;
-	}
-	for (; *src && pos < dstsz - 6; src++) {
-		unsigned char c = (unsigned char) *src;
-		if (c == '"' || c == '\\') {
-			dst[pos++] = '\\';
-			dst[pos++] = (char) c;
-		} else if (c < 0x20) {
-			pos += snprintf(dst + pos, (size_t) (dstsz - pos), "\\u%04x", c);
-		} else {
-			dst[pos++] = (char) c;
-		}
-	}
-	if (pos < dstsz)
-		dst[pos] = '\0';
-	return pos;
-}
-
 /* Event JSON serialization.
*/ int kbox_event_to_json(const struct kbox_event *evt, char *buf, int bufsz) diff --git a/src/web-server.c b/src/web-server.c index 504073e..148e1da 100644 --- a/src/web-server.c +++ b/src/web-server.c @@ -329,7 +329,7 @@ static int handle_request(struct kbox_web_ctx *ctx, { char buf[WEB_RESP_BUF_SIZE]; - /* Static assets (/, /style.css, /js/*.js) */ + /* Static assets under / and /js/. */ if (strcmp(req->method, "GET") == 0 && strncmp(req->path, "/api/", 5) != 0 && strcmp(req->path, "/stats") != 0) { diff --git a/src/x86-decode.c b/src/x86-decode.c new file mode 100644 index 0000000..2757ec1 --- /dev/null +++ b/src/x86-decode.c @@ -0,0 +1,968 @@ +/* SPDX-License-Identifier: MIT */ + +/* Minimal x86-64 instruction length decoder. + * + * Enough to walk instruction boundaries in executable segments. Not a full + * disassembler; we only need the length of each instruction so the rewrite + * scanner can identify 0F 05 / 0F 34 at true instruction starts. + * + * Reference: Intel SDM Vol. 2, Chapter 2 "Instruction Format". + * Instruction layout: [prefixes] [REX] opcode [ModR/M [SIB]] [disp] [imm] + */ + +#include "x86-decode.h" + +/* Opcode map flags, packed into a single byte per opcode. */ +#define F_NONE 0x00 /* No operands (or implicit register-only) */ +#define F_MODRM 0x01 /* Has ModR/M byte */ +#define F_IMM8 0x02 /* 8-bit immediate */ +#define F_IMM32 0x04 /* 32-bit immediate (16-bit with 66h prefix) */ +#define F_IMM64 0x08 /* 64-bit immediate (MOV r64, imm64 with REX.W) */ +#define F_REL32 0x10 /* 32-bit relative displacement */ +#define F_REL8 0x20 /* 8-bit relative displacement */ +#define F_PREFIX 0x40 /* This is a prefix, not an opcode */ +#define F_BAD 0x80 /* Invalid / unknown in 64-bit mode */ + +/* One-byte opcode map (0x00..0xFF). + * + * This table encodes the operand structure of each primary opcode. + * Group opcodes (F6/F7, FF, etc.) 
share the same ModR/M flag since the + * ModR/M.reg field selects the sub-operation but doesn't change the + * instruction length (with the exception of F6/F7 TEST which adds an + * immediate; handled as a special case). + */ +static const unsigned char one_byte_map[256] = { + /* 00-07: ADD r/m, r / ADD r, r/m / ADD AL,imm8 / ADD eAX,imm32 / PUSH ES / + POP ES */ + [0x00] = F_MODRM, + [0x01] = F_MODRM, + [0x02] = F_MODRM, + [0x03] = F_MODRM, + [0x04] = F_IMM8, + [0x05] = F_IMM32, + [0x06] = F_BAD, + [0x07] = F_BAD, /* PUSH/POP ES invalid in 64-bit */ + + /* 08-0F: OR */ + [0x08] = F_MODRM, + [0x09] = F_MODRM, + [0x0A] = F_MODRM, + [0x0B] = F_MODRM, + [0x0C] = F_IMM8, + [0x0D] = F_IMM32, + [0x0E] = F_BAD, /* PUSH CS */ + [0x0F] = F_NONE, /* Two-byte escape; handled separately */ + + /* 10-17: ADC */ + [0x10] = F_MODRM, + [0x11] = F_MODRM, + [0x12] = F_MODRM, + [0x13] = F_MODRM, + [0x14] = F_IMM8, + [0x15] = F_IMM32, + [0x16] = F_BAD, + [0x17] = F_BAD, + + /* 18-1F: SBB */ + [0x18] = F_MODRM, + [0x19] = F_MODRM, + [0x1A] = F_MODRM, + [0x1B] = F_MODRM, + [0x1C] = F_IMM8, + [0x1D] = F_IMM32, + [0x1E] = F_BAD, + [0x1F] = F_BAD, + + /* 20-27: AND */ + [0x20] = F_MODRM, + [0x21] = F_MODRM, + [0x22] = F_MODRM, + [0x23] = F_MODRM, + [0x24] = F_IMM8, + [0x25] = F_IMM32, + [0x26] = F_PREFIX, /* ES override */ + [0x27] = F_BAD, /* DAA */ + + /* 28-2F: SUB */ + [0x28] = F_MODRM, + [0x29] = F_MODRM, + [0x2A] = F_MODRM, + [0x2B] = F_MODRM, + [0x2C] = F_IMM8, + [0x2D] = F_IMM32, + [0x2E] = F_PREFIX, /* CS override */ + [0x2F] = F_BAD, /* DAS */ + + /* 30-37: XOR */ + [0x30] = F_MODRM, + [0x31] = F_MODRM, + [0x32] = F_MODRM, + [0x33] = F_MODRM, + [0x34] = F_IMM8, + [0x35] = F_IMM32, + [0x36] = F_PREFIX, /* SS override */ + [0x37] = F_BAD, /* AAA */ + + /* 38-3F: CMP */ + [0x38] = F_MODRM, + [0x39] = F_MODRM, + [0x3A] = F_MODRM, + [0x3B] = F_MODRM, + [0x3C] = F_IMM8, + [0x3D] = F_IMM32, + [0x3E] = F_PREFIX, /* DS override */ + [0x3F] = F_BAD, /* AAS */ + + /* 40-4F: REX prefixes in 
64-bit mode */ + [0x40] = F_PREFIX, + [0x41] = F_PREFIX, + [0x42] = F_PREFIX, + [0x43] = F_PREFIX, + [0x44] = F_PREFIX, + [0x45] = F_PREFIX, + [0x46] = F_PREFIX, + [0x47] = F_PREFIX, + [0x48] = F_PREFIX, + [0x49] = F_PREFIX, + [0x4A] = F_PREFIX, + [0x4B] = F_PREFIX, + [0x4C] = F_PREFIX, + [0x4D] = F_PREFIX, + [0x4E] = F_PREFIX, + [0x4F] = F_PREFIX, + + /* 50-5F: PUSH/POP r64 */ + [0x50] = F_NONE, + [0x51] = F_NONE, + [0x52] = F_NONE, + [0x53] = F_NONE, + [0x54] = F_NONE, + [0x55] = F_NONE, + [0x56] = F_NONE, + [0x57] = F_NONE, + [0x58] = F_NONE, + [0x59] = F_NONE, + [0x5A] = F_NONE, + [0x5B] = F_NONE, + [0x5C] = F_NONE, + [0x5D] = F_NONE, + [0x5E] = F_NONE, + [0x5F] = F_NONE, + + /* 60-6F */ + [0x60] = F_BAD, + [0x61] = F_BAD, /* PUSHA/POPA invalid */ + [0x62] = F_BAD, /* BOUND invalid (EVEX prefix) */ + [0x63] = F_MODRM, /* MOVSXD */ + [0x64] = F_PREFIX, + [0x65] = F_PREFIX, /* FS/GS override */ + [0x66] = F_PREFIX, /* Operand-size override */ + [0x67] = F_PREFIX, /* Address-size override */ + [0x68] = F_IMM32, /* PUSH imm32 */ + [0x69] = F_MODRM | F_IMM32, /* IMUL r, r/m, imm32 */ + [0x6A] = F_IMM8, /* PUSH imm8 */ + [0x6B] = F_MODRM | F_IMM8, /* IMUL r, r/m, imm8 */ + [0x6C] = F_NONE, + [0x6D] = F_NONE, /* INS */ + [0x6E] = F_NONE, + [0x6F] = F_NONE, /* OUTS */ + + /* 70-7F: Jcc rel8 */ + [0x70] = F_REL8, + [0x71] = F_REL8, + [0x72] = F_REL8, + [0x73] = F_REL8, + [0x74] = F_REL8, + [0x75] = F_REL8, + [0x76] = F_REL8, + [0x77] = F_REL8, + [0x78] = F_REL8, + [0x79] = F_REL8, + [0x7A] = F_REL8, + [0x7B] = F_REL8, + [0x7C] = F_REL8, + [0x7D] = F_REL8, + [0x7E] = F_REL8, + [0x7F] = F_REL8, + + /* 80-83: Group 1 (ALU imm) */ + [0x80] = F_MODRM | F_IMM8, + [0x81] = F_MODRM | F_IMM32, + [0x82] = F_BAD, /* Invalid in 64-bit */ + [0x83] = F_MODRM | F_IMM8, + + /* 84-8F */ + [0x84] = F_MODRM, + [0x85] = F_MODRM, /* TEST */ + [0x86] = F_MODRM, + [0x87] = F_MODRM, /* XCHG */ + [0x88] = F_MODRM, + [0x89] = F_MODRM, /* MOV r/m, r */ + [0x8A] = F_MODRM, + [0x8B] = F_MODRM, /* 
MOV r, r/m */ + [0x8C] = F_MODRM, + [0x8D] = F_MODRM, /* MOV r/m, Sreg / LEA */ + [0x8E] = F_MODRM, /* MOV Sreg, r/m */ + [0x8F] = F_MODRM, /* POP r/m */ + + /* 90-97: XCHG rAX, r / NOP */ + [0x90] = F_NONE, + [0x91] = F_NONE, + [0x92] = F_NONE, + [0x93] = F_NONE, + [0x94] = F_NONE, + [0x95] = F_NONE, + [0x96] = F_NONE, + [0x97] = F_NONE, + + /* 98-9F */ + [0x98] = F_NONE, /* CBW/CWDE/CDQE */ + [0x99] = F_NONE, /* CWD/CDQ/CQO */ + [0x9A] = F_BAD, /* CALLF invalid */ + [0x9B] = F_NONE, /* FWAIT */ + [0x9C] = F_NONE, /* PUSHF */ + [0x9D] = F_NONE, /* POPF */ + [0x9E] = F_NONE, /* SAHF */ + [0x9F] = F_NONE, /* LAHF */ + + /* A0-AF */ + [0xA0] = F_NONE, /* MOV AL, moffs (special case: 8 or 4 byte addr) */ + [0xA1] = F_NONE, /* MOV rAX, moffs (special case) */ + [0xA2] = F_NONE, /* MOV moffs, AL (special case) */ + [0xA3] = F_NONE, /* MOV moffs, rAX (special case) */ + [0xA4] = F_NONE, + [0xA5] = F_NONE, /* MOVS */ + [0xA6] = F_NONE, + [0xA7] = F_NONE, /* CMPS */ + [0xA8] = F_IMM8, + [0xA9] = F_IMM32, /* TEST AL/rAX, imm */ + [0xAA] = F_NONE, + [0xAB] = F_NONE, /* STOS */ + [0xAC] = F_NONE, + [0xAD] = F_NONE, /* LODS */ + [0xAE] = F_NONE, + [0xAF] = F_NONE, /* SCAS */ + + /* B0-BF: MOV r8/r64, imm */ + [0xB0] = F_IMM8, + [0xB1] = F_IMM8, + [0xB2] = F_IMM8, + [0xB3] = F_IMM8, + [0xB4] = F_IMM8, + [0xB5] = F_IMM8, + [0xB6] = F_IMM8, + [0xB7] = F_IMM8, + [0xB8] = F_IMM32, + [0xB9] = F_IMM32, + [0xBA] = F_IMM32, + [0xBB] = F_IMM32, + [0xBC] = F_IMM32, + [0xBD] = F_IMM32, + [0xBE] = F_IMM32, + [0xBF] = F_IMM32, + + /* C0-CF */ + [0xC0] = F_MODRM | F_IMM8, /* Shift Group 2 imm8 */ + [0xC1] = F_MODRM | F_IMM8, + [0xC2] = F_NONE, /* RET imm16 (special case: 2-byte immediate) */ + [0xC3] = F_NONE, /* RET */ + [0xC4] = F_BAD, /* VEX 3-byte prefix; handled separately */ + [0xC5] = F_BAD, /* VEX 2-byte prefix; handled separately */ + [0xC6] = F_MODRM | F_IMM8, /* MOV r/m8, imm8 */ + [0xC7] = F_MODRM | F_IMM32, /* MOV r/m32, imm32 */ + [0xC8] = F_NONE, /* ENTER: iw, ib (3 bytes, 
special case) */ + [0xC9] = F_NONE, /* LEAVE */ + [0xCA] = F_NONE, /* RETF imm16 (special case) */ + [0xCB] = F_NONE, /* RETF */ + [0xCC] = F_NONE, /* INT 3 */ + [0xCD] = F_IMM8, /* INT imm8 */ + [0xCE] = F_BAD, /* INTO invalid */ + [0xCF] = F_NONE, /* IRET */ + + /* D0-DF */ + [0xD0] = F_MODRM, + [0xD1] = F_MODRM, /* Shift Group 2 (1/CL) */ + [0xD2] = F_MODRM, + [0xD3] = F_MODRM, + [0xD4] = F_BAD, + [0xD5] = F_BAD, /* AAM/AAD invalid */ + [0xD6] = F_BAD, /* SALC invalid */ + [0xD7] = F_NONE, /* XLAT */ + /* D8-DF: x87 FPU */ + [0xD8] = F_MODRM, + [0xD9] = F_MODRM, + [0xDA] = F_MODRM, + [0xDB] = F_MODRM, + [0xDC] = F_MODRM, + [0xDD] = F_MODRM, + [0xDE] = F_MODRM, + [0xDF] = F_MODRM, + + /* E0-EF */ + [0xE0] = F_REL8, + [0xE1] = F_REL8, + [0xE2] = F_REL8, /* LOOP/LOOPZ/LOOPNZ */ + [0xE3] = F_REL8, /* JRCXZ */ + [0xE4] = F_IMM8, + [0xE5] = F_IMM8, /* IN imm8 */ + [0xE6] = F_IMM8, + [0xE7] = F_IMM8, /* OUT imm8 */ + [0xE8] = F_REL32, /* CALL rel32 */ + [0xE9] = F_REL32, /* JMP rel32 */ + [0xEA] = F_BAD, /* JMPF invalid */ + [0xEB] = F_REL8, /* JMP rel8 */ + [0xEC] = F_NONE, + [0xED] = F_NONE, /* IN DX */ + [0xEE] = F_NONE, + [0xEF] = F_NONE, /* OUT DX */ + + /* F0-FF */ + [0xF0] = F_PREFIX, /* LOCK */ + [0xF1] = F_NONE, /* INT1/ICEBP */ + [0xF2] = F_PREFIX, /* REPNE */ + [0xF3] = F_PREFIX, /* REP/REPE */ + [0xF4] = F_NONE, /* HLT */ + [0xF5] = F_NONE, /* CMC */ + [0xF6] = F_MODRM, /* Group 3 byte: TEST variant adds imm8 (special case) */ + [0xF7] = F_MODRM, /* Group 3 word: TEST variant adds imm32 (special case) */ + [0xF8] = F_NONE, + [0xF9] = F_NONE, /* CLC/STC */ + [0xFA] = F_NONE, + [0xFB] = F_NONE, /* CLI/STI */ + [0xFC] = F_NONE, + [0xFD] = F_NONE, /* CLD/STD */ + [0xFE] = F_MODRM, /* Group 4 (INC/DEC r/m8) */ + [0xFF] = F_MODRM, /* Group 5 (INC/DEC/CALL/JMP/PUSH r/m) */ +}; + +/* Two-byte opcode map (0F xx). + * + * Flags indicate ModR/M and immediate requirements for each 0F-prefixed + * opcode. Most SSE/SSE2 instructions have ModR/M; a few have immediates. 
+ */ +static const unsigned char two_byte_map[256] = { + /* 0F 00-0F */ + [0x00] = F_MODRM, /* Group 6 (SLDT, STR, ...) */ + [0x01] = F_MODRM, /* Group 7 (SGDT, SIDT, ...) */ + [0x02] = F_MODRM, /* LAR */ + [0x03] = F_MODRM, /* LSL */ + [0x04] = F_BAD, + [0x05] = F_NONE, /* SYSCALL */ + [0x06] = F_NONE, /* CLTS */ + [0x07] = F_NONE, /* SYSRET */ + [0x08] = F_NONE, /* INVD */ + [0x09] = F_NONE, /* WBINVD */ + [0x0A] = F_BAD, + [0x0B] = F_NONE, /* UD2 */ + [0x0C] = F_BAD, + [0x0D] = F_MODRM, /* prefetchw */ + [0x0E] = F_NONE, /* FEMMS */ + [0x0F] = F_MODRM | F_IMM8, /* 3DNow! */ + + /* 0F 10-1F: SSE MOV* */ + [0x10] = F_MODRM, + [0x11] = F_MODRM, + [0x12] = F_MODRM, + [0x13] = F_MODRM, + [0x14] = F_MODRM, + [0x15] = F_MODRM, + [0x16] = F_MODRM, + [0x17] = F_MODRM, + [0x18] = F_MODRM, /* prefetch group */ + [0x19] = F_MODRM, /* NOP r/m (multi-byte NOP) */ + [0x1A] = F_MODRM, + [0x1B] = F_MODRM, + [0x1C] = F_MODRM, + [0x1D] = F_MODRM, + [0x1E] = F_MODRM, + [0x1F] = F_MODRM, /* NOP r/m (multi-byte NOP) */ + + /* 0F 20-2F */ + [0x20] = F_MODRM, + [0x21] = F_MODRM, /* MOV from/to CRn/DRn */ + [0x22] = F_MODRM, + [0x23] = F_MODRM, + [0x24] = F_BAD, + [0x25] = F_BAD, + [0x26] = F_BAD, + [0x27] = F_BAD, + [0x28] = F_MODRM, + [0x29] = F_MODRM, /* MOVAPS */ + [0x2A] = F_MODRM, + [0x2B] = F_MODRM, + [0x2C] = F_MODRM, + [0x2D] = F_MODRM, + [0x2E] = F_MODRM, + [0x2F] = F_MODRM, + + /* 0F 30-3F */ + [0x30] = F_NONE, /* WRMSR */ + [0x31] = F_NONE, /* RDTSC */ + [0x32] = F_NONE, /* RDMSR */ + [0x33] = F_NONE, /* RDPMC */ + [0x34] = F_NONE, /* SYSENTER */ + [0x35] = F_NONE, /* SYSEXIT */ + [0x36] = F_BAD, + [0x37] = F_NONE, /* GETSEC */ + [0x38] = F_BAD, /* 3-byte escape 0F 38; handled separately */ + [0x39] = F_BAD, + [0x3A] = F_BAD, /* 3-byte escape 0F 3A; handled separately */ + [0x3B] = F_BAD, + [0x3C] = F_BAD, + [0x3D] = F_BAD, + [0x3E] = F_BAD, + [0x3F] = F_BAD, + + /* 0F 40-4F: CMOVcc */ + [0x40] = F_MODRM, + [0x41] = F_MODRM, + [0x42] = F_MODRM, + [0x43] = F_MODRM, + [0x44] = 
F_MODRM, + [0x45] = F_MODRM, + [0x46] = F_MODRM, + [0x47] = F_MODRM, + [0x48] = F_MODRM, + [0x49] = F_MODRM, + [0x4A] = F_MODRM, + [0x4B] = F_MODRM, + [0x4C] = F_MODRM, + [0x4D] = F_MODRM, + [0x4E] = F_MODRM, + [0x4F] = F_MODRM, + + /* 0F 50-5F: SSE arithmetic */ + [0x50] = F_MODRM, + [0x51] = F_MODRM, + [0x52] = F_MODRM, + [0x53] = F_MODRM, + [0x54] = F_MODRM, + [0x55] = F_MODRM, + [0x56] = F_MODRM, + [0x57] = F_MODRM, + [0x58] = F_MODRM, + [0x59] = F_MODRM, + [0x5A] = F_MODRM, + [0x5B] = F_MODRM, + [0x5C] = F_MODRM, + [0x5D] = F_MODRM, + [0x5E] = F_MODRM, + [0x5F] = F_MODRM, + + /* 0F 60-6F: SSE pack/unpack */ + [0x60] = F_MODRM, + [0x61] = F_MODRM, + [0x62] = F_MODRM, + [0x63] = F_MODRM, + [0x64] = F_MODRM, + [0x65] = F_MODRM, + [0x66] = F_MODRM, + [0x67] = F_MODRM, + [0x68] = F_MODRM, + [0x69] = F_MODRM, + [0x6A] = F_MODRM, + [0x6B] = F_MODRM, + [0x6C] = F_MODRM, + [0x6D] = F_MODRM, + [0x6E] = F_MODRM, + [0x6F] = F_MODRM, + + /* 0F 70-7F */ + [0x70] = F_MODRM | F_IMM8, /* PSHUFD etc */ + [0x71] = F_MODRM | F_IMM8, /* Group 12 */ + [0x72] = F_MODRM | F_IMM8, /* Group 13 */ + [0x73] = F_MODRM | F_IMM8, /* Group 14 */ + [0x74] = F_MODRM, + [0x75] = F_MODRM, + [0x76] = F_MODRM, + [0x77] = F_NONE, /* EMMS */ + [0x78] = F_MODRM, + [0x79] = F_MODRM, + [0x7A] = F_BAD, + [0x7B] = F_BAD, + [0x7C] = F_MODRM, + [0x7D] = F_MODRM, + [0x7E] = F_MODRM, + [0x7F] = F_MODRM, + + /* 0F 80-8F: Jcc rel32 */ + [0x80] = F_REL32, + [0x81] = F_REL32, + [0x82] = F_REL32, + [0x83] = F_REL32, + [0x84] = F_REL32, + [0x85] = F_REL32, + [0x86] = F_REL32, + [0x87] = F_REL32, + [0x88] = F_REL32, + [0x89] = F_REL32, + [0x8A] = F_REL32, + [0x8B] = F_REL32, + [0x8C] = F_REL32, + [0x8D] = F_REL32, + [0x8E] = F_REL32, + [0x8F] = F_REL32, + + /* 0F 90-9F: SETcc */ + [0x90] = F_MODRM, + [0x91] = F_MODRM, + [0x92] = F_MODRM, + [0x93] = F_MODRM, + [0x94] = F_MODRM, + [0x95] = F_MODRM, + [0x96] = F_MODRM, + [0x97] = F_MODRM, + [0x98] = F_MODRM, + [0x99] = F_MODRM, + [0x9A] = F_MODRM, + [0x9B] = F_MODRM, 
+ [0x9C] = F_MODRM, + [0x9D] = F_MODRM, + [0x9E] = F_MODRM, + [0x9F] = F_MODRM, + + /* 0F A0-AF */ + [0xA0] = F_NONE, + [0xA1] = F_NONE, /* PUSH/POP FS */ + [0xA2] = F_NONE, /* CPUID */ + [0xA3] = F_MODRM, /* BT */ + [0xA4] = F_MODRM | F_IMM8, /* SHLD imm8 */ + [0xA5] = F_MODRM, /* SHLD CL */ + [0xA6] = F_BAD, + [0xA7] = F_BAD, + [0xA8] = F_NONE, + [0xA9] = F_NONE, /* PUSH/POP GS */ + [0xAA] = F_NONE, /* RSM */ + [0xAB] = F_MODRM, /* BTS */ + [0xAC] = F_MODRM | F_IMM8, /* SHRD imm8 */ + [0xAD] = F_MODRM, /* SHRD CL */ + [0xAE] = F_MODRM, /* Group 15 (FXSAVE, LFENCE, ...) */ + [0xAF] = F_MODRM, /* IMUL */ + + /* 0F B0-BF */ + [0xB0] = F_MODRM, + [0xB1] = F_MODRM, /* CMPXCHG */ + [0xB2] = F_MODRM, /* LSS */ + [0xB3] = F_MODRM, /* BTR */ + [0xB4] = F_MODRM, + [0xB5] = F_MODRM, /* LFS/LGS */ + [0xB6] = F_MODRM, + [0xB7] = F_MODRM, /* MOVZX */ + [0xB8] = F_MODRM, /* POPCNT */ + [0xB9] = F_MODRM, /* Group 10 (UD1) */ + [0xBA] = F_MODRM | F_IMM8, /* Group 8 (BT/BTS/BTR/BTC imm8) */ + [0xBB] = F_MODRM, /* BTC */ + [0xBC] = F_MODRM, + [0xBD] = F_MODRM, /* BSF/BSR, TZCNT/LZCNT */ + [0xBE] = F_MODRM, + [0xBF] = F_MODRM, /* MOVSX */ + + /* 0F C0-CF */ + [0xC0] = F_MODRM, + [0xC1] = F_MODRM, /* XADD */ + [0xC2] = F_MODRM | F_IMM8, /* CMPPS/CMPPD */ + [0xC3] = F_MODRM, /* MOVNTI */ + [0xC4] = F_MODRM | F_IMM8, /* PINSRW */ + [0xC5] = F_MODRM | F_IMM8, /* PEXTRW */ + [0xC6] = F_MODRM | F_IMM8, /* SHUFPS/SHUFPD */ + [0xC7] = F_MODRM, /* Group 9 (CMPXCHG8B/16B) */ + [0xC8] = F_NONE, + [0xC9] = F_NONE, + [0xCA] = F_NONE, + [0xCB] = F_NONE, + [0xCC] = F_NONE, + [0xCD] = F_NONE, + [0xCE] = F_NONE, + [0xCF] = F_NONE, + /* BSWAP r32/r64 */ + + /* 0F D0-DF: SSE2 */ + [0xD0] = F_MODRM, + [0xD1] = F_MODRM, + [0xD2] = F_MODRM, + [0xD3] = F_MODRM, + [0xD4] = F_MODRM, + [0xD5] = F_MODRM, + [0xD6] = F_MODRM, + [0xD7] = F_MODRM, + [0xD8] = F_MODRM, + [0xD9] = F_MODRM, + [0xDA] = F_MODRM, + [0xDB] = F_MODRM, + [0xDC] = F_MODRM, + [0xDD] = F_MODRM, + [0xDE] = F_MODRM, + [0xDF] = F_MODRM, + + /* 
0F E0-EF: SSE2 */ + [0xE0] = F_MODRM, + [0xE1] = F_MODRM, + [0xE2] = F_MODRM, + [0xE3] = F_MODRM, + [0xE4] = F_MODRM, + [0xE5] = F_MODRM, + [0xE6] = F_MODRM, + [0xE7] = F_MODRM, + [0xE8] = F_MODRM, + [0xE9] = F_MODRM, + [0xEA] = F_MODRM, + [0xEB] = F_MODRM, + [0xEC] = F_MODRM, + [0xED] = F_MODRM, + [0xEE] = F_MODRM, + [0xEF] = F_MODRM, + + /* 0F F0-FF: SSE2 */ + [0xF0] = F_MODRM, + [0xF1] = F_MODRM, + [0xF2] = F_MODRM, + [0xF3] = F_MODRM, + [0xF4] = F_MODRM, + [0xF5] = F_MODRM, + [0xF6] = F_MODRM, + [0xF7] = F_MODRM, + [0xF8] = F_MODRM, + [0xF9] = F_MODRM, + [0xFA] = F_MODRM, + [0xFB] = F_MODRM, + [0xFC] = F_MODRM, + [0xFD] = F_MODRM, + [0xFE] = F_MODRM, + [0xFF] = F_BAD, +}; + +/* Decode ModR/M + optional SIB + displacement. + * Returns the number of extra bytes consumed (ModR/M + SIB + displacement), + * or 0 on truncation. + */ +static int decode_modrm(const unsigned char *code, + size_t remaining, + int addr_size_32) +{ + unsigned char modrm; + int mod, rm; + int len = 1; /* ModR/M byte itself */ + + if (remaining < 1) + return 0; + + modrm = code[0]; + mod = (modrm >> 6) & 3; + rm = modrm & 7; + + if (mod == 3) + return 1; /* Register-direct, no displacement */ + + /* 32-bit addressing (default in 64-bit mode, or 67h prefix) */ + if (!addr_size_32) { + /* 64-bit addressing */ + if (mod == 0 && rm == 5) { + /* RIP-relative: 4-byte displacement */ + return remaining >= 5 ? 5 : 0; + } + if (rm == 4) { + /* SIB byte follows */ + if (remaining < 2) + return 0; + len = 2; /* ModR/M + SIB */ + unsigned char sib = code[1]; + int base = sib & 7; + if (mod == 0 && base == 5) { + /* disp32 with SIB, no base */ + return remaining >= 6 ? 6 : 0; + } + } + if (mod == 1) + return remaining >= (size_t) (len + 1) ? len + 1 : 0; /* disp8 */ + if (mod == 2) + return remaining >= (size_t) (len + 4) ? len + 4 : 0; /* disp32 */ + return remaining >= (size_t) len ? len : 0; + } + + /* 32-bit addressing mode (with 67h prefix) */ + if (mod == 0 && rm == 5) { + return remaining >= 5 ? 
5 : 0; /* disp32 */ + } + if (rm == 4) { + if (remaining < 2) + return 0; + len = 2; + unsigned char sib = code[1]; + int base = sib & 7; + if (mod == 0 && base == 5) + return remaining >= 6 ? 6 : 0; + } + if (mod == 1) + return remaining >= (size_t) (len + 1) ? len + 1 : 0; + if (mod == 2) + return remaining >= (size_t) (len + 4) ? len + 4 : 0; + return remaining >= (size_t) len ? len : 0; +} + +int kbox_x86_insn_length(const unsigned char *code, size_t max_len) +{ + size_t pos = 0; + int has_66 = 0; + int has_67 = 0; + int has_rex_w = 0; + unsigned char rex = 0; + unsigned char opcode; + unsigned char flags; + int len; + + if (!code || max_len == 0) + return 0; + + /* 1. Consume legacy prefixes and REX prefix. */ + for (;;) { + if (pos >= max_len) + return 0; + opcode = code[pos]; + + /* Legacy prefixes. A legacy prefix after REX invalidates the REX. */ + if (opcode == 0x26 || opcode == 0x2E || opcode == 0x36 || + opcode == 0x3E || opcode == 0x64 || opcode == 0x65 || + opcode == 0xF0 || opcode == 0xF2 || opcode == 0xF3) { + rex = 0; + has_rex_w = 0; + pos++; + continue; + } + if (opcode == 0x66) { + rex = 0; + has_rex_w = 0; + has_66 = 1; + pos++; + continue; + } + if (opcode == 0x67) { + rex = 0; + has_rex_w = 0; + has_67 = 1; + pos++; + continue; + } + + /* REX prefix: 0x40..0x4F. Keep scanning; a following legacy + * prefix invalidates the REX (cleared above on next iteration). + */ + if (opcode >= 0x40 && opcode <= 0x4F) { + rex = opcode; + has_rex_w = (opcode & 0x08) != 0; + pos++; + continue; + } + + break; /* Not a prefix */ + } + + if (pos >= max_len) + return 0; + opcode = code[pos]; + + /* 2. Handle VEX prefixes (C4h = 3-byte VEX, C5h = 2-byte VEX). + * VEX-encoded instructions always have ModR/M. We decode the VEX + * payload length conservatively. + */ + if (opcode == 0xC5 && !rex) { + /* 2-byte VEX: C5 [vvvv] opcode [modrm...] 
*/ + if (pos + 2 >= max_len) + return 0; + pos += 2; /* C5 + VEX byte */ + opcode = code[pos]; + pos++; + flags = two_byte_map[opcode]; + if (flags & F_BAD) + return 0; + len = (int) pos; + if (flags & F_MODRM) { + int modrm_len = decode_modrm(code + pos, max_len - pos, has_67); + if (modrm_len == 0) + return 0; + len += modrm_len; + } + if (flags & F_IMM8) + len += 1; + if (flags & F_IMM32) + len += has_66 ? 2 : 4; + if (flags & F_REL8) + len += 1; + if (flags & F_REL32) + len += 4; + return (size_t) len <= max_len ? len : 0; + } + + if (opcode == 0xC4 && !rex) { + /* 3-byte VEX: C4 [byte1] [byte2] opcode [modrm...] */ + if (pos + 3 >= max_len) + return 0; + unsigned char map_select = code[pos + 1] & 0x1F; + pos += 3; /* C4 + 2 VEX bytes */ + opcode = code[pos]; + pos++; + + /* map_select 1 = 0F, 2 = 0F38, 3 = 0F3A */ + if (map_select == 1) + flags = two_byte_map[opcode]; + else if (map_select == 2) + flags = F_MODRM; /* 0F 38 xx: all have ModR/M, no imm (mostly) */ + else if (map_select == 3) + flags = F_MODRM | F_IMM8; /* 0F 3A xx: all have ModR/M + imm8 */ + else + return 0; + + if (flags & F_BAD) + return 0; + len = (int) pos; + if (flags & F_MODRM) { + int modrm_len = decode_modrm(code + pos, max_len - pos, has_67); + if (modrm_len == 0) + return 0; + len += modrm_len; + } + if (flags & F_IMM8) + len += 1; + if (flags & F_IMM32) + len += has_66 ? 2 : 4; + if (flags & F_REL8) + len += 1; + if (flags & F_REL32) + len += 4; + return (size_t) len <= max_len ? len : 0; + } + + /* 3. Two-byte opcode escape (0F xx). 
*/ + if (opcode == 0x0F) { + pos++; + if (pos >= max_len) + return 0; + unsigned char op2 = code[pos]; + pos++; + + /* 3-byte escape: 0F 38 xx and 0F 3A xx */ + if (op2 == 0x38) { + if (pos >= max_len) + return 0; + pos++; /* consume the third opcode byte */ + /* 0F 38 xx: all have ModR/M, no immediate (SSE3/SSSE3/SSE4.1) */ + len = (int) pos; + int modrm_len = decode_modrm(code + pos, max_len - pos, has_67); + if (modrm_len == 0) + return 0; + len += modrm_len; + return (size_t) len <= max_len ? len : 0; + } + if (op2 == 0x3A) { + if (pos >= max_len) + return 0; + pos++; /* consume the third opcode byte */ + /* 0F 3A xx: all have ModR/M + imm8 (SSE4.1 PBLENDW etc) */ + len = (int) pos; + int modrm_len = decode_modrm(code + pos, max_len - pos, has_67); + if (modrm_len == 0) + return 0; + len += modrm_len; + len += 1; /* imm8 */ + return (size_t) len <= max_len ? len : 0; + } + + flags = two_byte_map[op2]; + if (flags & F_BAD) + return 0; + + len = (int) pos; + if (flags & F_MODRM) { + int modrm_len = decode_modrm(code + pos, max_len - pos, has_67); + if (modrm_len == 0) + return 0; + len += modrm_len; + } + if (flags & F_IMM8) + len += 1; + if (flags & F_IMM32) + len += has_66 ? 2 : 4; + if (flags & F_REL8) + len += 1; + if (flags & F_REL32) + len += 4; + return (size_t) len <= max_len ? len : 0; + } + + /* 4. One-byte opcode. */ + pos++; + flags = one_byte_map[opcode]; + + if (flags & F_PREFIX) { + /* Stray prefix without a following opcode; treat as 1-byte insn + * (happens at end of padding regions). */ + return (int) pos; + } + if (flags & F_BAD) + return 0; + + /* Special cases that the flag table can't express cleanly. */ + + /* C8h ENTER: iw (2 bytes) + ib (1 byte) = 3 extra bytes */ + if (opcode == 0xC8) { + len = (int) pos + 3; + return (size_t) len <= max_len ? len : 0; + } + + /* C2h / CAh RET/RETF imm16: exactly 2-byte immediate. */ + if (opcode == 0xC2 || opcode == 0xCA) { + len = (int) pos + 2; + return (size_t) len <= max_len ? 
len : 0; + } + + /* A0-A3: MOV moffs. In 64-bit mode, the address is 8 bytes by default, + * 4 bytes with 67h address-size override. */ + if (opcode >= 0xA0 && opcode <= 0xA3) { + len = (int) pos + (has_67 ? 4 : 8); + return (size_t) len <= max_len ? len : 0; + } + + /* B8-BF with REX.W: MOV r64, imm64 (10 bytes total). */ + if (opcode >= 0xB8 && opcode <= 0xBF && has_rex_w) { + len = (int) pos + 8; + return (size_t) len <= max_len ? len : 0; + } + + /* F6/F7 Group 3: TEST variant (reg field 0 or 1) has an immediate. */ + if (opcode == 0xF6 || opcode == 0xF7) { + if (pos >= max_len) + return 0; + int modrm_len = decode_modrm(code + pos, max_len - pos, has_67); + if (modrm_len == 0) + return 0; + len = (int) pos + modrm_len; + int reg_field = (code[pos] >> 3) & 7; + if (reg_field == 0 || reg_field == 1) { + /* TEST: has immediate */ + if (opcode == 0xF6) + len += 1; /* imm8 */ + else + len += has_66 ? 2 : 4; /* imm16/imm32 */ + } + return (size_t) len <= max_len ? len : 0; + } + + len = (int) pos; + if (flags & F_MODRM) { + int modrm_len = decode_modrm(code + pos, max_len - pos, has_67); + if (modrm_len == 0) + return 0; + len += modrm_len; + } + if (flags & F_IMM8) + len += 1; + if (flags & F_IMM32) + len += has_66 ? 2 : 4; + if (flags & F_IMM64) + len += 8; + if (flags & F_REL8) + len += 1; + if (flags & F_REL32) + len += 4; + + return (size_t) len <= max_len ? len : 0; +} diff --git a/src/x86-decode.h b/src/x86-decode.h new file mode 100644 index 0000000..29ea707 --- /dev/null +++ b/src/x86-decode.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef KBOX_X86_DECODE_H +#define KBOX_X86_DECODE_H + +/* Minimal x86-64 instruction length decoder. + * + * Walks instruction boundaries in executable segments so the rewrite scanner + * only matches 0F 05 (syscall) / 0F 34 (sysenter) at true instruction starts, + * not as embedded bytes inside longer encodings (e.g. immediates, SIB bytes). + * + * This is NOT a full disassembler. 
It decodes only the length of each
+ * instruction, which is sufficient for boundary-aware scanning. The decoder
+ * handles: legacy prefixes, REX, VEX (2/3-byte), ModR/M, SIB,
+ * displacement, and immediate fields (REX2/EVEX encodings are rejected).
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Decode the length of the x86-64 instruction at @code[0..max_len-1].
+ * Returns the instruction length in bytes (>= 1; redundant prefixes are
+ * not capped at the architectural 15-byte limit), or 0 on failure.
+ *
+ * The decoder is conservative: when encountering truly unknown opcodes it
+ * returns 0 so the caller can skip one byte and resync (safe: the rewrite
+ * scanner treats unknown regions as non-syscall).
+ */
+int kbox_x86_insn_length(const unsigned char *code, size_t max_len);
+
+#endif /* KBOX_X86_DECODE_H */
diff --git a/tests/guest/bench-test.c b/tests/guest/bench-test.c
new file mode 100644
index 0000000..ea775b2
--- /dev/null
+++ b/tests/guest/bench-test.c
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: MIT */
+/* Guest benchmark: measure per-syscall latency for core operations.
+ *
+ * Runs each syscall in a tight loop (default 10000 iterations) and reports
+ * average wall-clock time in microseconds. Designed to run inside kbox
+ * against an ext4 rootfs, and on bare metal for comparison.
+ * + * Usage: bench_test [iterations] + */ +#include +#include +#include +#include +#include +#include +#include + +#define DEFAULT_ITERS 10000 + +static long elapsed_ns(struct timespec *start, struct timespec *end) +{ + return (end->tv_sec - start->tv_sec) * 1000000000L + + (end->tv_nsec - start->tv_nsec); +} + +static double bench_stat(int iters) +{ + struct timespec t0, t1; + struct stat st; + + if (stat("/etc/hostname", &st) < 0) + return -1.0; + + clock_gettime(CLOCK_MONOTONIC, &t0); + for (int i = 0; i < iters; i++) + stat("/etc/hostname", &st); + clock_gettime(CLOCK_MONOTONIC, &t1); + + return (double) elapsed_ns(&t0, &t1) / iters / 1000.0; +} + +static double bench_open_close(int iters) +{ + struct timespec t0, t1; + + int probe = open("/etc/hostname", O_RDONLY); + if (probe < 0) + return -1.0; + close(probe); + + clock_gettime(CLOCK_MONOTONIC, &t0); + for (int i = 0; i < iters; i++) { + int fd = open("/etc/hostname", O_RDONLY); + if (fd >= 0) + close(fd); + } + clock_gettime(CLOCK_MONOTONIC, &t1); + + return (double) elapsed_ns(&t0, &t1) / iters / 1000.0; +} + +static double bench_lseek_read(int iters) +{ + struct timespec t0, t1; + char buf[64]; + + int fd = open("/etc/hostname", O_RDONLY); + if (fd < 0) + return -1.0; + + volatile ssize_t sink; + + clock_gettime(CLOCK_MONOTONIC, &t0); + for (int i = 0; i < iters; i++) { + lseek(fd, 0, SEEK_SET); + sink = read(fd, buf, sizeof(buf)); + } + clock_gettime(CLOCK_MONOTONIC, &t1); + + (void) sink; + close(fd); + return (double) elapsed_ns(&t0, &t1) / iters / 1000.0; +} + +static double bench_write(int iters) +{ + struct timespec t0, t1; + char buf[64]; + + memset(buf, 'x', sizeof(buf)); + + int fd = open("/tmp/bench_write", O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd < 0) + return -1.0; + + volatile ssize_t sink; + + clock_gettime(CLOCK_MONOTONIC, &t0); + for (int i = 0; i < iters; i++) { + lseek(fd, 0, SEEK_SET); + sink = write(fd, buf, sizeof(buf)); + } + clock_gettime(CLOCK_MONOTONIC, &t1); + + (void) 
sink; + close(fd); + unlink("/tmp/bench_write"); + return (double) elapsed_ns(&t0, &t1) / iters / 1000.0; +} + +static double bench_getpid(int iters) +{ + struct timespec t0, t1; + + clock_gettime(CLOCK_MONOTONIC, &t0); + for (int i = 0; i < iters; i++) + getpid(); + clock_gettime(CLOCK_MONOTONIC, &t1); + + return (double) elapsed_ns(&t0, &t1) / iters / 1000.0; +} + +int main(int argc, char *argv[]) +{ + int iters = DEFAULT_ITERS; + + if (argc > 1) + iters = atoi(argv[1]); + if (iters <= 0) + iters = DEFAULT_ITERS; + + printf("syscall microbenchmark (%d iterations)\n", iters); + printf("%-16s %10s\n", "syscall", "us/call"); + printf("%-16s %10s\n", "-------", "-------"); + + struct { + const char *name; + double (*fn)(int); + } benches[] = { + {"stat", bench_stat}, + {"open+close", bench_open_close}, + {"lseek+read", bench_lseek_read}, + {"write", bench_write}, + {"getpid", bench_getpid}, + }; + for (size_t i = 0; i < sizeof(benches) / sizeof(benches[0]); i++) { + double us = benches[i].fn(iters); + if (us < 0) + printf("%-16s %10s\n", benches[i].name, "SKIP"); + else + printf("%-16s %10.1f\n", benches[i].name, us); + } + + printf("PASS: bench_test\n"); + return 0; +} diff --git a/tests/guest/trap-bench.S b/tests/guest/trap-bench.S new file mode 100644 index 0000000..b9b2995 --- /dev/null +++ b/tests/guest/trap-bench.S @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: MIT */ +/* Minimal getpid benchmark -- no libc, no TLS. + * Prints: "NNNNns\n" (total ns for 10000 getpid calls). + * Divide by 10000 to get ns/call. 
+ */ +.globl _start +_start: + /* clock_gettime(CLOCK_MONOTONIC, &t0) */ + sub $16, %rsp + mov $228, %rax + mov $1, %rdi + mov %rsp, %rsi + syscall + mov (%rsp), %r12 /* t0.sec */ + mov 8(%rsp), %r13 /* t0.nsec */ + + /* 10000 x getpid -- use %rbx as counter (not clobbered by syscall) */ + mov $10000, %ebx +.loop: + mov $39, %eax + syscall + dec %ebx + jnz .loop + + /* clock_gettime(CLOCK_MONOTONIC, &t1) */ + mov $228, %rax + mov $1, %rdi + mov %rsp, %rsi + syscall + mov (%rsp), %r14 /* t1.sec */ + mov 8(%rsp), %r15 /* t1.nsec */ + add $16, %rsp + + /* elapsed = (t1.sec - t0.sec)*1e9 + (t1.nsec - t0.nsec) */ + sub %r12, %r14 + imul $1000000000, %r14, %r14 + sub %r13, %r15 + add %r15, %r14 /* r14 = total_ns */ + + /* Format r14 as decimal into stack buffer, then write */ + sub $32, %rsp + lea 30(%rsp), %rdi /* point to end */ + movb $'\n', (%rdi) + dec %rdi + /* Append "ns" */ + movb $'s', (%rdi) + dec %rdi + movb $'n', (%rdi) + dec %rdi + + mov %r14, %rax + mov $10, %r8 +.fmt: + xor %rdx, %rdx + div %r8 + add $'0', %dl + movb %dl, (%rdi) + dec %rdi + test %rax, %rax + jnz .fmt + + /* rdi points one before first digit */ + inc %rdi + /* length = 31 - (rdi - rsp) */ + lea 31(%rsp), %rdx + sub %rdi, %rdx /* len */ + + /* write(1, rdi, len) */ + mov %rdi, %rsi + mov $1, %rdi + mov $1, %rax + syscall + + add $32, %rsp + + /* exit(0) */ + mov $60, %rax + xor %rdi, %rdi + syscall diff --git a/tests/unit/test-elf.c b/tests/unit/test-elf.c index ab5d755..03c6f9d 100644 --- a/tests/unit/test-elf.c +++ b/tests/unit/test-elf.c @@ -1,9 +1,90 @@ /* SPDX-License-Identifier: MIT */ +#include +#include #include +#include +#include #include "kbox/elf.h" #include "test-runner.h" +#define ET_EXEC 2 +#define ET_DYN 3 +#define PT_LOAD 1 +#define PT_INTERP 3 +#define PT_PHDR 6 +#define PT_GNU_STACK 0x6474e551u +#define PF_X 0x1 +#define PF_W 0x2 +#define PF_R 0x4 + +static void set_le16(unsigned char *p, uint16_t v) +{ + p[0] = (unsigned char) (v & 0xff); + p[1] = (unsigned char) ((v >> 
8) & 0xff); +} + +static void set_le32(unsigned char *p, uint32_t v) +{ + p[0] = (unsigned char) (v & 0xff); + p[1] = (unsigned char) ((v >> 8) & 0xff); + p[2] = (unsigned char) ((v >> 16) & 0xff); + p[3] = (unsigned char) ((v >> 24) & 0xff); +} + +static void set_le64(unsigned char *p, uint64_t v) +{ + for (int i = 0; i < 8; i++) + p[i] = (unsigned char) ((v >> (i * 8)) & 0xff); +} + +static void init_elf64(unsigned char *buf, + size_t buf_size, + uint16_t type, + uint16_t machine, + uint64_t entry, + uint64_t phoff, + uint16_t phnum) +{ + memset(buf, 0, buf_size); + buf[0] = 0x7f; + buf[1] = 'E'; + buf[2] = 'L'; + buf[3] = 'F'; + buf[4] = 2; + buf[5] = 1; + buf[6] = 1; + set_le16(buf + 16, type); + set_le16(buf + 18, machine); + set_le32(buf + 20, 1); + set_le64(buf + 24, entry); + set_le64(buf + 32, phoff); + set_le16(buf + 52, 64); + set_le16(buf + 54, 56); + set_le16(buf + 56, phnum); +} + +static void set_phdr(unsigned char *buf, + size_t index, + uint32_t type, + uint32_t flags, + uint64_t offset, + uint64_t vaddr, + uint64_t filesz, + uint64_t memsz, + uint64_t align) +{ + unsigned char *ph = buf + 64 + index * 56; + + set_le32(ph + 0, type); + set_le32(ph + 4, flags); + set_le64(ph + 8, offset); + set_le64(ph + 16, vaddr); + set_le64(ph + 32, filesz); + set_le64(ph + 40, memsz); + set_le64(ph + 48, align); +} + /* Minimal 64-bit little-endian ELF with one PT_INTERP program header. 
*/ static const unsigned char elf_with_interp[] = { /* ELF header (64 bytes) */ @@ -356,6 +437,145 @@ static void test_elf_find_interp_loc_static(void) ASSERT_EQ(filesz, 0); } +static void test_elf_read_header_window_fd(void) +{ + char path[128]; + unsigned char *buf = NULL; + size_t buf_len = 0; + int fd = test_mkstemp(path, sizeof(path), "kbox-elf-unit"); + + ASSERT_TRUE(fd >= 0); + unlink(path); + ASSERT_EQ(write(fd, elf_with_interp, sizeof(elf_with_interp)), + (long) sizeof(elf_with_interp)); + ASSERT_EQ(kbox_read_elf_header_window_fd(fd, &buf, &buf_len), 0); + ASSERT_EQ(buf_len, sizeof(elf_with_interp)); + ASSERT_EQ(memcmp(buf, elf_with_interp, buf_len), 0); + munmap(buf, buf_len); + close(fd); +} + +static void test_elf_read_header_window_fd_large_phoff(void) +{ + unsigned char elf[5000]; + char path[128]; + unsigned char *buf = NULL; + size_t buf_len = 0; + int fd; + + memset(elf, 0, sizeof(elf)); + memcpy(elf, elf_with_interp, 64); + /* Move phoff to 4096 and keep one PT_INTERP entry there. 
*/ + elf[32] = 0x00; + elf[33] = 0x10; + elf[34] = 0x00; + elf[35] = 0x00; + elf[36] = 0x00; + elf[37] = 0x00; + elf[38] = 0x00; + elf[39] = 0x00; + memcpy(elf + 4096, elf_with_interp + 64, 56); + fd = test_mkstemp(path, sizeof(path), "kbox-elf-unit"); + + ASSERT_TRUE(fd >= 0); + unlink(path); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (long) sizeof(elf)); + ASSERT_EQ(kbox_read_elf_header_window_fd(fd, &buf, &buf_len), 0); + ASSERT_EQ(buf_len, 4152); + ASSERT_EQ(memcmp(buf, elf, buf_len), 0); + munmap(buf, buf_len); + close(fd); +} + +static void test_elf_build_load_plan_exec(void) +{ + unsigned char elf[1024]; + struct kbox_elf_load_plan plan; + + init_elf64(elf, sizeof(elf), ET_EXEC, 0x3e, 0x401020, 64, 3); + set_phdr(elf, 0, PT_LOAD, PF_R | PF_X, 0, 0x400000, 0x200, 0x300, 0x1000); + set_phdr(elf, 1, PT_LOAD, PF_R | PF_W, 0x200, 0x401000, 0x80, 0x200, + 0x1000); + set_phdr(elf, 2, PT_INTERP, 0, 0x180, 0, 8, 8, 1); + memcpy(elf + 0x180, "/ld.so\0", 8); + + ASSERT_EQ(kbox_build_elf_load_plan(elf, sizeof(elf), 0x1000, &plan), 0); + ASSERT_EQ(plan.machine, 0x3e); + ASSERT_EQ(plan.type, ET_EXEC); + ASSERT_EQ(plan.entry, 0x401020); + ASSERT_EQ(plan.segment_count, 2); + ASSERT_EQ(plan.pie, 0); + ASSERT_EQ(plan.has_interp, 1); + ASSERT_EQ(plan.interp_offset, 0x180); + ASSERT_EQ(plan.interp_size, 8); + ASSERT_EQ(plan.phdr_vaddr, 0x400040); + ASSERT_EQ(plan.min_vaddr, 0x400000); + ASSERT_EQ(plan.max_vaddr, 0x402000); + ASSERT_EQ(plan.load_size, 0x2000); + ASSERT_EQ(plan.segments[0].map_start, 0x400000); + ASSERT_EQ(plan.segments[0].map_offset, 0); + ASSERT_EQ(plan.segments[0].map_size, 0x1000); + ASSERT_EQ(plan.segments[1].map_start, 0x401000); + ASSERT_EQ(plan.segments[1].map_offset, 0); + ASSERT_EQ(plan.segments[1].map_size, 0x1000); +} + +static void test_elf_build_load_plan_pie_with_phdr_and_stack(void) +{ + unsigned char elf[1024]; + struct kbox_elf_load_plan plan; + + init_elf64(elf, sizeof(elf), ET_DYN, 0xb0, 0x120, 64, 4); + set_phdr(elf, 0, PT_PHDR, PF_R, 64, 0x40, 
224, 224, 8); + set_phdr(elf, 1, PT_LOAD, PF_R | PF_X, 0, 0, 0x220, 0x220, 0x1000); + set_phdr(elf, 2, PT_LOAD, PF_R | PF_W, 0x220, 0x2000, 0x40, 0x100, 0x1000); + set_phdr(elf, 3, PT_GNU_STACK, PF_R | PF_W, 0, 0, 0, 0, 16); + + ASSERT_EQ(kbox_build_elf_load_plan(elf, sizeof(elf), 0x1000, &plan), 0); + ASSERT_EQ(plan.machine, 0xb0); + ASSERT_EQ(plan.type, ET_DYN); + ASSERT_EQ(plan.pie, 1); + ASSERT_EQ(plan.phdr_vaddr, 0x40); + ASSERT_EQ(plan.stack_flags, PF_R | PF_W); + ASSERT_EQ(plan.segment_count, 2); + ASSERT_EQ(plan.min_vaddr, 0); + ASSERT_EQ(plan.max_vaddr, 0x3000); + ASSERT_EQ(plan.load_size, 0x3000); +} + +static void test_elf_build_load_plan_honors_large_segment_align(void) +{ + unsigned char *elf; + size_t elf_len = 0xb1000; + struct kbox_elf_load_plan plan; + + elf = calloc(1, elf_len); + ASSERT_NE(elf, NULL); + init_elf64(elf, elf_len, ET_DYN, 0xb7, 0x696cc, 64, 2); + set_phdr(elf, 0, PT_LOAD, PF_R | PF_X, 0, 0, 0xa19f4, 0xa19f4, 0x10000); + set_phdr(elf, 1, PT_LOAD, PF_R | PF_W, 0xafb00, 0xbfb00, 0x904, 0x3410, + 0x10000); + + ASSERT_EQ(kbox_build_elf_load_plan(elf, elf_len, 0x1000, &plan), 0); + ASSERT_EQ(plan.segment_count, 2); + ASSERT_EQ(plan.segments[1].map_offset, 0xa0000); + ASSERT_EQ(plan.segments[1].map_start, 0xb0000); + ASSERT_EQ(plan.segments[1].map_size, 0x20000); + ASSERT_EQ(plan.max_vaddr, 0xd0000); + free(elf); +} + +static void test_elf_build_load_plan_rejects_filesz_gt_memsz(void) +{ + unsigned char elf[256]; + struct kbox_elf_load_plan plan; + + init_elf64(elf, sizeof(elf), ET_EXEC, 0x3e, 0x400000, 64, 1); + set_phdr(elf, 0, PT_LOAD, PF_R | PF_X, 0, 0x400000, 0x200, 0x100, 0x1000); + + ASSERT_EQ(kbox_build_elf_load_plan(elf, sizeof(elf), 0x1000, &plan), -1); +} + void test_elf_init(void) { TEST_REGISTER(test_elf_parse_interp); @@ -365,4 +585,10 @@ void test_elf_init(void) TEST_REGISTER(test_elf_32bit_rejected); TEST_REGISTER(test_elf_find_interp_loc); TEST_REGISTER(test_elf_find_interp_loc_static); + 
TEST_REGISTER(test_elf_read_header_window_fd); + TEST_REGISTER(test_elf_read_header_window_fd_large_phoff); + TEST_REGISTER(test_elf_build_load_plan_exec); + TEST_REGISTER(test_elf_build_load_plan_pie_with_phdr_and_stack); + TEST_REGISTER(test_elf_build_load_plan_honors_large_segment_align); + TEST_REGISTER(test_elf_build_load_plan_rejects_filesz_gt_memsz); } diff --git a/tests/unit/test-loader-entry.c b/tests/unit/test-loader-entry.c new file mode 100644 index 0000000..676c8b4 --- /dev/null +++ b/tests/unit/test-loader-entry.c @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: MIT */ +#include + +#include "loader-entry.h" +#include "test-runner.h" + +#define ET_DYN 3 +#define PT_LOAD 1 +#define PF_R 0x4 +#define PF_X 0x1 + +static void set_le16(unsigned char *p, uint16_t v) +{ + p[0] = (unsigned char) (v & 0xff); + p[1] = (unsigned char) ((v >> 8) & 0xff); +} + +static void set_le32(unsigned char *p, uint32_t v) +{ + p[0] = (unsigned char) (v & 0xff); + p[1] = (unsigned char) ((v >> 8) & 0xff); + p[2] = (unsigned char) ((v >> 16) & 0xff); + p[3] = (unsigned char) ((v >> 24) & 0xff); +} + +static void set_le64(unsigned char *p, uint64_t v) +{ + for (int i = 0; i < 8; i++) + p[i] = (unsigned char) ((v >> (i * 8)) & 0xff); +} + +static void init_elf64(unsigned char *buf, + size_t buf_size, + uint16_t machine, + uint64_t entry) +{ + memset(buf, 0, buf_size); + buf[0] = 0x7f; + buf[1] = 'E'; + buf[2] = 'L'; + buf[3] = 'F'; + buf[4] = 2; + buf[5] = 1; + buf[6] = 1; + set_le16(buf + 16, ET_DYN); + set_le16(buf + 18, machine); + set_le32(buf + 20, 1); + set_le64(buf + 24, entry); + set_le64(buf + 32, 64); + set_le16(buf + 52, 64); + set_le16(buf + 54, 56); + set_le16(buf + 56, 1); +} + +static void set_load_phdr(unsigned char *buf, uint64_t filesz) +{ + unsigned char *ph = buf + 64; + + set_le32(ph + 0, PT_LOAD); + set_le32(ph + 4, PF_R | PF_X); + set_le64(ph + 8, 0); + set_le64(ph + 16, 0); + set_le64(ph + 32, filesz); + set_le64(ph + 40, filesz); + set_le64(ph + 48, 
0x1000); +} + +static void test_loader_build_entry_state_x86_64(void) +{ + static const char *const argv[] = {"/bin/test"}; + unsigned char elf[1024]; + struct kbox_loader_layout layout; + struct kbox_loader_layout_spec spec; + struct kbox_loader_entry_state state; + + init_elf64(elf, sizeof(elf), 0x3e, 0x123); + set_load_phdr(elf, 0x180); + memset(&layout, 0, sizeof(layout)); + memset(&spec, 0, sizeof(spec)); + + spec.main_elf = elf; + spec.main_elf_len = sizeof(elf); + spec.argv = argv; + spec.argc = 1; + spec.page_size = 4096; + spec.main_load_bias = 0x610000000000ULL; + spec.stack_top = 0x710000010000ULL; + + ASSERT_EQ(kbox_loader_build_layout(&spec, &layout), 0); + ASSERT_EQ(kbox_loader_build_entry_state(&layout, &state), 0); + ASSERT_EQ(state.arch, KBOX_LOADER_ENTRY_ARCH_X86_64); + ASSERT_EQ(state.pc, layout.initial_pc); + ASSERT_EQ(state.sp, layout.initial_sp); + ASSERT_EQ(state.regs[0], 0); + + kbox_loader_layout_reset(&layout); +} + +static void test_loader_build_entry_state_aarch64(void) +{ + static const char *const argv[] = {"/bin/test"}; + unsigned char elf[1024]; + struct kbox_loader_layout layout; + struct kbox_loader_layout_spec spec; + struct kbox_loader_entry_state state; + + init_elf64(elf, sizeof(elf), 0xb7, 0x456); + set_load_phdr(elf, 0x180); + memset(&layout, 0, sizeof(layout)); + memset(&spec, 0, sizeof(spec)); + + spec.main_elf = elf; + spec.main_elf_len = sizeof(elf); + spec.argv = argv; + spec.argc = 1; + spec.page_size = 4096; + spec.main_load_bias = 0x620000000000ULL; + spec.stack_top = 0x720000010000ULL; + + ASSERT_EQ(kbox_loader_build_layout(&spec, &layout), 0); + ASSERT_EQ(kbox_loader_build_entry_state(&layout, &state), 0); + ASSERT_EQ(state.arch, KBOX_LOADER_ENTRY_ARCH_AARCH64); + ASSERT_EQ(state.pc, layout.initial_pc); + ASSERT_EQ(state.sp, layout.initial_sp); + ASSERT_EQ(state.regs[5], 0); + + kbox_loader_layout_reset(&layout); +} + +static void test_loader_build_entry_state_rejects_unknown_machine(void) +{ + struct 
kbox_loader_layout layout; + struct kbox_loader_entry_state state; + + memset(&layout, 0, sizeof(layout)); + layout.main_plan.machine = 0xffff; + layout.initial_pc = 1; + layout.initial_sp = 2; + + ASSERT_EQ(kbox_loader_build_entry_state(&layout, &state), -1); +} + +void test_loader_entry_init(void) +{ + TEST_REGISTER(test_loader_build_entry_state_x86_64); + TEST_REGISTER(test_loader_build_entry_state_aarch64); + TEST_REGISTER(test_loader_build_entry_state_rejects_unknown_machine); +} diff --git a/tests/unit/test-loader-handoff.c b/tests/unit/test-loader-handoff.c new file mode 100644 index 0000000..d95842d --- /dev/null +++ b/tests/unit/test-loader-handoff.c @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: MIT */ +#include + +#include "loader-handoff.h" +#include "test-runner.h" + +#define ET_DYN 3 +#define PT_INTERP 3 +#define PT_LOAD 1 +#define PF_R 0x4 +#define PF_W 0x2 +#define PF_X 0x1 + +static void set_le16(unsigned char *p, uint16_t v) +{ + p[0] = (unsigned char) (v & 0xff); + p[1] = (unsigned char) ((v >> 8) & 0xff); +} + +static void set_le32(unsigned char *p, uint32_t v) +{ + p[0] = (unsigned char) (v & 0xff); + p[1] = (unsigned char) ((v >> 8) & 0xff); + p[2] = (unsigned char) ((v >> 16) & 0xff); + p[3] = (unsigned char) ((v >> 24) & 0xff); +} + +static void set_le64(unsigned char *p, uint64_t v) +{ + for (int i = 0; i < 8; i++) + p[i] = (unsigned char) ((v >> (i * 8)) & 0xff); +} + +static void init_elf64(unsigned char *buf, + size_t buf_size, + uint16_t machine, + uint64_t entry, + uint16_t phnum) +{ + memset(buf, 0, buf_size); + buf[0] = 0x7f; + buf[1] = 'E'; + buf[2] = 'L'; + buf[3] = 'F'; + buf[4] = 2; + buf[5] = 1; + buf[6] = 1; + set_le16(buf + 16, ET_DYN); + set_le16(buf + 18, machine); + set_le32(buf + 20, 1); + set_le64(buf + 24, entry); + set_le64(buf + 32, 64); + set_le16(buf + 52, 64); + set_le16(buf + 54, 56); + set_le16(buf + 56, phnum); +} + +static void set_phdr(unsigned char *buf, + size_t index, + uint32_t type, + uint32_t flags, + 
uint64_t offset, + uint64_t vaddr, + uint64_t filesz, + uint64_t memsz, + uint64_t align) +{ + unsigned char *ph = buf + 64 + index * 56; + + set_le32(ph + 0, type); + set_le32(ph + 4, flags); + set_le64(ph + 8, offset); + set_le64(ph + 16, vaddr); + set_le64(ph + 32, filesz); + set_le64(ph + 40, memsz); + set_le64(ph + 48, align); +} + +static void build_main_pie(unsigned char *elf, + size_t elf_size, + uint16_t machine) +{ + init_elf64(elf, elf_size, machine, 0x80, 2); + set_phdr(elf, 0, PT_LOAD, PF_R | PF_X, 0, 0, 0x200, 0x200, 0x1000); + set_phdr(elf, 1, PT_INTERP, PF_R, 0x200, 0, 0x20, 0x20, 1); +} + +static void build_interp_pie(unsigned char *elf, + size_t elf_size, + uint16_t machine) +{ + init_elf64(elf, elf_size, machine, 0x140, 1); + set_phdr(elf, 0, PT_LOAD, PF_R | PF_X, 0, 0, 0x200, 0x200, 0x1000); +} + +static void test_loader_build_handoff_main_entry_x86_64(void) +{ + static const char *const argv[] = {"/bin/test"}; + unsigned char elf[2048]; + struct kbox_loader_layout layout; + struct kbox_loader_layout_spec layout_spec; + struct kbox_loader_image image; + struct kbox_loader_image_spec image_spec; + struct kbox_loader_handoff handoff; + + build_main_pie(elf, sizeof(elf), 0x3e); + memset(&layout, 0, sizeof(layout)); + memset(&layout_spec, 0, sizeof(layout_spec)); + memset(&image, 0, sizeof(image)); + memset(&image_spec, 0, sizeof(image_spec)); + + layout_spec.main_elf = elf; + layout_spec.main_elf_len = sizeof(elf); + layout_spec.argv = argv; + layout_spec.argc = 1; + layout_spec.page_size = 4096; + layout_spec.main_load_bias = 0x600300000000ULL; + layout_spec.stack_top = 0x700300010000ULL; + + ASSERT_EQ(kbox_loader_build_layout(&layout_spec, &layout), 0); + image_spec.layout = &layout; + image_spec.main_elf = elf; + image_spec.main_elf_len = sizeof(elf); + ASSERT_EQ(kbox_loader_materialize_image(&image_spec, &image), 0); + + ASSERT_EQ(kbox_loader_build_handoff(&layout, &image, &handoff), 0); + ASSERT_EQ(handoff.entry.arch, 
KBOX_LOADER_ENTRY_ARCH_X86_64); + ASSERT_EQ(handoff.entry.pc, layout.initial_pc); + ASSERT_EQ(handoff.entry.sp, layout.initial_sp); + ASSERT_EQ(handoff.entry_mapping_index, 0); + ASSERT_EQ(handoff.stack_mapping_index, layout.mapping_count - 1); + ASSERT_EQ(handoff.entry_map_start, layout.mappings[0].addr); + ASSERT_EQ(handoff.entry_map_end, + layout.mappings[0].addr + layout.mappings[0].size); + + kbox_loader_image_reset(&image); + kbox_loader_layout_reset(&layout); +} + +static void test_loader_build_handoff_dynamic_aarch64_uses_interp_entry(void) +{ + static const char *const argv[] = {"/lib/ld-musl-aarch64.so.1", "sh"}; + unsigned char main_elf[2048]; + unsigned char interp_elf[2048]; + struct kbox_loader_layout layout; + struct kbox_loader_layout_spec layout_spec; + struct kbox_loader_image image; + struct kbox_loader_image_spec image_spec; + struct kbox_loader_handoff handoff; + + build_main_pie(main_elf, sizeof(main_elf), 0xb7); + build_interp_pie(interp_elf, sizeof(interp_elf), 0xb7); + memcpy(main_elf + 0x200, "/lib/ld-musl-aarch64.so.1", 26); + memset(&layout, 0, sizeof(layout)); + memset(&layout_spec, 0, sizeof(layout_spec)); + memset(&image, 0, sizeof(image)); + memset(&image_spec, 0, sizeof(image_spec)); + + layout_spec.main_elf = main_elf; + layout_spec.main_elf_len = sizeof(main_elf); + layout_spec.interp_elf = interp_elf; + layout_spec.interp_elf_len = sizeof(interp_elf); + layout_spec.argv = argv; + layout_spec.argc = 2; + layout_spec.page_size = 4096; + layout_spec.main_load_bias = 0x600400000000ULL; + layout_spec.interp_load_bias = 0x600500000000ULL; + layout_spec.stack_top = 0x700400010000ULL; + + ASSERT_EQ(kbox_loader_build_layout(&layout_spec, &layout), 0); + image_spec.layout = &layout; + image_spec.main_elf = main_elf; + image_spec.main_elf_len = sizeof(main_elf); + image_spec.interp_elf = interp_elf; + image_spec.interp_elf_len = sizeof(interp_elf); + ASSERT_EQ(kbox_loader_materialize_image(&image_spec, &image), 0); + + 
ASSERT_EQ(kbox_loader_build_handoff(&layout, &image, &handoff), 0); + ASSERT_EQ(handoff.entry.arch, KBOX_LOADER_ENTRY_ARCH_AARCH64); + ASSERT_EQ(handoff.entry.pc, layout.initial_pc); + ASSERT_EQ(handoff.entry_mapping_index, 1); + ASSERT_EQ(handoff.entry_map_start, layout.mappings[1].addr); + + kbox_loader_image_reset(&image); + kbox_loader_layout_reset(&layout); +} + +static void test_loader_build_handoff_rejects_unmapped_entry(void) +{ + static const char *const argv[] = {"/bin/test"}; + unsigned char elf[2048]; + struct kbox_loader_layout layout; + struct kbox_loader_layout_spec layout_spec; + struct kbox_loader_image image; + struct kbox_loader_image_spec image_spec; + struct kbox_loader_handoff handoff; + + build_main_pie(elf, sizeof(elf), 0x3e); + memset(&layout, 0, sizeof(layout)); + memset(&layout_spec, 0, sizeof(layout_spec)); + memset(&image, 0, sizeof(image)); + memset(&image_spec, 0, sizeof(image_spec)); + + layout_spec.main_elf = elf; + layout_spec.main_elf_len = sizeof(elf); + layout_spec.argv = argv; + layout_spec.argc = 1; + layout_spec.page_size = 4096; + layout_spec.main_load_bias = 0x600600000000ULL; + layout_spec.stack_top = 0x700600010000ULL; + + ASSERT_EQ(kbox_loader_build_layout(&layout_spec, &layout), 0); + image_spec.layout = &layout; + image_spec.main_elf = elf; + image_spec.main_elf_len = sizeof(elf); + ASSERT_EQ(kbox_loader_materialize_image(&image_spec, &image), 0); + + layout.initial_pc = layout.mappings[layout.mapping_count - 1].addr; + ASSERT_EQ(kbox_loader_build_handoff(&layout, &image, &handoff), -1); + + kbox_loader_image_reset(&image); + kbox_loader_layout_reset(&layout); +} + +void test_loader_handoff_init(void) +{ + TEST_REGISTER(test_loader_build_handoff_main_entry_x86_64); + TEST_REGISTER(test_loader_build_handoff_dynamic_aarch64_uses_interp_entry); + TEST_REGISTER(test_loader_build_handoff_rejects_unmapped_entry); +} diff --git a/tests/unit/test-loader-image.c b/tests/unit/test-loader-image.c new file mode 100644 index 
0000000..817ab11 --- /dev/null +++ b/tests/unit/test-loader-image.c @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: MIT */ +#include + +#include "loader-image.h" +#include "test-runner.h" + +#define ET_DYN 3 +#define PT_LOAD 1 +#define PF_R 0x4 +#define PF_W 0x2 +#define PF_X 0x1 + +static void set_le16(unsigned char *p, uint16_t v) +{ + p[0] = (unsigned char) (v & 0xff); + p[1] = (unsigned char) ((v >> 8) & 0xff); +} + +static void set_le32(unsigned char *p, uint32_t v) +{ + p[0] = (unsigned char) (v & 0xff); + p[1] = (unsigned char) ((v >> 8) & 0xff); + p[2] = (unsigned char) ((v >> 16) & 0xff); + p[3] = (unsigned char) ((v >> 24) & 0xff); +} + +static void set_le64(unsigned char *p, uint64_t v) +{ + for (int i = 0; i < 8; i++) + p[i] = (unsigned char) ((v >> (i * 8)) & 0xff); +} + +static void init_elf64(unsigned char *buf, + size_t buf_size, + uint16_t machine, + uint64_t entry, + uint16_t phnum) +{ + memset(buf, 0, buf_size); + buf[0] = 0x7f; + buf[1] = 'E'; + buf[2] = 'L'; + buf[3] = 'F'; + buf[4] = 2; + buf[5] = 1; + buf[6] = 1; + set_le16(buf + 16, ET_DYN); + set_le16(buf + 18, machine); + set_le32(buf + 20, 1); + set_le64(buf + 24, entry); + set_le64(buf + 32, 64); + set_le16(buf + 52, 64); + set_le16(buf + 54, 56); + set_le16(buf + 56, phnum); +} + +static void set_phdr(unsigned char *buf, + size_t index, + uint32_t flags, + uint64_t offset, + uint64_t vaddr, + uint64_t filesz, + uint64_t memsz, + uint64_t align) +{ + unsigned char *ph = buf + 64 + index * 56; + + set_le32(ph + 0, PT_LOAD); + set_le32(ph + 4, flags); + set_le64(ph + 8, offset); + set_le64(ph + 16, vaddr); + set_le64(ph + 32, filesz); + set_le64(ph + 40, memsz); + set_le64(ph + 48, align); +} + +static void build_rx_pie(unsigned char *elf, size_t elf_size) +{ + init_elf64(elf, elf_size, 0x3e, 0x80, 1); + set_phdr(elf, 0, PF_R | PF_X, 0, 0, 0x180, 0x180, 0x1000); +} + +static void build_rw_bss_pie(unsigned char *elf, size_t elf_size) +{ + init_elf64(elf, elf_size, 0x3e, 0x40, 1); + 
set_phdr(elf, 0, PF_R | PF_W, 0, 0, 0x180, 0x2500, 0x1000); +} + +static void test_loader_materialize_image_maps_main_bytes(void) +{ + static const char *const argv[] = {"/bin/test"}; + unsigned char elf[2048]; + struct kbox_loader_layout layout; + struct kbox_loader_layout_spec layout_spec; + struct kbox_loader_image image; + struct kbox_loader_image_spec image_spec; + const unsigned char *mapped; + + build_rx_pie(elf, sizeof(elf)); + memset(&layout, 0, sizeof(layout)); + memset(&layout_spec, 0, sizeof(layout_spec)); + memset(&image, 0, sizeof(image)); + memset(&image_spec, 0, sizeof(image_spec)); + + layout_spec.main_elf = elf; + layout_spec.main_elf_len = sizeof(elf); + layout_spec.argv = argv; + layout_spec.argc = 1; + layout_spec.page_size = 4096; + layout_spec.main_load_bias = 0x600100000000ULL; + layout_spec.stack_top = 0x700100010000ULL; + + ASSERT_EQ(kbox_loader_build_layout(&layout_spec, &layout), 0); + + image_spec.layout = &layout; + image_spec.main_elf = elf; + image_spec.main_elf_len = sizeof(elf); + ASSERT_EQ(kbox_loader_materialize_image(&image_spec, &image), 0); + ASSERT_EQ(image.region_count, 2); + + mapped = (const unsigned char *) (uintptr_t) layout.mappings[0].addr; + ASSERT_EQ(memcmp(mapped, elf, 0x180), 0); + ASSERT_EQ(mapped[0x17f], elf[0x17f]); + + kbox_loader_image_reset(&image); + kbox_loader_layout_reset(&layout); +} + +static void test_loader_materialize_image_realizes_bss_and_stack(void) +{ + static const char *const argv[] = {"/bin/test"}; + unsigned char elf[4096]; + struct kbox_loader_layout layout; + struct kbox_loader_layout_spec layout_spec; + struct kbox_loader_image image; + struct kbox_loader_image_spec image_spec; + const unsigned char *mapped; + const uint64_t *stack_words; + + build_rw_bss_pie(elf, sizeof(elf)); + memset(&layout, 0, sizeof(layout)); + memset(&layout_spec, 0, sizeof(layout_spec)); + memset(&image, 0, sizeof(image)); + memset(&image_spec, 0, sizeof(image_spec)); + + layout_spec.main_elf = elf; + 
layout_spec.main_elf_len = sizeof(elf); + layout_spec.argv = argv; + layout_spec.argc = 1; + layout_spec.page_size = 4096; + layout_spec.main_load_bias = 0x600200000000ULL; + layout_spec.stack_top = 0x700200010000ULL; + + ASSERT_EQ(kbox_loader_build_layout(&layout_spec, &layout), 0); + + image_spec.layout = &layout; + image_spec.main_elf = elf; + image_spec.main_elf_len = sizeof(elf); + ASSERT_EQ(kbox_loader_materialize_image(&image_spec, &image), 0); + ASSERT_EQ(image.region_count, 3); + + mapped = (const unsigned char *) (uintptr_t) layout.mappings[0].addr; + ASSERT_EQ(memcmp(mapped, elf, 0x180), 0); + ASSERT_EQ(mapped[0x180], 0); + ASSERT_EQ(mapped[0xfff], 0); + ASSERT_EQ(*((const unsigned char *) (uintptr_t) layout.mappings[1].addr), + 0); + + stack_words = (const uint64_t *) (uintptr_t) layout.initial_sp; + ASSERT_EQ(stack_words[0], 1); + + kbox_loader_image_reset(&image); + kbox_loader_layout_reset(&layout); +} + +void test_loader_image_init(void) +{ + TEST_REGISTER(test_loader_materialize_image_maps_main_bytes); + TEST_REGISTER(test_loader_materialize_image_realizes_bss_and_stack); +} diff --git a/tests/unit/test-loader-launch.c b/tests/unit/test-loader-launch.c new file mode 100644 index 0000000..a100a36 --- /dev/null +++ b/tests/unit/test-loader-launch.c @@ -0,0 +1,293 @@ +/* SPDX-License-Identifier: MIT */ +#include +#include +#include + +#include "loader-launch.h" +#include "test-runner.h" + +#define ET_EXEC 2 +#define ET_DYN 3 +#define PT_INTERP 3 +#define PT_LOAD 1 +#define PF_R 0x4 +#define PF_X 0x1 + +static void set_le16(unsigned char *p, uint16_t v) +{ + p[0] = (unsigned char) (v & 0xff); + p[1] = (unsigned char) ((v >> 8) & 0xff); +} + +static void set_le32(unsigned char *p, uint32_t v) +{ + p[0] = (unsigned char) (v & 0xff); + p[1] = (unsigned char) ((v >> 8) & 0xff); + p[2] = (unsigned char) ((v >> 16) & 0xff); + p[3] = (unsigned char) ((v >> 24) & 0xff); +} + +static void set_le64(unsigned char *p, uint64_t v) +{ + for (int i = 0; i < 8; i++) + 
p[i] = (unsigned char) ((v >> (i * 8)) & 0xff); +} + +static void init_elf64(unsigned char *buf, + size_t buf_size, + uint16_t type, + uint16_t machine, + uint64_t entry, + uint16_t phnum) +{ + memset(buf, 0, buf_size); + buf[0] = 0x7f; + buf[1] = 'E'; + buf[2] = 'L'; + buf[3] = 'F'; + buf[4] = 2; + buf[5] = 1; + buf[6] = 1; + set_le16(buf + 16, type); + set_le16(buf + 18, machine); + set_le32(buf + 20, 1); + set_le64(buf + 24, entry); + set_le64(buf + 32, 64); + set_le16(buf + 52, 64); + set_le16(buf + 54, 56); + set_le16(buf + 56, phnum); +} + +static void set_phdr(unsigned char *buf, + size_t index, + uint32_t type, + uint32_t flags, + uint64_t offset, + uint64_t vaddr, + uint64_t filesz, + uint64_t memsz, + uint64_t align) +{ + unsigned char *ph = buf + 64 + index * 56; + + set_le32(ph + 0, type); + set_le32(ph + 4, flags); + set_le64(ph + 8, offset); + set_le64(ph + 16, vaddr); + set_le64(ph + 32, filesz); + set_le64(ph + 40, memsz); + set_le64(ph + 48, align); +} + +static void build_main_pie(unsigned char *elf, + size_t elf_size, + uint16_t machine) +{ + init_elf64(elf, elf_size, ET_DYN, machine, 0x80, 2); + set_phdr(elf, 0, PT_LOAD, PF_R | PF_X, 0, 0, 0x200, 0x200, 0x1000); + set_phdr(elf, 1, PT_INTERP, PF_R, 0x200, 0, 0x20, 0x20, 1); +} + +static void build_interp_pie(unsigned char *elf, + size_t elf_size, + uint16_t machine) +{ + init_elf64(elf, elf_size, ET_DYN, machine, 0x140, 1); + set_phdr(elf, 0, PT_LOAD, PF_R | PF_X, 0, 0, 0x200, 0x200, 0x1000); +} + +static int create_memfd_with_bytes(const unsigned char *buf, size_t len) +{ + int fd = memfd_create("kbox-loader-launch-test", MFD_CLOEXEC); + + if (fd < 0) + return -1; + if (write(fd, buf, len) != (ssize_t) len) { + close(fd); + return -1; + } + return fd; +} + +static void test_loader_prepare_launch_main_only_pie(void) +{ + static const char *const argv[] = {"/bin/test"}; + unsigned char elf[2048]; + struct kbox_loader_launch launch; + struct kbox_loader_launch_spec spec; + int fd; + + 
build_main_pie(elf, sizeof(elf), 0x3e); + fd = create_memfd_with_bytes(elf, sizeof(elf)); + ASSERT_NE(fd, -1); + + memset(&launch, 0, sizeof(launch)); + memset(&spec, 0, sizeof(spec)); + spec.exec_fd = fd; + spec.interp_fd = -1; + spec.argv = argv; + spec.argc = 1; + spec.page_size = 4096; + spec.main_load_bias = 0x600700000000ULL; + spec.stack_top = 0x700700010000ULL; + + ASSERT_EQ(kbox_loader_prepare_launch(&spec, &launch), 0); + ASSERT_EQ(launch.transfer.arch, KBOX_LOADER_ENTRY_ARCH_X86_64); + ASSERT_EQ(launch.layout.main_load_bias, 0x600700000000ULL); + ASSERT_EQ(launch.transfer.pc, launch.layout.initial_pc); + ASSERT_EQ(launch.transfer.pc, 0x600700000080ULL); + ASSERT_EQ(launch.transfer.sp, launch.layout.initial_sp); + ASSERT_EQ(launch.image.region_count, launch.layout.mapping_count); + + kbox_loader_launch_reset(&launch); + close(fd); +} + +static void test_loader_prepare_launch_dynamic_interp(void) +{ + static const char *const argv[] = {"/lib/ld-musl-aarch64.so.1", "sh"}; + unsigned char main_elf[2048]; + unsigned char interp_elf[2048]; + struct kbox_loader_launch launch; + struct kbox_loader_launch_spec spec; + int exec_fd; + int interp_fd; + + build_main_pie(main_elf, sizeof(main_elf), 0xb7); + build_interp_pie(interp_elf, sizeof(interp_elf), 0xb7); + memcpy(main_elf + 0x200, "/lib/ld-musl-aarch64.so.1", 26); + exec_fd = create_memfd_with_bytes(main_elf, sizeof(main_elf)); + interp_fd = create_memfd_with_bytes(interp_elf, sizeof(interp_elf)); + ASSERT_NE(exec_fd, -1); + ASSERT_NE(interp_fd, -1); + + memset(&launch, 0, sizeof(launch)); + memset(&spec, 0, sizeof(spec)); + spec.exec_fd = exec_fd; + spec.interp_fd = interp_fd; + spec.argv = argv; + spec.argc = 2; + spec.page_size = 4096; + spec.main_load_bias = 0x600800000000ULL; + spec.interp_load_bias = 0x600810000000ULL; + spec.stack_top = 0x700800010000ULL; + + ASSERT_EQ(kbox_loader_prepare_launch(&spec, &launch), 0); + ASSERT_EQ(launch.layout.has_interp, 1); + ASSERT_EQ(launch.transfer.arch, 
KBOX_LOADER_ENTRY_ARCH_AARCH64); + ASSERT_EQ(launch.transfer.pc, launch.layout.initial_pc); + ASSERT_EQ(launch.transfer.entry_map_start, launch.handoff.entry_map_start); + + kbox_loader_launch_reset(&launch); + close(interp_fd); + close(exec_fd); +} + +static void test_loader_prepare_launch_rejects_empty_fd(void) +{ + static const char *const argv[] = {"/bin/test"}; + struct kbox_loader_launch launch; + struct kbox_loader_launch_spec spec; + int fd = memfd_create("kbox-loader-empty", MFD_CLOEXEC); + + ASSERT_NE(fd, -1); + memset(&launch, 0, sizeof(launch)); + memset(&spec, 0, sizeof(spec)); + spec.exec_fd = fd; + spec.interp_fd = -1; + spec.argv = argv; + spec.argc = 1; + spec.page_size = 4096; + spec.stack_top = 0x700900010000ULL; + + ASSERT_EQ(kbox_loader_prepare_launch(&spec, &launch), -1); + + kbox_loader_launch_reset(&launch); + close(fd); +} + +static void test_loader_collect_exec_ranges_static_exec(void) +{ + static const char *const argv[] = {"/bin/test"}; + unsigned char elf[2048]; + struct kbox_loader_launch launch; + struct kbox_loader_launch_spec spec; + struct kbox_loader_exec_range ranges[4]; + size_t range_count = 0; + int fd; + + build_main_pie(elf, sizeof(elf), 0x3e); + fd = create_memfd_with_bytes(elf, sizeof(elf)); + ASSERT_NE(fd, -1); + + memset(&launch, 0, sizeof(launch)); + memset(&spec, 0, sizeof(spec)); + spec.exec_fd = fd; + spec.interp_fd = -1; + spec.argv = argv; + spec.argc = 1; + spec.page_size = 4096; + spec.main_load_bias = 0x600700000000ULL; + spec.stack_top = 0x700700010000ULL; + + ASSERT_EQ(kbox_loader_prepare_launch(&spec, &launch), 0); + ASSERT_EQ(kbox_loader_collect_exec_ranges(&launch, ranges, 4, &range_count), + 0); + ASSERT_EQ(range_count, 1); + ASSERT_EQ(ranges[0].start, launch.layout.mappings[0].addr); + ASSERT_EQ(ranges[0].end, + launch.layout.mappings[0].addr + launch.layout.mappings[0].size); + + kbox_loader_launch_reset(&launch); + close(fd); +} + +static void test_loader_collect_exec_ranges_dynamic_interp(void) +{ + 
static const char *const argv[] = {"/lib/ld-musl-aarch64.so.1", "sh"}; + unsigned char main_elf[2048]; + unsigned char interp_elf[2048]; + struct kbox_loader_launch launch; + struct kbox_loader_launch_spec spec; + struct kbox_loader_exec_range ranges[4]; + size_t range_count = 0; + int exec_fd; + int interp_fd; + + build_main_pie(main_elf, sizeof(main_elf), 0xb7); + build_interp_pie(interp_elf, sizeof(interp_elf), 0xb7); + memcpy(main_elf + 0x200, "/lib/ld-musl-aarch64.so.1", 26); + exec_fd = create_memfd_with_bytes(main_elf, sizeof(main_elf)); + interp_fd = create_memfd_with_bytes(interp_elf, sizeof(interp_elf)); + ASSERT_NE(exec_fd, -1); + ASSERT_NE(interp_fd, -1); + + memset(&launch, 0, sizeof(launch)); + memset(&spec, 0, sizeof(spec)); + spec.exec_fd = exec_fd; + spec.interp_fd = interp_fd; + spec.argv = argv; + spec.argc = 2; + spec.page_size = 4096; + spec.main_load_bias = 0x600800000000ULL; + spec.interp_load_bias = 0x600810000000ULL; + spec.stack_top = 0x700800010000ULL; + + ASSERT_EQ(kbox_loader_prepare_launch(&spec, &launch), 0); + ASSERT_EQ(kbox_loader_collect_exec_ranges(&launch, ranges, 4, &range_count), + 0); + ASSERT_EQ(range_count, 2); + + kbox_loader_launch_reset(&launch); + close(interp_fd); + close(exec_fd); +} + +void test_loader_launch_init(void) +{ + TEST_REGISTER(test_loader_prepare_launch_main_only_pie); + TEST_REGISTER(test_loader_prepare_launch_dynamic_interp); + TEST_REGISTER(test_loader_prepare_launch_rejects_empty_fd); + TEST_REGISTER(test_loader_collect_exec_ranges_static_exec); + TEST_REGISTER(test_loader_collect_exec_ranges_dynamic_interp); +} diff --git a/tests/unit/test-loader-layout.c b/tests/unit/test-loader-layout.c new file mode 100644 index 0000000..dd93194 --- /dev/null +++ b/tests/unit/test-loader-layout.c @@ -0,0 +1,349 @@ +/* SPDX-License-Identifier: MIT */ +#include +#include + +#include "loader-layout.h" +#include "test-runner.h" + +#define ET_EXEC 2 +#define ET_DYN 3 +#define PT_LOAD 1 +#define PT_INTERP 3 +#define 
PT_PHDR 6 +#define PT_GNU_STACK 0x6474e551u +#define PF_R 0x4 +#define PF_W 0x2 +#define PF_X 0x1 + +static void set_le16(unsigned char *p, uint16_t v) +{ + p[0] = (unsigned char) (v & 0xff); + p[1] = (unsigned char) ((v >> 8) & 0xff); +} + +static void set_le32(unsigned char *p, uint32_t v) +{ + p[0] = (unsigned char) (v & 0xff); + p[1] = (unsigned char) ((v >> 8) & 0xff); + p[2] = (unsigned char) ((v >> 16) & 0xff); + p[3] = (unsigned char) ((v >> 24) & 0xff); +} + +static void set_le64(unsigned char *p, uint64_t v) +{ + for (int i = 0; i < 8; i++) + p[i] = (unsigned char) ((v >> (i * 8)) & 0xff); +} + +static void init_elf64(unsigned char *buf, + size_t buf_size, + uint16_t type, + uint16_t machine, + uint64_t entry, + uint64_t phoff, + uint16_t phnum) +{ + memset(buf, 0, buf_size); + buf[0] = 0x7f; + buf[1] = 'E'; + buf[2] = 'L'; + buf[3] = 'F'; + buf[4] = 2; + buf[5] = 1; + buf[6] = 1; + set_le16(buf + 16, type); + set_le16(buf + 18, machine); + set_le32(buf + 20, 1); + set_le64(buf + 24, entry); + set_le64(buf + 32, phoff); + set_le16(buf + 52, 64); + set_le16(buf + 54, 56); + set_le16(buf + 56, phnum); +} + +static void set_phdr(unsigned char *buf, + size_t index, + uint32_t type, + uint32_t flags, + uint64_t offset, + uint64_t vaddr, + uint64_t filesz, + uint64_t memsz, + uint64_t align) +{ + unsigned char *ph = buf + 64 + index * 56; + + set_le32(ph + 0, type); + set_le32(ph + 4, flags); + set_le64(ph + 8, offset); + set_le64(ph + 16, vaddr); + set_le64(ph + 32, filesz); + set_le64(ph + 40, memsz); + set_le64(ph + 48, align); +} + +static void build_static_elf(unsigned char *elf, size_t elf_size) +{ + init_elf64(elf, elf_size, ET_EXEC, 0x3e, 0x401000, 64, 1); + set_phdr(elf, 0, PT_LOAD, PF_R | PF_X, 0, 0x400000, 0x200, 0x300, 0x1000); +} + +static void build_dynamic_elf(unsigned char *elf, + size_t elf_size, + uint16_t machine) +{ + init_elf64(elf, elf_size, ET_DYN, machine, 0x120, 64, 2); + set_phdr(elf, 0, PT_PHDR, PF_R, 64, 0x40, 112, 112, 8); + 
set_phdr(elf, 1, PT_LOAD, PF_R | PF_X, 0, 0, 0x200, 0x200, 0x1000); +} + +static void build_bss_heavy_elf(unsigned char *elf, size_t elf_size) +{ + init_elf64(elf, elf_size, ET_EXEC, 0x3e, 0x401000, 64, 1); + set_phdr(elf, 0, PT_LOAD, PF_R | PF_W, 0, 0x400000, 0x180, 0x2500, 0x1000); +} + +static void build_execstack_elf(unsigned char *elf, size_t elf_size) +{ + init_elf64(elf, elf_size, ET_EXEC, 0x3e, 0x401000, 64, 2); + set_phdr(elf, 0, PT_LOAD, PF_R | PF_X, 0, 0x400000, 0x200, 0x200, 0x1000); + set_phdr(elf, 1, PT_GNU_STACK, PF_R | PF_W | PF_X, 0, 0, 0, 0, 16); +} + +static void build_large_align_interp_elf(unsigned char *elf, size_t elf_size) +{ + init_elf64(elf, elf_size, ET_DYN, 0xb7, 0x696cc, 64, 2); + set_phdr(elf, 0, PT_LOAD, PF_R | PF_X, 0, 0, 0xa19f4, 0xa19f4, 0x10000); + set_phdr(elf, 1, PT_LOAD, PF_R | PF_W, 0xafb00, 0xbfb00, 0x904, 0x3410, + 0x10000); +} + +static void test_loader_build_layout_static_exec(void) +{ + static const char *const argv[] = {"/bin/test"}; + unsigned char elf[1024]; + struct kbox_loader_layout layout; + struct kbox_loader_layout_spec spec; + + build_static_elf(elf, sizeof(elf)); + memset(&layout, 0, sizeof(layout)); + memset(&spec, 0, sizeof(spec)); + + spec.main_elf = elf; + spec.main_elf_len = sizeof(elf); + spec.argv = argv; + spec.argc = 1; + spec.page_size = 4096; + spec.stack_top = 0x700000010000ULL; + spec.main_load_bias = 0x600000000000ULL; + + ASSERT_EQ(kbox_loader_build_layout(&spec, &layout), 0); + ASSERT_EQ(layout.has_interp, 0); + ASSERT_EQ(layout.main_load_bias, 0); + ASSERT_EQ(layout.initial_pc, 0x401000); + ASSERT_EQ(layout.initial_sp, layout.stack.initial_sp); + ASSERT_EQ(layout.stack_size, KBOX_LOADER_DEFAULT_STACK_SIZE); + ASSERT_EQ(layout.mapping_count, 2); + ASSERT_EQ(layout.mappings[0].source, KBOX_LOADER_MAPPING_MAIN); + ASSERT_EQ(layout.mappings[0].addr, 0x400000); + ASSERT_EQ(layout.mappings[0].size, 0x1000); + ASSERT_EQ(layout.mappings[0].file_offset, 0); + ASSERT_EQ(layout.mappings[0].file_size, 
0x200); + ASSERT_EQ(layout.mappings[0].zero_fill_start, 0x400200); + ASSERT_EQ(layout.mappings[0].zero_fill_size, 0xe00); + ASSERT_EQ(layout.mappings[0].prot, PROT_READ | PROT_EXEC); + ASSERT_EQ(layout.mappings[0].flags, MAP_PRIVATE | MAP_FIXED); + ASSERT_EQ(layout.mappings[1].source, KBOX_LOADER_MAPPING_STACK); + ASSERT_EQ(layout.mappings[1].addr, + spec.stack_top - KBOX_LOADER_DEFAULT_STACK_SIZE); + ASSERT_EQ(layout.mappings[1].prot, PROT_READ | PROT_WRITE); + + kbox_loader_layout_reset(&layout); +} + +static void test_loader_build_layout_dynamic_interp_entry(void) +{ + static const char *const argv[] = {"/lib/ld-musl-aarch64.so.1", "sh"}; + unsigned char main_elf[1024]; + unsigned char interp_elf[1024]; + struct kbox_loader_layout layout; + struct kbox_loader_layout_spec spec; + + build_dynamic_elf(main_elf, sizeof(main_elf), 0xb7); + build_dynamic_elf(interp_elf, sizeof(interp_elf), 0xb7); + memset(&layout, 0, sizeof(layout)); + memset(&spec, 0, sizeof(spec)); + + spec.main_elf = main_elf; + spec.main_elf_len = sizeof(main_elf); + spec.interp_elf = interp_elf; + spec.interp_elf_len = sizeof(interp_elf); + spec.argv = argv; + spec.argc = 2; + spec.page_size = 4096; + spec.stack_top = 0x7fff00002000ULL; + spec.main_load_bias = 0x555500000000ULL; + spec.interp_load_bias = 0x777700000000ULL; + + ASSERT_EQ(kbox_loader_build_layout(&spec, &layout), 0); + ASSERT_EQ(layout.has_interp, 1); + ASSERT_EQ(layout.initial_pc, + spec.interp_load_bias + layout.interp_plan.entry); + ASSERT_EQ(layout.main_load_bias, spec.main_load_bias); + ASSERT_EQ(layout.interp_load_bias, spec.interp_load_bias); + ASSERT_EQ(layout.mapping_count, 3); + ASSERT_EQ(layout.mappings[0].source, KBOX_LOADER_MAPPING_MAIN); + ASSERT_EQ(layout.mappings[0].addr, spec.main_load_bias); + ASSERT_EQ(layout.mappings[0].zero_fill_size, 0); + ASSERT_EQ(layout.mappings[1].source, KBOX_LOADER_MAPPING_INTERP); + ASSERT_EQ(layout.mappings[1].addr, spec.interp_load_bias); + ASSERT_EQ(layout.mappings[1].zero_fill_size, 
0); + ASSERT_EQ(layout.mappings[2].source, KBOX_LOADER_MAPPING_STACK); + + kbox_loader_layout_reset(&layout); +} + +static void test_loader_build_layout_emits_bss_extension_mapping(void) +{ + static const char *const argv[] = {"/bin/test"}; + unsigned char elf[2048]; + struct kbox_loader_layout layout; + struct kbox_loader_layout_spec spec; + + build_bss_heavy_elf(elf, sizeof(elf)); + memset(&layout, 0, sizeof(layout)); + memset(&spec, 0, sizeof(spec)); + + spec.main_elf = elf; + spec.main_elf_len = sizeof(elf); + spec.argv = argv; + spec.argc = 1; + spec.page_size = 4096; + spec.stack_top = 0x700000010000ULL; + + ASSERT_EQ(kbox_loader_build_layout(&spec, &layout), 0); + ASSERT_EQ(layout.mapping_count, 3); + ASSERT_EQ(layout.mappings[0].source, KBOX_LOADER_MAPPING_MAIN); + ASSERT_EQ(layout.mappings[0].addr, 0x400000); + ASSERT_EQ(layout.mappings[0].size, 0x1000); + ASSERT_EQ(layout.mappings[0].file_offset, 0); + ASSERT_EQ(layout.mappings[0].file_size, 0x180); + ASSERT_EQ(layout.mappings[0].zero_fill_start, 0x400180); + ASSERT_EQ(layout.mappings[0].zero_fill_size, 0xe80); + ASSERT_EQ(layout.mappings[0].flags, MAP_PRIVATE | MAP_FIXED); + ASSERT_EQ(layout.mappings[1].source, KBOX_LOADER_MAPPING_MAIN); + ASSERT_EQ(layout.mappings[1].addr, 0x401000); + ASSERT_EQ(layout.mappings[1].size, 0x2000); + ASSERT_EQ(layout.mappings[1].file_size, 0); + ASSERT_EQ(layout.mappings[1].zero_fill_size, 0); + ASSERT_EQ(layout.mappings[1].flags, + MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS); + ASSERT_EQ(layout.mappings[2].source, KBOX_LOADER_MAPPING_STACK); + + kbox_loader_layout_reset(&layout); +} + +static void test_loader_build_layout_rejects_bad_interp(void) +{ + static const char *const argv[] = {"/bin/test"}; + unsigned char main_elf[1024]; + unsigned char interp_elf[64]; + struct kbox_loader_layout layout; + struct kbox_loader_layout_spec spec; + + build_static_elf(main_elf, sizeof(main_elf)); + memset(interp_elf, 0, sizeof(interp_elf)); + memset(&layout, 0, sizeof(layout)); + 
memset(&spec, 0, sizeof(spec)); + + spec.main_elf = main_elf; + spec.main_elf_len = sizeof(main_elf); + spec.interp_elf = interp_elf; + spec.interp_elf_len = sizeof(interp_elf); + spec.argv = argv; + spec.argc = 1; + spec.page_size = 4096; + spec.stack_top = 0x700000010000ULL; + + ASSERT_EQ(kbox_loader_build_layout(&spec, &layout), -1); +} + +static void test_loader_build_layout_honors_large_segment_align(void) +{ + static const char *const argv[] = {"/lib/ld-musl-aarch64.so.1", "true"}; + unsigned char *interp_elf; + unsigned char main_elf[1024]; + struct kbox_loader_layout layout; + struct kbox_loader_layout_spec spec; + + interp_elf = calloc(1, 0xb1000); + ASSERT_NE(interp_elf, NULL); + build_dynamic_elf(main_elf, sizeof(main_elf), 0xb7); + build_large_align_interp_elf(interp_elf, 0xb1000); + memset(&layout, 0, sizeof(layout)); + memset(&spec, 0, sizeof(spec)); + + spec.main_elf = main_elf; + spec.main_elf_len = sizeof(main_elf); + spec.interp_elf = interp_elf; + spec.interp_elf_len = 0xb1000; + spec.argv = argv; + spec.argc = 2; + spec.page_size = 4096; + spec.stack_top = 0x7fff00002000ULL; + spec.main_load_bias = 0x600000000000ULL; + spec.interp_load_bias = 0x610000000000ULL; + + ASSERT_EQ(kbox_loader_build_layout(&spec, &layout), 0); + ASSERT_EQ(layout.mapping_count, 4); + ASSERT_EQ(layout.mappings[1].source, KBOX_LOADER_MAPPING_INTERP); + ASSERT_EQ(layout.mappings[1].addr, spec.interp_load_bias); + ASSERT_EQ(layout.mappings[1].size, 0xb0000); + ASSERT_EQ(layout.mappings[2].source, KBOX_LOADER_MAPPING_INTERP); + ASSERT_EQ(layout.mappings[2].addr, spec.interp_load_bias + 0xb0000); + ASSERT_EQ(layout.mappings[2].size, 0x20000); + ASSERT_EQ(layout.mappings[2].file_offset, 0xa0000); + ASSERT_EQ(layout.mappings[2].file_size, 0x10404); + ASSERT_EQ(layout.mappings[2].zero_fill_start, + spec.interp_load_bias + 0xc0404); + ASSERT_EQ(layout.mappings[2].zero_fill_size, 0xfbfc); + ASSERT_EQ(layout.mappings[3].source, KBOX_LOADER_MAPPING_STACK); + + 
kbox_loader_layout_reset(&layout); + free(interp_elf); +} + +static void test_loader_build_layout_honors_exec_stack(void) +{ + static const char *const argv[] = {"/bin/test"}; + unsigned char elf[1024]; + struct kbox_loader_layout layout; + struct kbox_loader_layout_spec spec; + + build_execstack_elf(elf, sizeof(elf)); + memset(&layout, 0, sizeof(layout)); + memset(&spec, 0, sizeof(spec)); + + spec.main_elf = elf; + spec.main_elf_len = sizeof(elf); + spec.argv = argv; + spec.argc = 1; + spec.page_size = 4096; + spec.stack_top = 0x700000010000ULL; + + ASSERT_EQ(kbox_loader_build_layout(&spec, &layout), 0); + ASSERT_EQ(layout.mappings[1].source, KBOX_LOADER_MAPPING_STACK); + ASSERT_EQ(layout.mappings[1].prot, PROT_READ | PROT_WRITE | PROT_EXEC); + + kbox_loader_layout_reset(&layout); +} + +void test_loader_layout_init(void) +{ + TEST_REGISTER(test_loader_build_layout_static_exec); + TEST_REGISTER(test_loader_build_layout_dynamic_interp_entry); + TEST_REGISTER(test_loader_build_layout_emits_bss_extension_mapping); + TEST_REGISTER(test_loader_build_layout_rejects_bad_interp); + TEST_REGISTER(test_loader_build_layout_honors_large_segment_align); + TEST_REGISTER(test_loader_build_layout_honors_exec_stack); +} diff --git a/tests/unit/test-loader-stack.c b/tests/unit/test-loader-stack.c new file mode 100644 index 0000000..404c1d3 --- /dev/null +++ b/tests/unit/test-loader-stack.c @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: MIT */ +#include + +#include "loader-stack.h" +#include "test-runner.h" + +#define AT_NULL 0 +#define AT_PHDR 3 +#define AT_PHENT 4 +#define AT_PHNUM 5 +#define AT_PAGESZ 6 +#define AT_BASE 7 +#define AT_ENTRY 9 +#define AT_RANDOM 25 +#define AT_EXECFN 31 + +static uint64_t read_u64(const unsigned char *p) +{ + uint64_t value; + + memcpy(&value, p, sizeof(value)); + return value; +} + +static const unsigned char *stack_ptr( + const struct kbox_loader_stack_image *image, + uint64_t addr) +{ + if (addr < image->initial_sp || addr >= image->initial_sp + 
image->size) + return NULL; + return image->data + (addr - image->initial_sp); +} + +static uint64_t find_auxv_value(const struct kbox_loader_stack_image *image, + size_t argc, + size_t envc, + uint64_t key) +{ + size_t word = 1 + argc + 1 + envc + 1; + const unsigned char *p = image->data + word * sizeof(uint64_t); + + for (;;) { + uint64_t a_type = read_u64(p); + uint64_t a_val = read_u64(p + sizeof(uint64_t)); + + if (a_type == AT_NULL) + return UINT64_MAX; + if (a_type == key) + return a_val; + p += 2 * sizeof(uint64_t); + } +} + +static void test_loader_build_initial_stack_static(void) +{ + static const char *const argv[] = {"/bin/test", "arg1"}; + static const char *const envp[] = {"A=B", "C=D"}; + static const unsigned char random_bytes[KBOX_LOADER_RANDOM_SIZE] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + }; + struct kbox_elf_load_plan main_plan; + struct kbox_loader_stack_spec spec; + struct kbox_loader_stack_image image; + uint64_t argc_word; + uint64_t argv0_addr; + uint64_t argv1_addr; + uint64_t env0_addr; + uint64_t at_random; + uint64_t at_execfn; + + memset(&main_plan, 0, sizeof(main_plan)); + memset(&spec, 0, sizeof(spec)); + memset(&image, 0, sizeof(image)); + + main_plan.entry = 0x401020; + main_plan.phdr_vaddr = 0x400040; + main_plan.phentsize = 56; + main_plan.phnum = 3; + + spec.argv = argv; + spec.argc = 2; + spec.envp = envp; + spec.envc = 2; + spec.execfn = "/guest/bin/test"; + spec.random_bytes = random_bytes; + spec.main_plan = &main_plan; + spec.page_size = 4096; + spec.stack_top = 0x700000010000ULL; + spec.stack_size = 0x4000; + spec.uid = 1000; + spec.euid = 1001; + spec.gid = 1002; + spec.egid = 1003; + + ASSERT_EQ(kbox_loader_build_initial_stack(&spec, &image), 0); + ASSERT_EQ(image.initial_sp & 15, 0); + + argc_word = read_u64(image.data); + argv0_addr = read_u64(image.data + 8); + argv1_addr = read_u64(image.data + 16); + env0_addr = read_u64(image.data + 32); + + ASSERT_EQ(argc_word, 2); + ASSERT_STREQ((const char 
*) stack_ptr(&image, argv0_addr), "/bin/test"); + ASSERT_STREQ((const char *) stack_ptr(&image, argv1_addr), "arg1"); + ASSERT_STREQ((const char *) stack_ptr(&image, env0_addr), "A=B"); + + ASSERT_EQ(find_auxv_value(&image, spec.argc, spec.envc, AT_PHDR), 0x400040); + ASSERT_EQ(find_auxv_value(&image, spec.argc, spec.envc, AT_PHENT), 56); + ASSERT_EQ(find_auxv_value(&image, spec.argc, spec.envc, AT_PHNUM), 3); + ASSERT_EQ(find_auxv_value(&image, spec.argc, spec.envc, AT_PAGESZ), 4096); + ASSERT_EQ(find_auxv_value(&image, spec.argc, spec.envc, AT_ENTRY), + 0x401020); + + at_random = find_auxv_value(&image, spec.argc, spec.envc, AT_RANDOM); + at_execfn = find_auxv_value(&image, spec.argc, spec.envc, AT_EXECFN); + ASSERT_TRUE(stack_ptr(&image, at_random) != NULL); + ASSERT_TRUE(stack_ptr(&image, at_execfn) != NULL); + ASSERT_EQ(memcmp(stack_ptr(&image, at_random), random_bytes, + KBOX_LOADER_RANDOM_SIZE), + 0); + ASSERT_STREQ((const char *) stack_ptr(&image, at_execfn), + "/guest/bin/test"); + + kbox_loader_stack_image_reset(&image); +} + +static void test_loader_build_initial_stack_dynamic_interp(void) +{ + static const char *const argv[] = {"/lib/ld-musl-aarch64.so.1", "sh"}; + struct kbox_elf_load_plan main_plan; + struct kbox_elf_load_plan interp_plan; + struct kbox_loader_stack_spec spec; + struct kbox_loader_stack_image image; + + memset(&main_plan, 0, sizeof(main_plan)); + memset(&interp_plan, 0, sizeof(interp_plan)); + memset(&spec, 0, sizeof(spec)); + memset(&image, 0, sizeof(image)); + + main_plan.entry = 0x1230; + main_plan.phdr_vaddr = 0x40; + main_plan.phentsize = 56; + main_plan.phnum = 5; + interp_plan.entry = 0x890; + + spec.argv = argv; + spec.argc = 2; + spec.main_plan = &main_plan; + spec.interp_plan = &interp_plan; + spec.main_load_bias = 0x555500000000ULL; + spec.interp_load_bias = 0x777700000000ULL; + spec.page_size = 4096; + spec.stack_top = 0x7fff00002000ULL; + spec.stack_size = 0x4000; + + ASSERT_EQ(kbox_loader_build_initial_stack(&spec, 
&image), 0); + ASSERT_EQ(find_auxv_value(&image, spec.argc, spec.envc, AT_PHDR), + spec.main_load_bias + 0x40); + ASSERT_EQ(find_auxv_value(&image, spec.argc, spec.envc, AT_ENTRY), + spec.main_load_bias + 0x1230); + ASSERT_EQ(find_auxv_value(&image, spec.argc, spec.envc, AT_BASE), + spec.interp_load_bias); + + kbox_loader_stack_image_reset(&image); +} + +static void test_loader_build_initial_stack_rejects_small_stack(void) +{ + static const char *const argv[] = {"/bin/test"}; + struct kbox_elf_load_plan main_plan; + struct kbox_loader_stack_spec spec; + struct kbox_loader_stack_image image; + + memset(&main_plan, 0, sizeof(main_plan)); + memset(&spec, 0, sizeof(spec)); + memset(&image, 0, sizeof(image)); + + main_plan.entry = 0x1000; + main_plan.phdr_vaddr = 0x40; + main_plan.phentsize = 56; + main_plan.phnum = 2; + + spec.argv = argv; + spec.argc = 1; + spec.main_plan = &main_plan; + spec.page_size = 4096; + spec.stack_top = 0x1000; + spec.stack_size = 32; + + ASSERT_EQ(kbox_loader_build_initial_stack(&spec, &image), -1); +} + +void test_loader_stack_init(void) +{ + TEST_REGISTER(test_loader_build_initial_stack_static); + TEST_REGISTER(test_loader_build_initial_stack_dynamic_interp); + TEST_REGISTER(test_loader_build_initial_stack_rejects_small_stack); +} diff --git a/tests/unit/test-loader-transfer.c b/tests/unit/test-loader-transfer.c new file mode 100644 index 0000000..0bdb08b --- /dev/null +++ b/tests/unit/test-loader-transfer.c @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: MIT */ +#include + +#include "loader-transfer.h" +#include "test-runner.h" + +static void test_loader_prepare_transfer_copies_handoff(void) +{ + struct kbox_loader_handoff handoff; + struct kbox_loader_transfer_state state; + + memset(&handoff, 0, sizeof(handoff)); + handoff.entry.arch = KBOX_LOADER_ENTRY_ARCH_X86_64; + handoff.entry.pc = 0x600000100080ULL; + handoff.entry.sp = 0x70000000fff0ULL; + handoff.entry.regs[0] = 11; + handoff.entry.regs[5] = 66; + handoff.entry_map_start = 
0x600000100000ULL; + handoff.entry_map_end = 0x600000101000ULL; + handoff.stack_map_start = 0x700000000000ULL; + handoff.stack_map_end = 0x700000010000ULL; + + ASSERT_EQ(kbox_loader_prepare_transfer(&handoff, &state), 0); + ASSERT_EQ(state.arch, handoff.entry.arch); + ASSERT_EQ(state.pc, handoff.entry.pc); + ASSERT_EQ(state.sp, handoff.entry.sp); + ASSERT_EQ(state.regs[0], 11); + ASSERT_EQ(state.regs[5], 66); + ASSERT_EQ(state.entry_map_start, handoff.entry_map_start); + ASSERT_EQ(state.entry_map_end, handoff.entry_map_end); + ASSERT_EQ(state.stack_map_start, handoff.stack_map_start); + ASSERT_EQ(state.stack_map_end, handoff.stack_map_end); +} + +static void test_loader_prepare_transfer_rejects_unmapped_pc(void) +{ + struct kbox_loader_handoff handoff; + struct kbox_loader_transfer_state state; + + memset(&handoff, 0, sizeof(handoff)); + handoff.entry.arch = KBOX_LOADER_ENTRY_ARCH_AARCH64; + handoff.entry.pc = 0x5000; + handoff.entry.sp = 0x8ff0; + handoff.entry_map_start = 0x6000; + handoff.entry_map_end = 0x7000; + handoff.stack_map_start = 0x8000; + handoff.stack_map_end = 0x9000; + + ASSERT_EQ(kbox_loader_prepare_transfer(&handoff, &state), -1); +} + +static void test_loader_prepare_transfer_rejects_misaligned_sp(void) +{ + struct kbox_loader_handoff handoff; + struct kbox_loader_transfer_state state; + + memset(&handoff, 0, sizeof(handoff)); + handoff.entry.arch = KBOX_LOADER_ENTRY_ARCH_X86_64; + handoff.entry.pc = 0x6008; + handoff.entry.sp = 0x8ff8; + handoff.entry_map_start = 0x6000; + handoff.entry_map_end = 0x7000; + handoff.stack_map_start = 0x8000; + handoff.stack_map_end = 0x9000; + + ASSERT_EQ(kbox_loader_prepare_transfer(&handoff, &state), -1); +} + +void test_loader_transfer_init(void) +{ + TEST_REGISTER(test_loader_prepare_transfer_copies_handoff); + TEST_REGISTER(test_loader_prepare_transfer_rejects_unmapped_pc); + TEST_REGISTER(test_loader_prepare_transfer_rejects_misaligned_sp); +} diff --git a/tests/unit/test-procmem.c 
b/tests/unit/test-procmem.c new file mode 100644 index 0000000..c4873a0 --- /dev/null +++ b/tests/unit/test-procmem.c @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: MIT */ + +#include +#include +#include + +#include + +#include "procmem.h" +#include "test-runner.h" + +static void test_current_guest_mem_read_write(void) +{ + char buf[16] = "hello"; + char out[16]; + + memset(out, 0, sizeof(out)); + ASSERT_EQ(kbox_current_read((uint64_t) (uintptr_t) buf, out, 6), 0); + ASSERT_STREQ(out, "hello"); + + ASSERT_EQ(kbox_current_write((uint64_t) (uintptr_t) buf, "world", 6), 0); + ASSERT_STREQ(buf, "world"); +} + +static void test_current_guest_mem_read_string(void) +{ + char buf[16]; + const char *src = "abc"; + + memset(buf, 0, sizeof(buf)); + ASSERT_EQ( + kbox_current_read_string((uint64_t) (uintptr_t) src, buf, sizeof(buf)), + 3); + ASSERT_STREQ(buf, "abc"); +} + +static void test_current_guest_mem_ops_wrapper(void) +{ + char value[8] = "xyz"; + char out[8]; + struct kbox_guest_mem guest = { + .ops = &kbox_current_guest_mem_ops, + .opaque = 0, + }; + + memset(out, 0, sizeof(out)); + ASSERT_EQ(kbox_guest_mem_read(&guest, (uint64_t) (uintptr_t) value, out, 4), + 0); + ASSERT_STREQ(out, "xyz"); +} + +static void test_current_guest_mem_rejects_bad_pointer(void) +{ + char out[8]; + + ASSERT_EQ(kbox_current_read(0, out, sizeof(out)), -EFAULT); + ASSERT_EQ(kbox_current_write(0, "x", 1), -EFAULT); + ASSERT_EQ(kbox_current_read_string(0, out, sizeof(out)), -EFAULT); +} + +static void test_current_guest_mem_force_write_cross_page(void) +{ + long page_size = sysconf(_SC_PAGESIZE); + char verify[4]; + char *mapping; + + ASSERT_TRUE(page_size > 0); + mapping = mmap(NULL, (size_t) page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(mapping, MAP_FAILED); + + memcpy(mapping + page_size - 2, "xxxx", 4); + ASSERT_EQ(mprotect(mapping, (size_t) page_size * 2, PROT_READ), 0); + ASSERT_EQ(kbox_current_write_force( + (uint64_t) (uintptr_t) (mapping + 
page_size - 2), "ABCD", 4), + 0); + ASSERT_EQ( + kbox_current_read((uint64_t) (uintptr_t) (mapping + page_size - 2), + verify, sizeof(verify)), + 0); + ASSERT_EQ(memcmp(verify, "ABCD", 4), 0); + ASSERT_EQ(munmap(mapping, (size_t) page_size * 2), 0); +} + +static void test_vm_write_force_rejects_bad_pointer(void) +{ + ASSERT_EQ(kbox_vm_write_force(getpid(), 0, "x", 1), -EFAULT); + ASSERT_EQ(kbox_vm_write_force(getpid(), 1, NULL, 1), -EFAULT); + ASSERT_EQ(kbox_vm_write_force(getpid(), 0, NULL, 0), 0); +} + +void test_procmem_init(void) +{ + TEST_REGISTER(test_current_guest_mem_read_write); + TEST_REGISTER(test_current_guest_mem_read_string); + TEST_REGISTER(test_current_guest_mem_ops_wrapper); + TEST_REGISTER(test_current_guest_mem_rejects_bad_pointer); + TEST_REGISTER(test_current_guest_mem_force_write_cross_page); + TEST_REGISTER(test_vm_write_force_rejects_bad_pointer); +} diff --git a/tests/unit/test-rewrite.c b/tests/unit/test-rewrite.c new file mode 100644 index 0000000..596e16a --- /dev/null +++ b/tests/unit/test-rewrite.c @@ -0,0 +1,1425 @@ +/* SPDX-License-Identifier: MIT */ + +#include +#include +#include +#include + +#include "kbox/elf.h" +#include "rewrite.h" +#include "test-runner.h" + +#define EHDR_SIZE 64 +#define PHDR_SIZE 56 +#define PT_LOAD 1 +#define PF_X 0x1 +#define EM_X86_64 62 +#define EM_AARCH64 183 + +static void put_le16(unsigned char *p, unsigned short v) +{ + p[0] = (unsigned char) (v & 0xff); + p[1] = (unsigned char) ((v >> 8) & 0xff); +} + +static void put_le32(unsigned char *p, unsigned int v) +{ + p[0] = (unsigned char) (v & 0xff); + p[1] = (unsigned char) ((v >> 8) & 0xff); + p[2] = (unsigned char) ((v >> 16) & 0xff); + p[3] = (unsigned char) ((v >> 24) & 0xff); +} + +static void put_le64(unsigned char *p, unsigned long long v) +{ + for (int i = 0; i < 8; i++) + p[i] = (unsigned char) ((v >> (i * 8)) & 0xff); +} + +static void init_elf64(unsigned char *buf, + size_t size, + unsigned short machine, + unsigned short phnum) +{ + 
memset(buf, 0, size); + buf[0] = 0x7f; + buf[1] = 'E'; + buf[2] = 'L'; + buf[3] = 'F'; + buf[4] = 2; + buf[5] = 1; + buf[6] = 1; + put_le16(buf + 16, 2); + put_le16(buf + 18, machine); + put_le32(buf + 20, 1); + put_le64(buf + 32, EHDR_SIZE); + put_le16(buf + 52, EHDR_SIZE); + put_le16(buf + 54, PHDR_SIZE); + put_le16(buf + 56, phnum); +} + +static void set_phdr(unsigned char *buf, + unsigned short index, + unsigned int type, + unsigned int flags, + unsigned long long offset, + unsigned long long vaddr, + unsigned long long filesz, + unsigned long long memsz) +{ + unsigned char *ph = buf + EHDR_SIZE + (size_t) index * PHDR_SIZE; + + put_le32(ph + 0, type); + put_le32(ph + 4, flags); + put_le64(ph + 8, offset); + put_le64(ph + 16, vaddr); + put_le64(ph + 24, vaddr); + put_le64(ph + 32, filesz); + put_le64(ph + 40, memsz); +} + +static void build_x86_64_elf(unsigned char *buf, size_t size) +{ + init_elf64(buf, size, EM_X86_64, 2); + set_phdr(buf, 0, PT_LOAD, PF_X, 176, 0x1000, 8, 8); + set_phdr(buf, 1, PT_LOAD, 0, 184, 0x2000, 4, 4); + + buf[176] = 0x90; + buf[177] = 0x0f; + buf[178] = 0x05; + buf[179] = 0x90; + buf[180] = 0x0f; + buf[181] = 0x34; + buf[182] = 0xc3; + buf[183] = 0x90; + buf[184] = 0xaa; + buf[185] = 0xbb; + buf[186] = 0xcc; + buf[187] = 0xdd; +} + +static void build_x86_64_wrapper_elf_nr(unsigned char *buf, + size_t size, + unsigned int nr) +{ + init_elf64(buf, size, EM_X86_64, 1); + set_phdr(buf, 0, PT_LOAD, PF_X, 120, 0x1000, 8, 8); + + buf[120] = 0xb8; + put_le32(buf + 121, nr); + buf[125] = 0x0f; + buf[126] = 0x05; + buf[127] = 0xc3; +} + +static void build_x86_64_wrapper_elf(unsigned char *buf, size_t size) +{ + build_x86_64_wrapper_elf_nr(buf, size, 39); +} + +static void build_aarch64_elf(unsigned char *buf, size_t size) +{ + init_elf64(buf, size, EM_AARCH64, 1); + set_phdr(buf, 0, PT_LOAD, PF_X, 120, 0x4000, 12, 12); + + buf[120] = 0x1f; + buf[121] = 0x20; + buf[122] = 0x03; + buf[123] = 0xd5; + buf[124] = 0x01; + buf[125] = 0x00; + buf[126] 
= 0x00; + buf[127] = 0xd4; + buf[128] = 0xc0; + buf[129] = 0x03; + buf[130] = 0x5f; + buf[131] = 0xd6; +} + +static void build_aarch64_wrapper_elf_nr(unsigned char *buf, + size_t size, + unsigned int nr) +{ + init_elf64(buf, size, EM_AARCH64, 1); + set_phdr(buf, 0, PT_LOAD, PF_X, 120, 0x4000, 12, 12); + + put_le32(buf + 120, 0xd2800008u | ((nr & 0xffffu) << 5)); + put_le32(buf + 124, 0xd4000001u); + put_le32(buf + 128, 0xd65f03c0u); +} + +static void build_aarch64_cancel_wrapper_elf(unsigned char *buf, size_t size) +{ + init_elf64(buf, size, EM_AARCH64, 1); + set_phdr(buf, 0, PT_LOAD, PF_X, 120, 0x4000, 24, 24); + + put_le32(buf + 120, 0xd2800848u); + put_le32(buf + 124, 0xd4000001u); + put_le32(buf + 128, 0xf9400bf3u); + put_le32(buf + 132, 0xa8c27bfdu); + put_le32(buf + 136, 0xd50323bfu); + put_le32(buf + 140, 0xd65f03c0u); +} + +static void build_aarch64_fstatat_wrapper_elf(unsigned char *buf, size_t size) +{ + init_elf64(buf, size, EM_AARCH64, 1); + set_phdr(buf, 0, PT_LOAD, PF_X, 120, 0x4000, 24, 24); + + put_le32(buf + 120, 0xd28009e8u); /* mov x8, #79 */ + put_le32(buf + 124, 0xd4000001u); /* svc #0 */ + put_le32(buf + 128, 0x3140041fu); /* cmn w0, #1, lsl #12 */ + put_le32(buf + 132, 0x540000a8u); /* b.hi +0x14 */ + put_le32(buf + 136, 0x52800000u); /* mov w0, #0 */ + put_le32(buf + 140, 0xd65f03c0u); /* ret */ +} + +static void build_aarch64_syscall_cancel_open_wrapper_elf(unsigned char *buf, + size_t size) +{ + init_elf64(buf, size, EM_AARCH64, 1); + set_phdr(buf, 0, PT_LOAD, PF_X, 120, 0x4000, 24, 24); + + put_le32(buf + 120, 0xd2800706u); /* mov x6, #56 */ + put_le32(buf + 124, 0xd2800005u); /* mov x5, #0 */ + put_le32(buf + 128, 0xd2800004u); /* mov x4, #0 */ + put_le32(buf + 132, 0x14000002u); /* b +8 */ + put_le32(buf + 136, 0xd503201fu); /* nop */ + put_le32(buf + 140, 0xd503201fu); /* nop */ +} + +static void build_unknown_elf(unsigned char *buf, size_t size) +{ + init_elf64(buf, size, 0x1234, 0); +} + +static int count_segments_cb(const struct 
kbox_elf_exec_segment *seg, + const unsigned char *segment_bytes, + void *opaque) +{ + int *count = opaque; + (void) segment_bytes; + if (seg->file_size == 0) + return -1; + (*count)++; + return 0; +} + +static void test_syscall_mode_parser(void) +{ + enum kbox_syscall_mode mode = KBOX_SYSCALL_MODE_AUTO; + + ASSERT_EQ(kbox_parse_syscall_mode("seccomp", &mode), 0); + ASSERT_EQ(mode, KBOX_SYSCALL_MODE_SECCOMP); + ASSERT_STREQ(kbox_syscall_mode_name(mode), "seccomp"); + + ASSERT_EQ(kbox_parse_syscall_mode("trap", &mode), 0); + ASSERT_EQ(mode, KBOX_SYSCALL_MODE_TRAP); + ASSERT_STREQ(kbox_syscall_mode_name(mode), "trap"); + + ASSERT_EQ(kbox_parse_syscall_mode("rewrite", &mode), 0); + ASSERT_EQ(mode, KBOX_SYSCALL_MODE_REWRITE); + ASSERT_STREQ(kbox_syscall_mode_name(mode), "rewrite"); + + ASSERT_EQ(kbox_parse_syscall_mode("auto", &mode), 0); + ASSERT_EQ(mode, KBOX_SYSCALL_MODE_AUTO); + ASSERT_STREQ(kbox_syscall_mode_name(mode), "auto"); + + ASSERT_EQ(kbox_parse_syscall_mode("bogus", &mode), -1); +} + +static void test_elf_exec_segment_walker(void) +{ + unsigned char elf[192]; + int count = 0; + int rc; + + build_x86_64_elf(elf, sizeof(elf)); + rc = kbox_visit_elf_exec_segments(elf, sizeof(elf), count_segments_cb, + &count); + ASSERT_EQ(rc, 1); + ASSERT_EQ(count, 1); +} + +static void test_rewrite_analyze_x86_64(void) +{ + unsigned char elf[192]; + struct kbox_rewrite_report report; + + build_x86_64_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_analyze_elf(elf, sizeof(elf), &report), 0); + ASSERT_EQ(report.arch, KBOX_REWRITE_ARCH_X86_64); + ASSERT_EQ(report.exec_segment_count, 1); + ASSERT_EQ(report.candidate_count, 2); + ASSERT_STREQ(kbox_rewrite_arch_name(report.arch), "x86_64"); +} + +static void test_rewrite_analyze_x86_64_wrapper(void) +{ + unsigned char elf[160]; + struct kbox_rewrite_report report; + + build_x86_64_wrapper_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_analyze_elf(elf, sizeof(elf), &report), 0); + ASSERT_EQ(report.arch, KBOX_REWRITE_ARCH_X86_64); 
+ ASSERT_EQ(report.exec_segment_count, 1); + ASSERT_EQ(report.candidate_count, 1); +} + +static void test_rewrite_analyze_aarch64(void) +{ + unsigned char elf[160]; + struct kbox_rewrite_report report; + + build_aarch64_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_analyze_elf(elf, sizeof(elf), &report), 0); + ASSERT_EQ(report.arch, KBOX_REWRITE_ARCH_AARCH64); + ASSERT_EQ(report.exec_segment_count, 1); + ASSERT_EQ(report.candidate_count, 1); + ASSERT_STREQ(kbox_rewrite_arch_name(report.arch), "aarch64"); +} + +static void test_rewrite_rejects_unknown_machine(void) +{ + unsigned char elf[64]; + struct kbox_rewrite_report report; + + build_unknown_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_analyze_elf(elf, sizeof(elf), &report), -1); +} + +struct site_list { + struct kbox_rewrite_site sites[4]; + int count; +}; + +struct planned_site_list { + struct kbox_rewrite_planned_site sites[4]; + int count; +}; + +static int collect_sites_cb(const struct kbox_rewrite_site *site, void *opaque) +{ + struct site_list *list = opaque; + + if (list->count >= 4) + return -1; + list->sites[list->count++] = *site; + return 0; +} + +static int collect_planned_sites_cb( + const struct kbox_rewrite_planned_site *planned, + void *opaque) +{ + struct planned_site_list *list = opaque; + + if (list->count >= 4) + return -1; + list->sites[list->count++] = *planned; + return 0; +} + +static void test_elf_exec_rejects_huge_phoff(void) +{ + /* Craft an ELF with e_phoff near UINT64_MAX so that phoff + i*phentsize + * would wrap around. The segment walker must reject this with -1 rather + * than reading out of bounds. 
+ */ + unsigned char elf[192]; + int count = 0; + + build_x86_64_elf(elf, sizeof(elf)); + /* Overwrite e_phoff (offset 32, 8 bytes) with 0xFFFFFFFFFFFFFF00 */ + put_le64(elf + 32, 0xFFFFFFFFFFFFFF00ULL); + ASSERT_EQ(kbox_visit_elf_exec_segments(elf, sizeof(elf), count_segments_cb, + &count), + -1); + ASSERT_EQ(count, 0); +} + +static void test_elf_interp_rejects_huge_phoff(void) +{ + /* Same overflow scenario but for the PT_INTERP lookup path. + * kbox_find_elf_interp_loc must return -1 (malformed ELF), not 0 + * ("no interp" / static binary), so the caller does not silently + * treat a corrupted dynamic ELF as static. + */ + unsigned char elf[192]; + char interp[64]; + + build_x86_64_elf(elf, sizeof(elf)); + put_le64(elf + 32, 0xFFFFFFFFFFFFFF00ULL); + ASSERT_EQ(kbox_find_elf_interp_loc(elf, sizeof(elf), interp, sizeof(interp), + NULL, NULL), + -1); +} + +static void test_rewrite_visit_x86_64_sites(void) +{ + unsigned char elf[192]; + struct site_list list; + struct kbox_rewrite_report report; + + memset(&list, 0, sizeof(list)); + build_x86_64_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_visit_elf_sites(elf, sizeof(elf), collect_sites_cb, + &list, &report), + 0); + ASSERT_EQ(list.count, 2); + ASSERT_EQ(list.sites[0].file_offset, 177); + ASSERT_EQ(list.sites[0].vaddr, 0x1001); + ASSERT_EQ(list.sites[0].segment_vaddr, 0x1000); + ASSERT_EQ(list.sites[0].segment_mem_size, 8); + ASSERT_EQ(list.sites[0].width, 2); + ASSERT_EQ(list.sites[0].original[0], 0x0f); + ASSERT_EQ(list.sites[0].original[1], 0x05); + ASSERT_EQ(list.sites[1].file_offset, 180); + ASSERT_EQ(list.sites[1].vaddr, 0x1004); + ASSERT_EQ(list.sites[1].original[1], 0x34); +} + +static void test_rewrite_visit_x86_64_wrapper_site(void) +{ + unsigned char elf[160]; + struct site_list list; + struct kbox_rewrite_report report; + + memset(&list, 0, sizeof(list)); + build_x86_64_wrapper_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_visit_elf_sites(elf, sizeof(elf), collect_sites_cb, + &list, &report), + 0); 
+ ASSERT_EQ(list.count, 1); + ASSERT_EQ(list.sites[0].file_offset, 120); + ASSERT_EQ(list.sites[0].vaddr, 0x1000); + ASSERT_EQ(list.sites[0].segment_vaddr, 0x1000); + ASSERT_EQ(list.sites[0].segment_mem_size, 8); + ASSERT_EQ(list.sites[0].width, 8); + ASSERT_EQ(list.sites[0].original[0], 0xb8); + ASSERT_EQ(list.sites[0].original[5], 0x0f); + ASSERT_EQ(list.sites[0].original[6], 0x05); + ASSERT_EQ(list.sites[0].original[7], 0xc3); +} + +static void test_rewrite_visit_aarch64_sites(void) +{ + unsigned char elf[160]; + struct site_list list; + struct kbox_rewrite_report report; + + memset(&list, 0, sizeof(list)); + build_aarch64_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_visit_elf_sites(elf, sizeof(elf), collect_sites_cb, + &list, &report), + 0); + ASSERT_EQ(list.count, 1); + ASSERT_EQ(list.sites[0].file_offset, 124); + ASSERT_EQ(list.sites[0].vaddr, 0x4004); + ASSERT_EQ(list.sites[0].segment_vaddr, 0x4000); + ASSERT_EQ(list.sites[0].segment_mem_size, 12); + ASSERT_EQ(list.sites[0].width, 4); + ASSERT_EQ(list.sites[0].original[0], 0x01); + ASSERT_EQ(list.sites[0].original[3], 0xd4); +} + +static void test_rewrite_visit_aarch64_cancel_wrapper_site(void) +{ + unsigned char elf[192]; + struct site_list list; + struct kbox_rewrite_report report; + + memset(&list, 0, sizeof(list)); + build_aarch64_cancel_wrapper_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_visit_elf_sites(elf, sizeof(elf), collect_sites_cb, + &list, &report), + 0); + ASSERT_EQ(list.count, 1); + ASSERT_EQ(list.sites[0].file_offset, 124); + ASSERT_EQ(list.sites[0].vaddr, 0x4004); + ASSERT_EQ(list.sites[0].site_class, KBOX_REWRITE_SITE_WRAPPER); +} + +static void test_rewrite_visit_aarch64_fstatat_wrapper_site(void) +{ + unsigned char elf[192]; + struct site_list list; + struct kbox_rewrite_report report; + + memset(&list, 0, sizeof(list)); + build_aarch64_fstatat_wrapper_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_visit_elf_sites(elf, sizeof(elf), collect_sites_cb, + &list, &report), + 0); + 
ASSERT_EQ(list.count, 1); + ASSERT_EQ(list.sites[0].file_offset, 124); + ASSERT_EQ(list.sites[0].vaddr, 0x4004); + ASSERT_EQ(list.sites[0].site_class, KBOX_REWRITE_SITE_WRAPPER); +} + +static void test_rewrite_plan_x86_64_sites(void) +{ + unsigned char elf[192]; + struct planned_site_list list; + struct kbox_rewrite_report report; + + memset(&list, 0, sizeof(list)); + build_x86_64_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_visit_elf_planned_sites( + elf, sizeof(elf), collect_planned_sites_cb, &list, &report), + 0); + ASSERT_EQ(list.count, 2); + ASSERT_EQ(list.sites[0].trampoline_addr, 0x1010); + ASSERT_EQ(list.sites[0].patch.width, 2); + ASSERT_EQ(list.sites[0].patch.bytes[0], 0xff); + ASSERT_EQ(list.sites[0].patch.bytes[1], 0xd0); + ASSERT_EQ(list.sites[1].trampoline_addr, 0x1030); + ASSERT_EQ(list.sites[1].patch.width, 2); + ASSERT_EQ(list.sites[1].patch.bytes[0], 0xff); + ASSERT_EQ(list.sites[1].patch.bytes[1], 0xd0); +} + +static void test_rewrite_plan_aarch64_sites(void) +{ + unsigned char elf[160]; + struct planned_site_list list; + struct kbox_rewrite_report report; + + memset(&list, 0, sizeof(list)); + build_aarch64_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_visit_elf_planned_sites( + elf, sizeof(elf), collect_planned_sites_cb, &list, &report), + 0); + ASSERT_EQ(list.count, 1); + ASSERT_EQ(list.sites[0].trampoline_addr, 0x4010); + ASSERT_EQ(list.sites[0].patch.width, 4); + ASSERT_EQ(list.sites[0].patch.bytes[0], 0x03); + ASSERT_EQ(list.sites[0].patch.bytes[1], 0x00); + ASSERT_EQ(list.sites[0].patch.bytes[2], 0x00); + ASSERT_EQ(list.sites[0].patch.bytes[3], 0x14); +} + +static void test_rewrite_plan_aarch64_segment_out_of_range(void) +{ + unsigned char elf[128]; + struct planned_site_list list; + struct kbox_rewrite_report report; + + memset(&list, 0, sizeof(list)); + init_elf64(elf, sizeof(elf), EM_AARCH64, 1); + set_phdr(elf, 0, PT_LOAD, PF_X, 120, 0x4000, 4, + (unsigned long long) ((128u * 1024u * 1024u) + 4096u)); + elf[120] = 0x01; + elf[121] 
= 0x00; + elf[122] = 0x00; + elf[123] = 0xd4; + ASSERT_EQ(kbox_rewrite_visit_elf_planned_sites( + elf, sizeof(elf), collect_planned_sites_cb, &list, &report), + 0); + ASSERT_EQ(list.count, 1); + ASSERT_EQ(list.sites[0].patch.width, 0); +} + +static void test_rewrite_encode_x86_64_patch(void) +{ + struct kbox_rewrite_site site; + struct kbox_rewrite_patch patch; + + memset(&site, 0, sizeof(site)); + site.vaddr = 0x1000; + site.width = 2; + site.original[0] = 0x0f; + site.original[1] = 0x05; + ASSERT_EQ(kbox_rewrite_encode_patch(&site, 0, &patch), 0); + ASSERT_EQ(patch.width, 2); + ASSERT_EQ(patch.bytes[0], 0xff); + ASSERT_EQ(patch.bytes[1], 0xd0); +} + +static void test_rewrite_encode_x86_64_wrapper_patch(void) +{ + struct kbox_rewrite_site site; + struct kbox_rewrite_patch patch; + + memset(&site, 0, sizeof(site)); + site.vaddr = 0x1000; + site.segment_mem_size = 8; + site.width = 8; + site.original[0] = 0xb8; + site.original[1] = 0x27; + site.original[2] = 0x00; + site.original[3] = 0x00; + site.original[4] = 0x00; + site.original[5] = 0x0f; + site.original[6] = 0x05; + site.original[7] = 0xc3; + ASSERT_EQ(kbox_rewrite_encode_patch(&site, 0x1100, &patch), 0); + ASSERT_EQ(patch.width, 8); + ASSERT_EQ(patch.bytes[0], 0xe9); + ASSERT_EQ(patch.bytes[1], 0xfb); + ASSERT_EQ(patch.bytes[2], 0x00); + ASSERT_EQ(patch.bytes[3], 0x00); + ASSERT_EQ(patch.bytes[4], 0x00); + ASSERT_EQ(patch.bytes[5], 0x90); + ASSERT_EQ(patch.bytes[6], 0x90); + ASSERT_EQ(patch.bytes[7], 0x90); +} + +static void test_rewrite_encode_x86_64_page_zero_trampoline(void) +{ + unsigned char page[256]; + + ASSERT_EQ(kbox_rewrite_encode_x86_64_page_zero_trampoline( + page, sizeof(page), 0x1122334455667788ULL), + 0); + ASSERT_EQ(page[0], 0x90); + ASSERT_EQ(page[sizeof(page) - 13], 0x49); + ASSERT_EQ(page[sizeof(page) - 12], 0xbb); + ASSERT_EQ(page[sizeof(page) - 11], 0x88); + ASSERT_EQ(page[sizeof(page) - 10], 0x77); + ASSERT_EQ(page[sizeof(page) - 9], 0x66); + ASSERT_EQ(page[sizeof(page) - 8], 0x55); + 
ASSERT_EQ(page[sizeof(page) - 7], 0x44); + ASSERT_EQ(page[sizeof(page) - 6], 0x33); + ASSERT_EQ(page[sizeof(page) - 5], 0x22); + ASSERT_EQ(page[sizeof(page) - 4], 0x11); + ASSERT_EQ(page[sizeof(page) - 3], 0x41); + ASSERT_EQ(page[sizeof(page) - 2], 0xff); + ASSERT_EQ(page[sizeof(page) - 1], 0xe3); +} + +static void test_rewrite_encode_aarch64_branch_patch(void) +{ + struct kbox_rewrite_site site; + struct kbox_rewrite_patch patch; + + memset(&site, 0, sizeof(site)); + site.vaddr = 0x4000; + site.width = 4; + site.original[0] = 0x01; + site.original[1] = 0x00; + site.original[2] = 0x00; + site.original[3] = 0xd4; + ASSERT_EQ(kbox_rewrite_encode_patch(&site, 0x4010, &patch), 0); + ASSERT_EQ(patch.width, 4); + ASSERT_EQ(patch.bytes[0], 0x04); + ASSERT_EQ(patch.bytes[1], 0x00); + ASSERT_EQ(patch.bytes[2], 0x00); + ASSERT_EQ(patch.bytes[3], 0x14); +} + +static void test_rewrite_encode_aarch64_branch_out_of_range(void) +{ + struct kbox_rewrite_site site; + struct kbox_rewrite_patch patch; + + memset(&site, 0, sizeof(site)); + site.vaddr = 0x4000; + site.width = 4; + site.original[0] = 0x01; + site.original[1] = 0x00; + site.original[2] = 0x00; + site.original[3] = 0xd4; + ASSERT_EQ(kbox_rewrite_encode_patch(&site, 0x10000000, &patch), -1); +} + +static void test_rewrite_apply_x86_64_elf(void) +{ + unsigned char elf[192]; + struct kbox_rewrite_report report; + size_t applied = 0; + + build_x86_64_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_apply_elf(elf, sizeof(elf), &applied, &report), 0); + ASSERT_EQ(report.arch, KBOX_REWRITE_ARCH_X86_64); + ASSERT_EQ(applied, 2); + ASSERT_EQ(elf[177], 0xff); + ASSERT_EQ(elf[178], 0xd0); + ASSERT_EQ(elf[180], 0xff); + ASSERT_EQ(elf[181], 0xd0); +} + +static void test_rewrite_apply_x86_64_wrapper_elf(void) +{ + unsigned char elf[160]; + struct kbox_rewrite_report report; + size_t applied = 0; + + build_x86_64_wrapper_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_apply_elf(elf, sizeof(elf), &applied, &report), 0); + 
ASSERT_EQ(report.arch, KBOX_REWRITE_ARCH_X86_64); + ASSERT_EQ(applied, 1); + ASSERT_EQ(elf[120], 0xe9); + ASSERT_EQ(elf[121], 0x0b); + ASSERT_EQ(elf[122], 0x00); + ASSERT_EQ(elf[123], 0x00); + ASSERT_EQ(elf[124], 0x00); + ASSERT_EQ(elf[125], 0x90); + ASSERT_EQ(elf[126], 0x90); + ASSERT_EQ(elf[127], 0x90); +} + +static void test_rewrite_apply_aarch64_elf(void) +{ + unsigned char elf[160]; + struct kbox_rewrite_report report; + size_t applied = 0; + + build_aarch64_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_apply_elf(elf, sizeof(elf), &applied, &report), 0); + ASSERT_EQ(report.arch, KBOX_REWRITE_ARCH_AARCH64); + ASSERT_EQ(applied, 1); + ASSERT_EQ(elf[124], 0x03); + ASSERT_EQ(elf[125], 0x00); + ASSERT_EQ(elf[126], 0x00); + ASSERT_EQ(elf[127], 0x14); +} + +static void test_rewrite_apply_memfd_x86_64(void) +{ + unsigned char elf[192]; + struct kbox_rewrite_report report; + size_t applied = 0; + char path[128]; + unsigned char patched[8]; + int fd; + + build_x86_64_elf(elf, sizeof(elf)); + fd = test_mkstemp(path, sizeof(path), "kbox-rewrite-unit"); + ASSERT_TRUE(fd >= 0); + unlink(path); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (long) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_apply_memfd(fd, &applied, &report), 0); + ASSERT_EQ(report.arch, KBOX_REWRITE_ARCH_X86_64); + ASSERT_EQ(applied, 2); + ASSERT_EQ(pread(fd, patched, sizeof(patched), 176), (long) sizeof(patched)); + ASSERT_EQ(patched[1], 0xff); + ASSERT_EQ(patched[2], 0xd0); + ASSERT_EQ(patched[4], 0xff); + ASSERT_EQ(patched[5], 0xd0); + close(fd); +} + +static void test_rewrite_apply_virtual_procinfo_x86_64_wrapper_elf(void) +{ + unsigned char elf[160]; + struct kbox_rewrite_report report; + size_t applied = 0; + + build_x86_64_wrapper_elf_nr(elf, sizeof(elf), 39); + ASSERT_EQ(kbox_rewrite_apply_virtual_procinfo_elf(elf, sizeof(elf), + &applied, &report), + 0); + ASSERT_EQ(report.arch, KBOX_REWRITE_ARCH_X86_64); + ASSERT_EQ(applied, 1); + ASSERT_EQ(elf[120], 0xb8); + ASSERT_EQ(elf[121], 0x01); + 
ASSERT_EQ(elf[122], 0x00); + ASSERT_EQ(elf[123], 0x00); + ASSERT_EQ(elf[124], 0x00); + ASSERT_EQ(elf[125], 0xc3); + ASSERT_EQ(elf[126], 0x90); + ASSERT_EQ(elf[127], 0x90); +} + +static void test_rewrite_apply_virtual_procinfo_skips_non_procinfo_wrapper(void) +{ + unsigned char elf[160]; + struct kbox_rewrite_report report; + size_t applied = 0; + + build_x86_64_wrapper_elf_nr(elf, sizeof(elf), 96); + ASSERT_EQ(kbox_rewrite_apply_virtual_procinfo_elf(elf, sizeof(elf), + &applied, &report), + 0); + ASSERT_EQ(report.arch, KBOX_REWRITE_ARCH_X86_64); + ASSERT_EQ(applied, 0); + ASSERT_EQ(elf[120], 0xb8); + ASSERT_EQ(elf[121], 0x60); + ASSERT_EQ(elf[122], 0x00); + ASSERT_EQ(elf[123], 0x00); + ASSERT_EQ(elf[124], 0x00); + ASSERT_EQ(elf[125], 0x0f); + ASSERT_EQ(elf[126], 0x05); + ASSERT_EQ(elf[127], 0xc3); +} + +static void test_rewrite_apply_virtual_procinfo_aarch64_wrapper_elf(void) +{ + unsigned char elf[160]; + struct kbox_rewrite_report report; + size_t applied = 0; + + build_aarch64_wrapper_elf_nr(elf, sizeof(elf), 172); + ASSERT_EQ(kbox_rewrite_apply_virtual_procinfo_elf(elf, sizeof(elf), + &applied, &report), + 0); + ASSERT_EQ(report.arch, KBOX_REWRITE_ARCH_AARCH64); + ASSERT_EQ(applied, 1); + ASSERT_EQ(elf[124], 0x20); + ASSERT_EQ(elf[125], 0x00); + ASSERT_EQ(elf[126], 0x80); + ASSERT_EQ(elf[127], 0xd2); + ASSERT_EQ(elf[128], 0xc0); + ASSERT_EQ(elf[129], 0x03); + ASSERT_EQ(elf[130], 0x5f); + ASSERT_EQ(elf[131], 0xd6); +} + +static void test_rewrite_apply_virtual_procinfo_skips_non_procinfo_aarch64(void) +{ + unsigned char elf[160]; + struct kbox_rewrite_report report; + size_t applied = 0; + + build_aarch64_wrapper_elf_nr(elf, sizeof(elf), 174); + ASSERT_EQ(kbox_rewrite_apply_virtual_procinfo_elf(elf, sizeof(elf), + &applied, &report), + 0); + ASSERT_EQ(report.arch, KBOX_REWRITE_ARCH_AARCH64); + ASSERT_EQ(applied, 0); + ASSERT_EQ(elf[124], 0x01); + ASSERT_EQ(elf[125], 0x00); + ASSERT_EQ(elf[126], 0x00); + ASSERT_EQ(elf[127], 0xd4); +} + +static void 
test_rewrite_wrapper_syscall_nr_x86_64(void) +{ + struct kbox_rewrite_site site; + uint64_t nr = 0; + + memset(&site, 0, sizeof(site)); + site.width = 8; + site.site_class = KBOX_REWRITE_SITE_WRAPPER; + site.original[0] = 0xb8; + site.original[1] = 0x01; + site.original[2] = 0x00; + site.original[3] = 0x00; + site.original[4] = 0x00; + site.original[5] = 0x0f; + site.original[6] = 0x05; + site.original[7] = 0xc3; + + ASSERT_EQ( + kbox_rewrite_wrapper_syscall_nr(&site, KBOX_REWRITE_ARCH_X86_64, &nr), + 0); + ASSERT_EQ(nr, 1); +} + +static void test_rewrite_wrapper_syscall_nr_aarch64(void) +{ + struct kbox_rewrite_site site; + uint64_t nr = 0; + + memset(&site, 0, sizeof(site)); + site.width = 4; + site.site_class = KBOX_REWRITE_SITE_WRAPPER; + put_le32(site.original, 0xd2800848u); + + ASSERT_EQ( + kbox_rewrite_wrapper_syscall_nr(&site, KBOX_REWRITE_ARCH_AARCH64, &nr), + 0); + ASSERT_EQ(nr, 66); +} + +static void test_rewrite_origin_map_x86_64(void) +{ + unsigned char elf[192]; + struct kbox_rewrite_origin_map map; + struct kbox_rewrite_origin_entry entry; + struct kbox_rewrite_report report; + + build_x86_64_elf(elf, sizeof(elf)); + kbox_rewrite_origin_map_init(&map, KBOX_REWRITE_ARCH_X86_64); + ASSERT_EQ( + kbox_rewrite_origin_map_build_elf(&map, elf, sizeof(elf), &report), 0); + ASSERT_EQ(report.arch, KBOX_REWRITE_ARCH_X86_64); + ASSERT_EQ(map.count, 2); + ASSERT_EQ(map.entries[0].origin, 0x1003); + ASSERT_EQ(map.entries[0].source, KBOX_LOADER_MAPPING_MAIN); + ASSERT_EQ(map.entries[1].origin, 0x1006); + ASSERT_EQ(map.entries[1].source, KBOX_LOADER_MAPPING_MAIN); + ASSERT_EQ(kbox_rewrite_origin_map_contains(&map, 0x1003), 1); + ASSERT_EQ(kbox_rewrite_origin_map_contains(&map, 0x1006), 1); + ASSERT_EQ(kbox_rewrite_origin_map_contains(&map, 0x1001), 0); + ASSERT_EQ(kbox_rewrite_origin_map_find(&map, 0x1006, &entry), 1); + ASSERT_EQ(entry.origin, 0x1006); + ASSERT_EQ(entry.source, KBOX_LOADER_MAPPING_MAIN); + kbox_rewrite_origin_map_reset(&map); +} + +static void 
test_rewrite_origin_map_aarch64(void) +{ + unsigned char elf[160]; + struct kbox_rewrite_origin_map map; + struct kbox_rewrite_origin_entry entry; + struct kbox_rewrite_report report; + + build_aarch64_elf(elf, sizeof(elf)); + kbox_rewrite_origin_map_init(&map, KBOX_REWRITE_ARCH_AARCH64); + ASSERT_EQ( + kbox_rewrite_origin_map_build_elf(&map, elf, sizeof(elf), &report), 0); + ASSERT_EQ(report.arch, KBOX_REWRITE_ARCH_AARCH64); + ASSERT_EQ(map.count, 1); + ASSERT_EQ(map.entries[0].origin, 0x4004); + ASSERT_EQ(map.entries[0].source, KBOX_LOADER_MAPPING_MAIN); + ASSERT_EQ(kbox_rewrite_origin_map_contains(&map, 0x4004), 1); + ASSERT_EQ(kbox_rewrite_origin_map_contains(&map, 0x4008), 0); + ASSERT_EQ(kbox_rewrite_origin_map_find(&map, 0x4004, &entry), 1); + ASSERT_EQ(entry.origin, 0x4004); + ASSERT_EQ(entry.source, KBOX_LOADER_MAPPING_MAIN); + kbox_rewrite_origin_map_reset(&map); +} + +static void test_rewrite_origin_map_add_site_source(void) +{ + struct kbox_rewrite_origin_map map; + struct kbox_rewrite_site site; + struct kbox_rewrite_origin_entry entry; + + memset(&site, 0, sizeof(site)); + site.vaddr = 0x2000; + site.width = 2; + site.original[0] = 0x0f; + site.original[1] = 0x05; + + kbox_rewrite_origin_map_init(&map, KBOX_REWRITE_ARCH_X86_64); + ASSERT_EQ(kbox_rewrite_origin_map_add_site_source( + &map, &site, KBOX_LOADER_MAPPING_INTERP), + 0); + ASSERT_EQ(map.count, 1); + ASSERT_EQ(map.entries[0].origin, 0x2002); + ASSERT_EQ(map.entries[0].source, KBOX_LOADER_MAPPING_INTERP); + ASSERT_EQ(kbox_rewrite_origin_map_find(&map, 0x2002, &entry), 1); + ASSERT_EQ(entry.origin, 0x2002); + ASSERT_EQ(entry.source, KBOX_LOADER_MAPPING_INTERP); + ASSERT_EQ(kbox_rewrite_origin_map_find(&map, 0x2001, &entry), 0); + kbox_rewrite_origin_map_reset(&map); +} + +static void test_rewrite_probe_x86_64_page_zero_allowed(void) +{ + struct kbox_rewrite_trampoline_probe probe; + + ASSERT_EQ(kbox_rewrite_probe_x86_64_page_zero(0, &probe), 0); + ASSERT_EQ(probe.arch, KBOX_REWRITE_ARCH_X86_64); 
+ ASSERT_TRUE(probe.feasible); + ASSERT_STREQ(probe.reason, "page-zero trampoline available"); +} + +static void test_rewrite_probe_x86_64_page_zero_blocked(void) +{ + struct kbox_rewrite_trampoline_probe probe; + + ASSERT_EQ(kbox_rewrite_probe_x86_64_page_zero(65536, &probe), 0); + ASSERT_EQ(probe.arch, KBOX_REWRITE_ARCH_X86_64); + ASSERT_TRUE(!probe.feasible); + ASSERT_STREQ(probe.reason, "vm.mmap_min_addr must be 0 for x86_64 rewrite"); +} + +static void test_rewrite_fast_host_syscall0_classification(void) +{ + struct kbox_host_nrs host_nrs; + + memset(&host_nrs, 0xff, sizeof(host_nrs)); + host_nrs.getpid = 172; + host_nrs.getppid = 173; + host_nrs.gettid = 178; + + ASSERT_EQ(kbox_rewrite_is_fast_host_syscall0(&host_nrs, 172), 1); + ASSERT_EQ(kbox_rewrite_is_fast_host_syscall0(&host_nrs, 173), 1); + ASSERT_EQ(kbox_rewrite_is_fast_host_syscall0(&host_nrs, 178), 1); + ASSERT_EQ(kbox_rewrite_is_fast_host_syscall0(&host_nrs, 999), 0); +} + +static void test_rewrite_has_wrapper_syscalls_x86_64(void) +{ + unsigned char elf[160]; + uint64_t allow[] = {1, 257}; + + build_x86_64_wrapper_elf_nr(elf, sizeof(elf), 257); + ASSERT_EQ(kbox_rewrite_has_wrapper_syscalls( + elf, sizeof(elf), KBOX_REWRITE_ARCH_X86_64, allow, 2), + 1); + + allow[0] = 39; + allow[1] = 40; + ASSERT_EQ(kbox_rewrite_has_wrapper_syscalls( + elf, sizeof(elf), KBOX_REWRITE_ARCH_X86_64, allow, 2), + 0); +} + +static void test_rewrite_has_wrapper_syscalls_aarch64(void) +{ + unsigned char elf[192]; + uint64_t allow[] = {56, 79}; + + build_aarch64_fstatat_wrapper_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_has_wrapper_syscalls( + elf, sizeof(elf), KBOX_REWRITE_ARCH_AARCH64, allow, 2), + 1); + + allow[0] = 172; + allow[1] = 173; + ASSERT_EQ(kbox_rewrite_has_wrapper_syscalls( + elf, sizeof(elf), KBOX_REWRITE_ARCH_AARCH64, allow, 2), + 0); +} + +static void test_rewrite_has_syscall_cancel_wrapper_syscalls_aarch64(void) +{ + unsigned char elf[192]; + uint64_t allow[] = {56, 79}; + + 
build_aarch64_syscall_cancel_open_wrapper_elf(elf, sizeof(elf)); + ASSERT_EQ(kbox_rewrite_has_wrapper_syscalls( + elf, sizeof(elf), KBOX_REWRITE_ARCH_AARCH64, allow, 2), + 1); + + allow[0] = 63; + allow[1] = 80; + ASSERT_EQ(kbox_rewrite_has_wrapper_syscalls( + elf, sizeof(elf), KBOX_REWRITE_ARCH_AARCH64, allow, 2), + 0); +} + +static void test_rewrite_wrapper_family_mask_memfd_x86_64(void) +{ + unsigned char elf[160]; + struct kbox_host_nrs host_nrs; + uint32_t mask = 0; + int fd; + + memset(&host_nrs, 0xff, sizeof(host_nrs)); + host_nrs.getpid = 39; + host_nrs.getppid = 110; + host_nrs.gettid = 186; + host_nrs.newfstatat = 262; + host_nrs.fstat = 5; + host_nrs.stat = 4; + host_nrs.lstat = 6; + host_nrs.openat = 257; + host_nrs.openat2 = 437; + host_nrs.open = 2; + + build_x86_64_wrapper_elf_nr(elf, sizeof(elf), 257); + fd = memfd_create("rewrite-mask-x86", 0); + ASSERT_TRUE(fd >= 0); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (ssize_t) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_wrapper_family_mask_memfd(fd, &host_nrs, &mask), 0); + ASSERT_EQ(mask, (uint32_t) KBOX_REWRITE_WRAPPER_FAMILY_OPEN); + close(fd); +} + +static void test_rewrite_wrapper_family_mask_memfd_aarch64(void) +{ + unsigned char elf[192]; + struct kbox_host_nrs host_nrs; + uint32_t mask = 0; + int fd; + + memset(&host_nrs, 0xff, sizeof(host_nrs)); + host_nrs.getpid = 172; + host_nrs.getppid = 173; + host_nrs.gettid = 178; + host_nrs.newfstatat = 79; + host_nrs.fstat = 80; + host_nrs.stat = -1; + host_nrs.lstat = -1; + host_nrs.openat = 56; + host_nrs.openat2 = -1; + host_nrs.open = -1; + + build_aarch64_fstatat_wrapper_elf(elf, sizeof(elf)); + fd = memfd_create("rewrite-mask-aarch64-stat", 0); + ASSERT_TRUE(fd >= 0); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (ssize_t) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_wrapper_family_mask_memfd(fd, &host_nrs, &mask), 0); + ASSERT_EQ(mask, (uint32_t) KBOX_REWRITE_WRAPPER_FAMILY_STAT); + close(fd); + + build_aarch64_syscall_cancel_open_wrapper_elf(elf, sizeof(elf)); + fd 
= memfd_create("rewrite-mask-aarch64-open", 0); + ASSERT_TRUE(fd >= 0); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (ssize_t) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_wrapper_family_mask_memfd(fd, &host_nrs, &mask), 0); + ASSERT_EQ(mask, (uint32_t) KBOX_REWRITE_WRAPPER_FAMILY_OPEN); + close(fd); +} + +struct wrapper_candidate_collect { + struct kbox_rewrite_wrapper_candidate candidates[8]; + size_t count; +}; + +static int collect_wrapper_candidate_cb( + const struct kbox_rewrite_wrapper_candidate *candidate, + void *opaque) +{ + struct wrapper_candidate_collect *collect = opaque; + + if (!candidate || !collect) + return -1; + if (collect->count >= + (sizeof(collect->candidates) / sizeof(collect->candidates[0]))) { + return -1; + } + collect->candidates[collect->count++] = *candidate; + return 0; +} + +static void test_rewrite_visit_memfd_wrapper_candidates_x86_64(void) +{ + unsigned char elf[160]; + struct kbox_host_nrs host_nrs; + struct wrapper_candidate_collect collect; + int fd; + + memset(&host_nrs, 0xff, sizeof(host_nrs)); + host_nrs.getpid = 39; + host_nrs.getppid = 110; + host_nrs.gettid = 186; + host_nrs.newfstatat = 262; + host_nrs.fstat = 5; + host_nrs.stat = 4; + host_nrs.lstat = 6; + host_nrs.openat = 257; + host_nrs.openat2 = 437; + host_nrs.open = 2; + + memset(&collect, 0, sizeof(collect)); + build_x86_64_wrapper_elf_nr(elf, sizeof(elf), 257); + fd = memfd_create("rewrite-candidates-x86", 0); + ASSERT_TRUE(fd >= 0); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (ssize_t) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_visit_memfd_wrapper_candidates( + fd, &host_nrs, KBOX_REWRITE_WRAPPER_FAMILY_OPEN, + collect_wrapper_candidate_cb, &collect), + 0); + ASSERT_EQ(collect.count, (size_t) 1); + ASSERT_EQ(collect.candidates[0].arch, KBOX_REWRITE_ARCH_X86_64); + ASSERT_EQ(collect.candidates[0].kind, + KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT); + ASSERT_EQ(collect.candidates[0].file_offset, (uint64_t) 120); + ASSERT_EQ(collect.candidates[0].vaddr, (uint64_t) 0x1000); + 
ASSERT_EQ(collect.candidates[0].nr, (uint64_t) 257); + ASSERT_EQ(collect.candidates[0].family_mask, + (uint32_t) KBOX_REWRITE_WRAPPER_FAMILY_OPEN); + close(fd); +} + +static void test_rewrite_visit_memfd_wrapper_candidates_aarch64(void) +{ + unsigned char elf[192]; + struct kbox_host_nrs host_nrs; + struct wrapper_candidate_collect collect; + int fd; + + memset(&host_nrs, 0xff, sizeof(host_nrs)); + host_nrs.getpid = 172; + host_nrs.getppid = 173; + host_nrs.gettid = 178; + host_nrs.newfstatat = 79; + host_nrs.fstat = 80; + host_nrs.stat = -1; + host_nrs.lstat = -1; + host_nrs.openat = 56; + host_nrs.openat2 = -1; + host_nrs.open = -1; + + memset(&collect, 0, sizeof(collect)); + build_aarch64_fstatat_wrapper_elf(elf, sizeof(elf)); + fd = memfd_create("rewrite-candidates-aarch64-stat", 0); + ASSERT_TRUE(fd >= 0); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (ssize_t) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_visit_memfd_wrapper_candidates( + fd, &host_nrs, KBOX_REWRITE_WRAPPER_FAMILY_STAT, + collect_wrapper_candidate_cb, &collect), + 0); + ASSERT_EQ(collect.count, (size_t) 1); + ASSERT_EQ(collect.candidates[0].arch, KBOX_REWRITE_ARCH_AARCH64); + ASSERT_EQ(collect.candidates[0].kind, + KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT); + ASSERT_EQ(collect.candidates[0].file_offset, (uint64_t) 124); + ASSERT_EQ(collect.candidates[0].vaddr, (uint64_t) 0x4004); + ASSERT_EQ(collect.candidates[0].nr, (uint64_t) 79); + ASSERT_EQ(collect.candidates[0].family_mask, + (uint32_t) KBOX_REWRITE_WRAPPER_FAMILY_STAT); + close(fd); + + memset(&collect, 0, sizeof(collect)); + build_aarch64_syscall_cancel_open_wrapper_elf(elf, sizeof(elf)); + fd = memfd_create("rewrite-candidates-aarch64-open", 0); + ASSERT_TRUE(fd >= 0); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (ssize_t) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_visit_memfd_wrapper_candidates( + fd, &host_nrs, KBOX_REWRITE_WRAPPER_FAMILY_OPEN, + collect_wrapper_candidate_cb, &collect), + 0); + ASSERT_EQ(collect.count, (size_t) 1); + 
ASSERT_EQ(collect.candidates[0].arch, KBOX_REWRITE_ARCH_AARCH64); + ASSERT_EQ(collect.candidates[0].kind, + KBOX_REWRITE_WRAPPER_CANDIDATE_SYSCALL_CANCEL); + ASSERT_EQ(collect.candidates[0].file_offset, (uint64_t) 132); + ASSERT_EQ(collect.candidates[0].vaddr, (uint64_t) 0x400c); + ASSERT_EQ(collect.candidates[0].nr, (uint64_t) 56); + ASSERT_EQ(collect.candidates[0].family_mask, + (uint32_t) KBOX_REWRITE_WRAPPER_FAMILY_OPEN); + close(fd); +} + +static void test_rewrite_collect_memfd_wrapper_candidates_aarch64(void) +{ + unsigned char elf[192]; + struct kbox_host_nrs host_nrs; + struct kbox_rewrite_wrapper_candidate candidates[2]; + size_t count = 0; + int fd; + + memset(&host_nrs, 0xff, sizeof(host_nrs)); + host_nrs.getpid = 172; + host_nrs.getppid = 173; + host_nrs.gettid = 178; + host_nrs.newfstatat = 79; + host_nrs.fstat = 80; + host_nrs.stat = -1; + host_nrs.lstat = -1; + host_nrs.openat = 56; + host_nrs.openat2 = -1; + host_nrs.open = -1; + + build_aarch64_syscall_cancel_open_wrapper_elf(elf, sizeof(elf)); + fd = memfd_create("rewrite-collect-aarch64-open", 0); + ASSERT_TRUE(fd >= 0); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (ssize_t) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_collect_memfd_wrapper_candidates( + fd, &host_nrs, KBOX_REWRITE_WRAPPER_FAMILY_OPEN, candidates, + 2, &count), + 0); + ASSERT_EQ(count, (size_t) 1); + ASSERT_EQ(candidates[0].arch, KBOX_REWRITE_ARCH_AARCH64); + ASSERT_EQ(candidates[0].kind, + KBOX_REWRITE_WRAPPER_CANDIDATE_SYSCALL_CANCEL); + ASSERT_EQ(candidates[0].file_offset, (uint64_t) 132); + ASSERT_EQ(candidates[0].vaddr, (uint64_t) 0x400c); + ASSERT_EQ(candidates[0].nr, (uint64_t) 56); + close(fd); +} + +static void test_rewrite_collect_memfd_wrapper_candidates_by_kind_aarch64(void) +{ + unsigned char elf[192]; + struct kbox_host_nrs host_nrs; + struct kbox_rewrite_wrapper_candidate candidates[2]; + size_t count = 0; + int fd; + + memset(&host_nrs, 0xff, sizeof(host_nrs)); + host_nrs.getpid = 172; + host_nrs.getppid = 173; + 
host_nrs.gettid = 178; + host_nrs.newfstatat = 79; + host_nrs.fstat = 80; + host_nrs.stat = -1; + host_nrs.lstat = -1; + host_nrs.openat = 56; + host_nrs.openat2 = -1; + host_nrs.open = -1; + + build_aarch64_syscall_cancel_open_wrapper_elf(elf, sizeof(elf)); + fd = memfd_create("rewrite-collect-aarch64-open-kind", 0); + ASSERT_TRUE(fd >= 0); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (ssize_t) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_collect_memfd_wrapper_candidates_by_kind( + fd, &host_nrs, KBOX_REWRITE_WRAPPER_FAMILY_OPEN, + KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT, candidates, 2, &count), + 0); + ASSERT_EQ(count, (size_t) 0); + ASSERT_EQ(kbox_rewrite_collect_memfd_wrapper_candidates_by_kind( + fd, &host_nrs, KBOX_REWRITE_WRAPPER_FAMILY_OPEN, + KBOX_REWRITE_WRAPPER_CANDIDATE_SYSCALL_CANCEL, candidates, 2, + &count), + 0); + ASSERT_EQ(count, (size_t) 1); + ASSERT_EQ(candidates[0].kind, + KBOX_REWRITE_WRAPPER_CANDIDATE_SYSCALL_CANCEL); + ASSERT_EQ(candidates[0].file_offset, (uint64_t) 132); + ASSERT_EQ(candidates[0].vaddr, (uint64_t) 0x400c); + close(fd); +} + +static void test_rewrite_collect_memfd_phase1_path_candidates_aarch64(void) +{ + unsigned char elf[192]; + struct kbox_host_nrs host_nrs; + struct kbox_rewrite_wrapper_candidate candidates[2]; + size_t count = 0; + int fd; + + memset(&host_nrs, 0xff, sizeof(host_nrs)); + host_nrs.getpid = 172; + host_nrs.getppid = 173; + host_nrs.gettid = 178; + host_nrs.newfstatat = 79; + host_nrs.fstat = 80; + host_nrs.stat = -1; + host_nrs.lstat = -1; + host_nrs.openat = 56; + host_nrs.openat2 = -1; + host_nrs.open = -1; + + build_aarch64_fstatat_wrapper_elf(elf, sizeof(elf)); + fd = memfd_create("rewrite-collect-aarch64-phase1-stat", 0); + ASSERT_TRUE(fd >= 0); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (ssize_t) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_collect_memfd_phase1_path_candidates( + fd, &host_nrs, candidates, 2, &count), + 0); + ASSERT_EQ(count, (size_t) 1); + ASSERT_EQ(candidates[0].kind, 
KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT); + ASSERT_EQ(candidates[0].family_mask, + (uint32_t) KBOX_REWRITE_WRAPPER_FAMILY_STAT); + ASSERT_EQ(candidates[0].file_offset, (uint64_t) 124); + ASSERT_EQ(candidates[0].vaddr, (uint64_t) 0x4004); + ASSERT_EQ(candidates[0].nr, (uint64_t) 79); + close(fd); + + build_aarch64_wrapper_elf_nr(elf, sizeof(elf), 56); + fd = memfd_create("rewrite-collect-aarch64-phase1-open", 0); + ASSERT_TRUE(fd >= 0); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (ssize_t) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_collect_memfd_phase1_path_candidates( + fd, &host_nrs, candidates, 2, &count), + 0); + ASSERT_EQ(count, (size_t) 1); + ASSERT_EQ(candidates[0].kind, KBOX_REWRITE_WRAPPER_CANDIDATE_DIRECT); + ASSERT_EQ(candidates[0].family_mask, + (uint32_t) KBOX_REWRITE_WRAPPER_FAMILY_OPEN); + ASSERT_EQ(candidates[0].file_offset, (uint64_t) 124); + ASSERT_EQ(candidates[0].vaddr, (uint64_t) 0x4004); + ASSERT_EQ(candidates[0].nr, (uint64_t) 56); + close(fd); + + build_aarch64_syscall_cancel_open_wrapper_elf(elf, sizeof(elf)); + fd = memfd_create("rewrite-collect-aarch64-phase1-cancel", 0); + ASSERT_TRUE(fd >= 0); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (ssize_t) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_collect_memfd_phase1_path_candidates( + fd, &host_nrs, candidates, 2, &count), + 0); + ASSERT_EQ(count, (size_t) 0); + close(fd); +} + +static void test_rewrite_apply_memfd_phase1_path_candidates_aarch64(void) +{ + unsigned char elf[192]; + struct kbox_host_nrs host_nrs; + unsigned char patched[4]; + size_t applied = 0; + int fd; + + memset(&host_nrs, 0xff, sizeof(host_nrs)); + host_nrs.getpid = 172; + host_nrs.getppid = 173; + host_nrs.gettid = 178; + host_nrs.newfstatat = 79; + host_nrs.fstat = 80; + host_nrs.stat = -1; + host_nrs.lstat = -1; + host_nrs.openat = 56; + host_nrs.openat2 = -1; + host_nrs.open = -1; + + build_aarch64_fstatat_wrapper_elf(elf, sizeof(elf)); + fd = memfd_create("rewrite-apply-aarch64-phase1-stat", 0); + ASSERT_TRUE(fd >= 0); + 
ASSERT_EQ(write(fd, elf, sizeof(elf)), (ssize_t) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_apply_memfd_phase1_path_candidates(fd, &host_nrs, + &applied, NULL), + 0); + ASSERT_EQ(applied, (size_t) 1); + ASSERT_EQ(pread(fd, patched, sizeof(patched), 124), (ssize_t) 4); + ASSERT_NE(memcmp(patched, "\x01\x00\x00\xd4", 4), 0); + close(fd); + + build_aarch64_wrapper_elf_nr(elf, sizeof(elf), 56); + fd = memfd_create("rewrite-apply-aarch64-phase1-open", 0); + ASSERT_TRUE(fd >= 0); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (ssize_t) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_apply_memfd_phase1_path_candidates(fd, &host_nrs, + &applied, NULL), + 0); + ASSERT_EQ(applied, (size_t) 1); + ASSERT_EQ(pread(fd, patched, sizeof(patched), 124), (ssize_t) 4); + ASSERT_NE(memcmp(patched, "\x01\x00\x00\xd4", 4), 0); + close(fd); + + build_aarch64_syscall_cancel_open_wrapper_elf(elf, sizeof(elf)); + fd = memfd_create("rewrite-apply-aarch64-phase1-cancel", 0); + ASSERT_TRUE(fd >= 0); + ASSERT_EQ(write(fd, elf, sizeof(elf)), (ssize_t) sizeof(elf)); + ASSERT_EQ(kbox_rewrite_apply_memfd_phase1_path_candidates(fd, &host_nrs, + &applied, NULL), + 0); + ASSERT_EQ(applied, (size_t) 0); + ASSERT_EQ(pread(fd, patched, sizeof(patched), 132), (ssize_t) 4); + ASSERT_EQ(memcmp(patched, "\x02\x00\x00\x14", 4), 0); + close(fd); +} + +void test_rewrite_init(void) +{ + TEST_REGISTER(test_syscall_mode_parser); + TEST_REGISTER(test_elf_exec_segment_walker); + TEST_REGISTER(test_rewrite_analyze_x86_64); + TEST_REGISTER(test_rewrite_analyze_x86_64_wrapper); + TEST_REGISTER(test_rewrite_analyze_aarch64); + TEST_REGISTER(test_rewrite_rejects_unknown_machine); + TEST_REGISTER(test_elf_exec_rejects_huge_phoff); + TEST_REGISTER(test_elf_interp_rejects_huge_phoff); + TEST_REGISTER(test_rewrite_visit_x86_64_sites); + TEST_REGISTER(test_rewrite_visit_x86_64_wrapper_site); + TEST_REGISTER(test_rewrite_visit_aarch64_sites); + TEST_REGISTER(test_rewrite_visit_aarch64_cancel_wrapper_site); + 
TEST_REGISTER(test_rewrite_visit_aarch64_fstatat_wrapper_site); + TEST_REGISTER(test_rewrite_plan_x86_64_sites); + TEST_REGISTER(test_rewrite_plan_aarch64_sites); + TEST_REGISTER(test_rewrite_plan_aarch64_segment_out_of_range); + TEST_REGISTER(test_rewrite_encode_x86_64_patch); + TEST_REGISTER(test_rewrite_encode_x86_64_wrapper_patch); + TEST_REGISTER(test_rewrite_encode_x86_64_page_zero_trampoline); + TEST_REGISTER(test_rewrite_encode_aarch64_branch_patch); + TEST_REGISTER(test_rewrite_encode_aarch64_branch_out_of_range); + TEST_REGISTER(test_rewrite_apply_x86_64_elf); + TEST_REGISTER(test_rewrite_apply_x86_64_wrapper_elf); + TEST_REGISTER(test_rewrite_apply_aarch64_elf); + TEST_REGISTER(test_rewrite_apply_memfd_x86_64); + TEST_REGISTER(test_rewrite_apply_virtual_procinfo_x86_64_wrapper_elf); + TEST_REGISTER( + test_rewrite_apply_virtual_procinfo_skips_non_procinfo_wrapper); + TEST_REGISTER(test_rewrite_apply_virtual_procinfo_aarch64_wrapper_elf); + TEST_REGISTER( + test_rewrite_apply_virtual_procinfo_skips_non_procinfo_aarch64); + TEST_REGISTER(test_rewrite_wrapper_syscall_nr_x86_64); + TEST_REGISTER(test_rewrite_wrapper_syscall_nr_aarch64); + TEST_REGISTER(test_rewrite_origin_map_x86_64); + TEST_REGISTER(test_rewrite_origin_map_aarch64); + TEST_REGISTER(test_rewrite_origin_map_add_site_source); + TEST_REGISTER(test_rewrite_probe_x86_64_page_zero_allowed); + TEST_REGISTER(test_rewrite_probe_x86_64_page_zero_blocked); + TEST_REGISTER(test_rewrite_fast_host_syscall0_classification); + TEST_REGISTER(test_rewrite_has_wrapper_syscalls_x86_64); + TEST_REGISTER(test_rewrite_has_wrapper_syscalls_aarch64); + TEST_REGISTER(test_rewrite_has_syscall_cancel_wrapper_syscalls_aarch64); + TEST_REGISTER(test_rewrite_wrapper_family_mask_memfd_x86_64); + TEST_REGISTER(test_rewrite_wrapper_family_mask_memfd_aarch64); + TEST_REGISTER(test_rewrite_visit_memfd_wrapper_candidates_x86_64); + TEST_REGISTER(test_rewrite_visit_memfd_wrapper_candidates_aarch64); + 
TEST_REGISTER(test_rewrite_collect_memfd_wrapper_candidates_aarch64); + TEST_REGISTER( + test_rewrite_collect_memfd_wrapper_candidates_by_kind_aarch64); + TEST_REGISTER(test_rewrite_collect_memfd_phase1_path_candidates_aarch64); + TEST_REGISTER(test_rewrite_apply_memfd_phase1_path_candidates_aarch64); +} diff --git a/tests/unit/test-runner.c b/tests/unit/test-runner.c index e9f1fff..1bb45ba 100644 --- a/tests/unit/test-runner.c +++ b/tests/unit/test-runner.c @@ -8,6 +8,8 @@ #include "test-runner.h" +#include <stdlib.h> + #define MAX_TESTS 256 struct test_entry { @@ -31,6 +33,20 @@ void test_register(const char *name, test_fn fn) test_count++; } +int test_mkstemp(char *path, size_t path_len, const char *name) +{ + const char *tmpdir = getenv("TMPDIR"); + + if (!path || path_len == 0 || !name || !name[0]) + return -1; + if (!tmpdir || !tmpdir[0]) + tmpdir = "/tmp"; + if ((size_t) snprintf(path, path_len, "%s/%s-XXXXXX", tmpdir, name) >= + path_len) + return -1; + return mkstemp(path); +} + void test_fail(const char *file, int line, const char *expr) { fprintf(stderr, " FAIL: %s:%d: %s\n", file, line, expr); @@ -72,23 +88,56 @@ } /* External init functions from each test file */ +/* Portable test suites (all hosts) */ extern void test_fd_table_init(void); extern void test_path_init(void); extern void test_identity_init(void); extern void test_syscall_nr_init(void); extern void test_elf_init(void); +extern void test_x86_decode_init(void); + +/* Linux-only test suites */ +#ifdef __linux__ +extern void test_procmem_init(void); +extern void test_syscall_request_init(void); +extern void test_syscall_trap_init(void); +extern void test_loader_entry_init(void); +extern void test_loader_handoff_init(void); +extern void test_loader_image_init(void); +extern void test_loader_layout_init(void); +extern void test_loader_launch_init(void); +extern void test_loader_stack_init(void); +extern void test_loader_transfer_init(void); +extern void test_rewrite_init(void); +#endif 
int main(int argc, char *argv[]) { (void) argc; (void) argv; - /* Register all test suites */ + /* Portable suites */ test_fd_table_init(); test_path_init(); test_identity_init(); test_syscall_nr_init(); test_elf_init(); + test_x86_decode_init(); + + /* Linux-only suites */ +#ifdef __linux__ + test_procmem_init(); + test_syscall_request_init(); + test_syscall_trap_init(); + test_loader_entry_init(); + test_loader_handoff_init(); + test_loader_image_init(); + test_loader_layout_init(); + test_loader_launch_init(); + test_loader_stack_init(); + test_loader_transfer_init(); + test_rewrite_init(); +#endif /* Run all tests */ int suite_fails = 0; diff --git a/tests/unit/test-runner.h b/tests/unit/test-runner.h index b54907a..f9c84a9 100644 --- a/tests/unit/test-runner.h +++ b/tests/unit/test-runner.h @@ -14,6 +14,7 @@ typedef void (*test_fn)(void); void test_register(const char *name, test_fn fn); +int test_mkstemp(char *path, size_t path_len, const char *name); void test_fail(const char *file, int line, const char *expr); void test_fail_eq_long(const char *file, int line, diff --git a/tests/unit/test-seccomp-stubs.c b/tests/unit/test-seccomp-stubs.c new file mode 100644 index 0000000..9ee547b --- /dev/null +++ b/tests/unit/test-seccomp-stubs.c @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: MIT */ + +#include <errno.h> + +#include "seccomp.h" + +struct kbox_dispatch kbox_dispatch_errno(int err) +{ + if (err <= 0) + err = EIO; + return (struct kbox_dispatch) { + .kind = KBOX_DISPATCH_RETURN, + .val = 0, + .error = err, + }; +} + +struct kbox_dispatch kbox_dispatch_value(int64_t val) +{ + return (struct kbox_dispatch) { + .kind = KBOX_DISPATCH_RETURN, + .val = val, + .error = 0, + }; +} + +struct kbox_dispatch kbox_dispatch_request( + struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req) +{ + struct kbox_dispatch dispatch; + + (void) ctx; + dispatch.kind = KBOX_DISPATCH_RETURN; + dispatch.val = req ? 
req->nr : -1; + dispatch.error = 0; + return dispatch; +} diff --git a/tests/unit/test-syscall-nr.c b/tests/unit/test-syscall-nr.c index 7f989b8..7bd5c12 100644 --- a/tests/unit/test-syscall-nr.c +++ b/tests/unit/test-syscall-nr.c @@ -70,6 +70,11 @@ static void test_host_x86_64_sendmsg(void) ASSERT_EQ(HOST_NRS_X86_64.sendmsg, 46); } +static void test_host_aarch64_gettimeofday(void) +{ + ASSERT_EQ(HOST_NRS_AARCH64.gettimeofday, 169); +} + static void test_host_aarch64_no_open(void) { /* aarch64 has no legacy open syscall */ @@ -98,6 +103,7 @@ void test_syscall_nr_init(void) TEST_REGISTER(test_generic_mkdirat_style); TEST_REGISTER(test_generic_getdents_unavailable); TEST_REGISTER(test_host_x86_64_sendmsg); + TEST_REGISTER(test_host_aarch64_gettimeofday); TEST_REGISTER(test_host_aarch64_no_open); TEST_REGISTER(test_at_fdcwd); } diff --git a/tests/unit/test-syscall-request.c b/tests/unit/test-syscall-request.c new file mode 100644 index 0000000..26c99c9 --- /dev/null +++ b/tests/unit/test-syscall-request.c @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: MIT */ + +#include + +#include "seccomp.h" +#include "test-runner.h" + +static const struct kbox_guest_mem_ops custom_guest_mem_ops = { + .read = NULL, + .write = NULL, + .write_force = NULL, + .read_string = NULL, + .read_open_how = NULL, +}; + +static void test_request_init_from_regs_seccomp_defaults_process_vm(void) +{ + struct kbox_syscall_request req; + struct kbox_syscall_regs regs = { + .nr = 39, + .instruction_pointer = 0x401000, + .args = {1, 2, 3, 4, 5, 6}, + }; + + ASSERT_EQ(kbox_syscall_request_init_from_regs( + &req, KBOX_SYSCALL_SOURCE_SECCOMP, 1234, 0x55, ®s, NULL), + 0); + ASSERT_EQ(req.source, KBOX_SYSCALL_SOURCE_SECCOMP); + ASSERT_EQ(req.pid, 1234); + ASSERT_EQ(req.cookie, 0x55); + ASSERT_EQ(req.nr, 39); + ASSERT_EQ(req.instruction_pointer, 0x401000); + ASSERT_EQ(req.args[0], 1); + ASSERT_EQ(req.args[5], 6); + ASSERT_EQ(req.guest_mem.ops, &kbox_process_vm_guest_mem_ops); + ASSERT_EQ(req.guest_mem.opaque, 
(uintptr_t) 1234); +} + +static void test_request_init_from_regs_preserves_custom_guest_mem(void) +{ + struct kbox_syscall_request req; + struct kbox_syscall_regs regs = { + .nr = 172, + .instruction_pointer = 0x8040, + .args = {7, 8, 9, 10, 11, 12}, + }; + struct kbox_guest_mem guest_mem = { + .ops = &custom_guest_mem_ops, + .opaque = 0xdeadbeef, + }; + + ASSERT_EQ(kbox_syscall_request_init_from_regs( + &req, KBOX_SYSCALL_SOURCE_TRAP, 77, 0, ®s, &guest_mem), + 0); + ASSERT_EQ(req.source, KBOX_SYSCALL_SOURCE_TRAP); + ASSERT_EQ(req.pid, 77); + ASSERT_EQ(req.nr, 172); + ASSERT_EQ(req.instruction_pointer, 0x8040); + ASSERT_EQ(req.args[1], 8); + ASSERT_EQ(req.guest_mem.ops, &custom_guest_mem_ops); + ASSERT_EQ(req.guest_mem.opaque, (uintptr_t) 0xdeadbeef); +} + +static void test_request_from_notif_uses_shared_decoder(void) +{ + struct kbox_seccomp_notif notif; + struct kbox_syscall_request req; + + memset(¬if, 0, sizeof(notif)); + notif.id = 0x1234; + notif.pid = 4321; + notif.data.nr = 59; + notif.data.instruction_pointer = 0xfeedface; + notif.data.args[0] = 11; + notif.data.args[5] = 66; + + ASSERT_EQ(kbox_syscall_request_from_notif(¬if, &req), 0); + ASSERT_EQ(req.source, KBOX_SYSCALL_SOURCE_SECCOMP); + ASSERT_EQ(req.pid, 4321); + ASSERT_EQ(req.cookie, 0x1234); + ASSERT_EQ(req.nr, 59); + ASSERT_EQ(req.instruction_pointer, 0xfeedface); + ASSERT_EQ(req.args[0], 11); + ASSERT_EQ(req.args[5], 66); + ASSERT_EQ(req.guest_mem.ops, &kbox_process_vm_guest_mem_ops); + ASSERT_EQ(req.guest_mem.opaque, (uintptr_t) 4321); +} + +void test_syscall_request_init(void) +{ + TEST_REGISTER(test_request_init_from_regs_seccomp_defaults_process_vm); + TEST_REGISTER(test_request_init_from_regs_preserves_custom_guest_mem); + TEST_REGISTER(test_request_from_notif_uses_shared_decoder); +} diff --git a/tests/unit/test-syscall-trap.c b/tests/unit/test-syscall-trap.c new file mode 100644 index 0000000..3f53650 --- /dev/null +++ b/tests/unit/test-syscall-trap.c @@ -0,0 +1,521 @@ +/* 
SPDX-License-Identifier: MIT */ + +#include +#include +#include +#include +#include +#include + +#include "seccomp.h" +#include "syscall-nr.h" +#include "syscall-trap.h" +#include "test-runner.h" + +static const struct kbox_guest_mem_ops trap_guest_mem_ops = { + .read = NULL, + .write = NULL, + .write_force = NULL, + .read_string = NULL, + .read_open_how = NULL, +}; + +static int custom_execute_calls; +static int custom_execute_last_nr; + +static int custom_trap_execute(struct kbox_syscall_trap_runtime *runtime, + const struct kbox_syscall_request *req, + struct kbox_dispatch *out) +{ + (void) runtime; + custom_execute_calls++; + custom_execute_last_nr = req ? req->nr : -1; + out->kind = KBOX_DISPATCH_RETURN; + out->val = req ? (req->nr + 10) : -1; + out->error = 0; + return 0; +} + +static const struct kbox_syscall_trap_ops custom_trap_ops = { + .execute = custom_trap_execute, +}; + +static int capture_only_execute(struct kbox_syscall_trap_runtime *runtime, + const struct kbox_syscall_request *req, + struct kbox_dispatch *out) +{ + (void) out; + return kbox_syscall_trap_runtime_capture(runtime, req); +} + +static const struct kbox_syscall_trap_ops capture_only_trap_ops = { + .execute = capture_only_execute, +}; + +static void init_sigsys(siginfo_t *info, int nr) +{ + memset(info, 0, sizeof(*info)); + info->si_signo = SIGSYS; + info->si_code = 1; + info->si_syscall = nr; +} + +static void test_sigsys_decode_rejects_non_sigsys(void) +{ + siginfo_t info; + ucontext_t uc; + struct kbox_syscall_regs regs; + + memset(&info, 0, sizeof(info)); + memset(&uc, 0, sizeof(uc)); + info.si_signo = SIGUSR1; + ASSERT_EQ(kbox_syscall_regs_from_sigsys(&info, &uc, ®s), -1); +} + +static void test_reserved_sigsys_helpers(void) +{ + unsigned char mask[8]; + + memset(mask, 0, sizeof(mask)); + ASSERT_EQ(kbox_syscall_trap_reserved_signal(), SIGSYS); + ASSERT_EQ(kbox_syscall_trap_signal_is_reserved(SIGSYS), 1); + ASSERT_EQ(kbox_syscall_trap_signal_is_reserved(SIGUSR1), 0); + 
ASSERT_EQ(kbox_syscall_trap_sigset_blocks_reserved(mask, sizeof(mask)), 0); + + mask[(SIGSYS - 1) / 8] = (unsigned char) (1U << ((SIGSYS - 1) % 8)); + ASSERT_EQ(kbox_syscall_trap_sigset_blocks_reserved(mask, sizeof(mask)), 1); +} + +static void test_host_syscall_range_contains_ip(void) +{ + struct kbox_syscall_trap_ip_range range; + uintptr_t ip = kbox_syscall_trap_host_syscall_ip(); + +#if defined(__x86_64__) || defined(__aarch64__) + ASSERT_EQ(kbox_syscall_trap_host_syscall_range(&range), 0); + ASSERT_TRUE(range.start < range.end); + ASSERT_TRUE(ip >= range.start); + ASSERT_TRUE(ip < range.end); +#else + ASSERT_EQ(kbox_syscall_trap_host_syscall_range(&range), -1); + ASSERT_EQ(ip, (uintptr_t) 0); +#endif +} + +#if defined(__x86_64__) +static void test_sigsys_decode_x86_64_registers(void) +{ + siginfo_t info; + ucontext_t uc; + struct kbox_syscall_regs regs; + + memset(&uc, 0, sizeof(uc)); + init_sigsys(&info, 257); + uc.uc_mcontext.gregs[REG_RIP] = 0x401234; + uc.uc_mcontext.gregs[REG_RDI] = 11; + uc.uc_mcontext.gregs[REG_RSI] = 22; + uc.uc_mcontext.gregs[REG_RDX] = 33; + uc.uc_mcontext.gregs[REG_R10] = 44; + uc.uc_mcontext.gregs[REG_R8] = 55; + uc.uc_mcontext.gregs[REG_R9] = 66; + uc.uc_mcontext.gregs[REG_RAX] = 999; + + ASSERT_EQ(kbox_syscall_regs_from_sigsys(&info, &uc, ®s), 0); + ASSERT_EQ(regs.nr, 257); + ASSERT_EQ(regs.instruction_pointer, 0x401234); + ASSERT_EQ(regs.args[0], 11); + ASSERT_EQ(regs.args[5], 66); +} +#elif defined(__aarch64__) +static void test_sigsys_decode_aarch64_registers(void) +{ + siginfo_t info; + ucontext_t uc; + struct kbox_syscall_regs regs; + + memset(&uc, 0, sizeof(uc)); + init_sigsys(&info, 56); + uc.uc_mcontext.pc = 0x4000; + uc.uc_mcontext.regs[0] = 101; + uc.uc_mcontext.regs[1] = 202; + uc.uc_mcontext.regs[2] = 303; + uc.uc_mcontext.regs[3] = 404; + uc.uc_mcontext.regs[4] = 505; + uc.uc_mcontext.regs[5] = 606; + uc.uc_mcontext.regs[8] = 999; + + ASSERT_EQ(kbox_syscall_regs_from_sigsys(&info, &uc, ®s), 0); + ASSERT_EQ(regs.nr, 
56); + ASSERT_EQ(regs.instruction_pointer, 0x4000); + ASSERT_EQ(regs.args[0], 101); + ASSERT_EQ(regs.args[5], 606); +} +#endif + +static void test_sigsys_request_builder_uses_trap_source(void) +{ + siginfo_t info; + ucontext_t uc; + int expected_rc = -1; + struct kbox_guest_mem guest_mem = { + .ops = &trap_guest_mem_ops, + .opaque = 0x1234, + }; + struct kbox_syscall_request req; + + memset(&uc, 0, sizeof(uc)); +#if defined(__x86_64__) + init_sigsys(&info, 60); + uc.uc_mcontext.gregs[REG_RIP] = 0x5000; + uc.uc_mcontext.gregs[REG_RDI] = 7; + expected_rc = 0; +#elif defined(__aarch64__) + init_sigsys(&info, 93); + uc.uc_mcontext.pc = 0x5000; + uc.uc_mcontext.regs[0] = 7; + expected_rc = 0; +#else + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSYS; +#endif + + ASSERT_EQ( + kbox_syscall_request_from_sigsys(&req, 777, &info, &uc, &guest_mem), + expected_rc); +#if defined(__x86_64__) || defined(__aarch64__) + ASSERT_EQ(req.source, KBOX_SYSCALL_SOURCE_TRAP); + ASSERT_EQ(req.pid, 777); + ASSERT_EQ(req.cookie, 0); + ASSERT_EQ(req.instruction_pointer, 0x5000); + ASSERT_EQ(req.args[0], 7); + ASSERT_EQ(req.guest_mem.ops, &trap_guest_mem_ops); + ASSERT_EQ(req.guest_mem.opaque, (uintptr_t) 0x1234); +#endif +} + +static void test_sigsys_request_builder_defaults_current_guest_mem(void) +{ + siginfo_t info; + ucontext_t uc; + int expected_rc = -1; + struct kbox_syscall_request req; + + memset(&uc, 0, sizeof(uc)); +#if defined(__x86_64__) + init_sigsys(&info, 39); + uc.uc_mcontext.gregs[REG_RIP] = 0x6000; + expected_rc = 0; +#elif defined(__aarch64__) + init_sigsys(&info, 172); + uc.uc_mcontext.pc = 0x6000; + expected_rc = 0; +#else + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSYS; +#endif + + ASSERT_EQ(kbox_syscall_request_from_sigsys(&req, 123, &info, &uc, NULL), + expected_rc); +#if defined(__x86_64__) || defined(__aarch64__) + ASSERT_EQ(req.guest_mem.ops, &kbox_current_guest_mem_ops); + ASSERT_EQ(req.guest_mem.opaque, 0); +#endif +} + +static void 
test_sigsys_result_writer(void) +{ + ucontext_t uc; + int expected_rc = -1; + struct kbox_dispatch dispatch; + + memset(&uc, 0, sizeof(uc)); + dispatch.kind = KBOX_DISPATCH_RETURN; + dispatch.val = 1234; + dispatch.error = 0; + +#if defined(__x86_64__) || defined(__aarch64__) + expected_rc = 0; +#endif + ASSERT_EQ(kbox_syscall_result_to_sigsys(&uc, &dispatch), expected_rc); +#if defined(__x86_64__) + ASSERT_EQ(uc.uc_mcontext.gregs[REG_RAX], 1234); +#elif defined(__aarch64__) + ASSERT_EQ(uc.uc_mcontext.regs[0], 1234); +#endif +} + +static void test_sigsys_continue_executes_host_syscall(void) +{ + ucontext_t uc; + struct kbox_dispatch dispatch; + int expected_rc = -1; + + memset(&uc, 0, sizeof(uc)); + dispatch.kind = KBOX_DISPATCH_CONTINUE; + dispatch.val = 0; + dispatch.error = 0; + +#if defined(__x86_64__) + uc.uc_mcontext.gregs[REG_RAX] = HOST_NRS_X86_64.getpid; + expected_rc = 0; +#elif defined(__aarch64__) + uc.uc_mcontext.regs[8] = HOST_NRS_AARCH64.getpid; + expected_rc = 0; +#endif + + ASSERT_EQ(kbox_syscall_result_to_sigsys(&uc, &dispatch), expected_rc); +#if defined(__x86_64__) + ASSERT_EQ(uc.uc_mcontext.gregs[REG_RAX], getpid()); +#elif defined(__aarch64__) + ASSERT_EQ(uc.uc_mcontext.regs[0], (uint64_t) getpid()); +#endif +} + +static void test_sigsys_runtime_install_uninstall(void) +{ + struct kbox_supervisor_ctx ctx; + struct kbox_syscall_trap_runtime runtime; + + memset(&ctx, 0, sizeof(ctx)); +#if defined(__x86_64__) + ctx.host_nrs = &HOST_NRS_X86_64; +#elif defined(__aarch64__) + ctx.host_nrs = &HOST_NRS_AARCH64; +#endif + + ASSERT_EQ(kbox_syscall_trap_runtime_install(&runtime, &ctx), 0); + ASSERT_EQ(runtime.ctx, &ctx); + ASSERT_EQ(runtime.pid, getpid()); + ASSERT_EQ(runtime.installed, 1); + kbox_syscall_trap_runtime_uninstall(&runtime); + ASSERT_EQ(runtime.installed, 0); +} + +static void test_sigsys_trap_handle_uses_runtime_executor(void) +{ + struct kbox_supervisor_ctx ctx; + struct kbox_syscall_trap_runtime runtime; + siginfo_t info; + ucontext_t 
uc; + int expected_rc = -1; + + memset(&ctx, 0, sizeof(ctx)); + memset(&uc, 0, sizeof(uc)); + custom_execute_calls = 0; + custom_execute_last_nr = -1; + +#if defined(__x86_64__) + init_sigsys(&info, 39); + uc.uc_mcontext.gregs[REG_RIP] = 0x7100; + expected_rc = 0; +#elif defined(__aarch64__) + init_sigsys(&info, 172); + uc.uc_mcontext.pc = 0x7100; + expected_rc = 0; +#else + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSYS; +#endif + + ASSERT_EQ(kbox_syscall_trap_runtime_init(&runtime, &ctx, &custom_trap_ops), + 0); + ASSERT_EQ(kbox_syscall_trap_handle(&runtime, &info, &uc), expected_rc); +#if defined(__x86_64__) + ASSERT_EQ(uc.uc_mcontext.gregs[REG_RAX], info.si_syscall + 10); +#elif defined(__aarch64__) + ASSERT_EQ(uc.uc_mcontext.regs[0], (uint64_t) info.si_syscall + 10); +#endif + ASSERT_EQ(custom_execute_calls, 1); + ASSERT_EQ(custom_execute_last_nr, info.si_syscall); + ASSERT_EQ(runtime.has_last_request, 1); + ASSERT_EQ(runtime.has_last_dispatch, 1); + ASSERT_EQ(runtime.last_request.nr, info.si_syscall); + ASSERT_EQ(runtime.last_dispatch.val, info.si_syscall + 10); +} + +static void test_sigsys_dispatch_helper(void) +{ + struct kbox_supervisor_ctx ctx; + siginfo_t info; + ucontext_t uc; + int expected_rc = -1; + + memset(&ctx, 0, sizeof(ctx)); + memset(&uc, 0, sizeof(uc)); +#if defined(__x86_64__) + ctx.host_nrs = &HOST_NRS_X86_64; + init_sigsys(&info, 39); + uc.uc_mcontext.gregs[REG_RIP] = 0x7000; + expected_rc = 0; +#elif defined(__aarch64__) + ctx.host_nrs = &HOST_NRS_AARCH64; + init_sigsys(&info, 172); + uc.uc_mcontext.pc = 0x7000; + expected_rc = 0; +#else + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSYS; +#endif + + ASSERT_EQ(kbox_syscall_dispatch_sigsys(&ctx, 55, &info, &uc), expected_rc); +#if defined(__x86_64__) + ASSERT_EQ(uc.uc_mcontext.gregs[REG_RAX], info.si_syscall); +#elif defined(__aarch64__) + ASSERT_EQ(uc.uc_mcontext.regs[0], (uint64_t) info.si_syscall); +#endif +} + +static void 
test_trap_runtime_capture_and_dispatch_pending(void) +{ + struct kbox_supervisor_ctx ctx; + struct kbox_syscall_trap_runtime runtime; + siginfo_t info; + ucontext_t uc; + struct kbox_dispatch dispatch; + int expected_rc = -1; + + memset(&ctx, 0, sizeof(ctx)); + memset(&uc, 0, sizeof(uc)); +#if defined(__x86_64__) + ctx.host_nrs = &HOST_NRS_X86_64; + init_sigsys(&info, 39); + uc.uc_mcontext.gregs[REG_RIP] = 0x7200; + expected_rc = 0; +#elif defined(__aarch64__) + ctx.host_nrs = &HOST_NRS_AARCH64; + init_sigsys(&info, 172); + uc.uc_mcontext.pc = 0x7200; + expected_rc = 0; +#else + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSYS; +#endif + + ASSERT_EQ( + kbox_syscall_trap_runtime_init(&runtime, &ctx, &capture_only_trap_ops), + 0); + ASSERT_EQ(kbox_syscall_request_from_sigsys(&runtime.pending_request, + runtime.pid, &info, &uc, NULL), + expected_rc); +#if defined(__x86_64__) || defined(__aarch64__) + runtime.has_pending_request = 1; + ASSERT_EQ(kbox_syscall_trap_runtime_dispatch_pending(&runtime, &dispatch), + 0); + ASSERT_EQ(dispatch.val, info.si_syscall); + ASSERT_EQ(runtime.has_pending_request, 0); + ASSERT_EQ(runtime.has_pending_dispatch, 1); + ASSERT_EQ(runtime.last_dispatch.val, info.si_syscall); +#endif +} + +static void test_trap_runtime_capture_wakes_fd(void) +{ + struct kbox_supervisor_ctx ctx; + struct kbox_syscall_trap_runtime runtime; + struct kbox_syscall_request req; + int pipefd[2]; + uint64_t wake_value = 0; + + memset(&ctx, 0, sizeof(ctx)); + memset(&req, 0, sizeof(req)); + req.nr = 42; + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(kbox_syscall_trap_runtime_init(&runtime, &ctx, NULL), 0); + kbox_syscall_trap_runtime_set_wake_fd(&runtime, pipefd[1]); + + ASSERT_EQ(kbox_syscall_trap_runtime_capture(&runtime, &req), 0); + ASSERT_EQ(runtime.has_pending_request, 1); + ASSERT_EQ(read(pipefd[0], &wake_value, sizeof(wake_value)), + (long) sizeof(wake_value)); + ASSERT_EQ((long) wake_value, 1); + + close(pipefd[0]); + close(pipefd[1]); +} + +static void 
test_trap_runtime_service_thread_dispatches(void) +{ + struct kbox_supervisor_ctx ctx; + struct kbox_syscall_trap_runtime runtime; + struct kbox_syscall_request req; + struct kbox_dispatch dispatch; + int i; + + memset(&ctx, 0, sizeof(ctx)); +#if defined(__aarch64__) + ctx.host_nrs = &HOST_NRS_AARCH64; +#else + ctx.host_nrs = &HOST_NRS_X86_64; +#endif + memset(&req, 0, sizeof(req)); + req.nr = 77; + + ASSERT_EQ( + kbox_syscall_trap_runtime_init(&runtime, &ctx, &capture_only_trap_ops), + 0); + ASSERT_EQ(kbox_syscall_trap_runtime_service_start(&runtime), 0); + ASSERT_EQ(kbox_syscall_trap_runtime_capture(&runtime, &req), 0); + + for (i = 0; i < 200; i++) { + if (__atomic_load_n(&runtime.has_pending_dispatch, __ATOMIC_ACQUIRE)) + break; + usleep(1000); + } + + ASSERT_EQ(__atomic_load_n(&runtime.has_pending_dispatch, __ATOMIC_ACQUIRE), + 1); + ASSERT_EQ(kbox_syscall_trap_runtime_take_dispatch(&runtime, &dispatch), 0); + ASSERT_EQ(dispatch.val, 77); + ASSERT_EQ(kbox_syscall_trap_runtime_service_stop(&runtime), 0); +} + +static void test_trap_active_dispatch_uses_service_thread(void) +{ + struct kbox_supervisor_ctx ctx; + struct kbox_syscall_trap_runtime runtime; + struct kbox_syscall_request req; + struct kbox_dispatch dispatch; + + memset(&ctx, 0, sizeof(ctx)); + memset(&req, 0, sizeof(req)); +#if defined(__x86_64__) + ctx.host_nrs = &HOST_NRS_X86_64; +#elif defined(__aarch64__) + ctx.host_nrs = &HOST_NRS_AARCH64; +#endif + req.nr = 88; + +#if defined(__x86_64__) || defined(__aarch64__) + ASSERT_EQ(kbox_syscall_trap_runtime_install(&runtime, &ctx), 0); + ASSERT_EQ(kbox_syscall_trap_active_pid(), runtime.pid); + ASSERT_EQ(kbox_syscall_trap_active_dispatch(&req, &dispatch), 0); + ASSERT_EQ(dispatch.val, 88); + kbox_syscall_trap_runtime_uninstall(&runtime); +#else + ASSERT_EQ(kbox_syscall_trap_active_pid(), (pid_t) -1); + ASSERT_EQ(kbox_syscall_trap_active_dispatch(&req, &dispatch), -1); +#endif +} + +void test_syscall_trap_init(void) +{ + 
TEST_REGISTER(test_sigsys_decode_rejects_non_sigsys); + TEST_REGISTER(test_reserved_sigsys_helpers); + TEST_REGISTER(test_host_syscall_range_contains_ip); +#if defined(__x86_64__) + TEST_REGISTER(test_sigsys_decode_x86_64_registers); +#elif defined(__aarch64__) + TEST_REGISTER(test_sigsys_decode_aarch64_registers); +#endif + TEST_REGISTER(test_sigsys_request_builder_uses_trap_source); + TEST_REGISTER(test_sigsys_request_builder_defaults_current_guest_mem); + TEST_REGISTER(test_sigsys_result_writer); + TEST_REGISTER(test_sigsys_continue_executes_host_syscall); + TEST_REGISTER(test_sigsys_runtime_install_uninstall); + TEST_REGISTER(test_sigsys_trap_handle_uses_runtime_executor); + TEST_REGISTER(test_sigsys_dispatch_helper); + TEST_REGISTER(test_trap_runtime_capture_and_dispatch_pending); + TEST_REGISTER(test_trap_runtime_capture_wakes_fd); + TEST_REGISTER(test_trap_runtime_service_thread_dispatches); + TEST_REGISTER(test_trap_active_dispatch_uses_service_thread); +} diff --git a/tests/unit/test-x86-decode.c b/tests/unit/test-x86-decode.c new file mode 100644 index 0000000..068a7eb --- /dev/null +++ b/tests/unit/test-x86-decode.c @@ -0,0 +1,405 @@ +/* SPDX-License-Identifier: MIT */ + +#include "../../src/x86-decode.h" +#include "test-runner.h" /* test harness: TEST_REGISTER / ASSERT_EQ */ + +static void test_x86_nop(void) +{ + unsigned char buf[] = {0x90}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 1); +} + +static void test_x86_ret(void) +{ + unsigned char buf[] = {0xC3}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 1); +} + +static void test_x86_push_pop(void) +{ + unsigned char push_rax[] = {0x50}; + unsigned char pop_rbx[] = {0x5B}; + ASSERT_EQ(kbox_x86_insn_length(push_rax, 1), 1); + ASSERT_EQ(kbox_x86_insn_length(pop_rbx, 1), 1); +} + +static void test_x86_rex_push(void) +{ + unsigned char buf[] = {0x48, 0x50}; /* REX.W prefix + PUSH RAX */ + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 2); +} + +static void test_x86_syscall(void) +{ + unsigned char buf[] = {0x0F, 0x05}; + ASSERT_EQ(kbox_x86_insn_length(buf, 
sizeof(buf)), 2); +} + +static void test_x86_sysenter(void) +{ + unsigned char buf[] = {0x0F, 0x34}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 2); +} + +static void test_x86_mov_eax_imm32(void) +{ + unsigned char buf[] = {0xB8, 0x01, 0x00, 0x00, 0x00}; /* MOV EAX, 1 (B8 + imm32) */ + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 5); +} + +static void test_x86_mov_al_imm8(void) +{ + unsigned char buf[] = {0xB0, 0x42}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 2); +} + +static void test_x86_movabs_rax_imm64(void) +{ + unsigned char buf[] = {0x48, 0xB8, 1, 2, 3, 4, 5, 6, 7, 8}; /* REX.W widens B8 imm to 8 bytes */ + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 10); +} + +static void test_x86_movabs_r11_imm64(void) +{ + unsigned char buf[] = {0x49, 0xBB, 1, 2, 3, 4, 5, 6, 7, 8}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 10); +} + +static void test_x86_xor_rax_rax(void) +{ + unsigned char buf[] = {0x48, 0x31, 0xC0}; /* XOR RAX, RAX */ + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 3); +} + +static void test_x86_mov_rbp_disp8(void) +{ + /* MOV [RBP-8], RAX */ + unsigned char buf[] = {0x48, 0x89, 0x45, 0xF8}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 4); +} + +static void test_x86_mov_rsp_sib_disp8(void) +{ + /* MOV [RSP+0x10], RAX -- needs SIB byte */ + unsigned char buf[] = {0x48, 0x89, 0x44, 0x24, 0x10}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 5); +} + +static void test_x86_lea_rip_relative(void) +{ + /* LEA RAX, [RIP+disp32] */ + unsigned char buf[] = {0x48, 0x8D, 0x05, 0x00, 0x01, 0x02, 0x03}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 7); +} + +static void test_x86_jcc_rel8(void) +{ + unsigned char je[] = {0x74, 0x0A}; + unsigned char jne[] = {0x75, 0x10}; + ASSERT_EQ(kbox_x86_insn_length(je, sizeof(je)), 2); + ASSERT_EQ(kbox_x86_insn_length(jne, sizeof(jne)), 2); +} + +static void test_x86_jcc_rel32(void) +{ + unsigned char buf[] = {0x0F, 0x84, 0x00, 0x01, 0x00, 0x00}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 6); +} + +static void test_x86_call_rel32(void) +{ + 
unsigned char buf[] = {0xE8, 0x00, 0x01, 0x00, 0x00}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 5); +} + +static void test_x86_jmp_rel32(void) +{ + unsigned char buf[] = {0xE9, 0x00, 0x01, 0x00, 0x00}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 5); +} + +static void test_x86_jmp_rel8(void) +{ + unsigned char buf[] = {0xEB, 0x10}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 2); +} + +static void test_x86_group1_imm8(void) +{ + /* ADD [RBP-4], imm8: 83 45 FC 01 */ + unsigned char buf[] = {0x83, 0x45, 0xFC, 0x01}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 4); +} + +static void test_x86_group1_imm32(void) +{ + /* ADD [RBP-4], imm32: 81 45 FC 01020304 */ + unsigned char buf[] = {0x81, 0x45, 0xFC, 0x01, 0x02, 0x03, 0x04}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 7); +} + +static void test_x86_enter(void) +{ + unsigned char buf[] = {0xC8, 0x00, 0x01, 0x00}; /* ENTER imm16, imm8 */ + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 4); +} + +static void test_x86_ret_imm16(void) +{ + unsigned char buf[] = {0xC2, 0x08, 0x00}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 3); +} + +static void test_x86_group3_test_imm8(void) +{ + /* TEST [RBP-1], imm8: F6 45 FF 01 */ + unsigned char buf[] = {0xF6, 0x45, 0xFF, 0x01}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 4); +} + +static void test_x86_group3_test_imm32(void) +{ + /* TEST EAX, imm32: F7 C0 01020304 */ + unsigned char buf[] = {0xF7, 0xC0, 0x01, 0x02, 0x03, 0x04}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 6); +} + +static void test_x86_group3_not(void) +{ + /* NOT EAX: F7 D0 (no immediate) */ + unsigned char buf[] = {0xF7, 0xD0}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 2); +} + +static void test_x86_call_rax(void) +{ + /* FF D0 */ + unsigned char buf[] = {0xFF, 0xD0}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 2); +} + +static void test_x86_jmp_r10(void) +{ + /* 41 FF E2 */ + unsigned char buf[] = {0x41, 0xFF, 0xE2}; + ASSERT_EQ(kbox_x86_insn_length(buf, 
sizeof(buf)), 3); +} + +static void test_x86_lock_xchg(void) +{ + /* F0 87 03 */ + unsigned char buf[] = {0xF0, 0x87, 0x03}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 3); +} + +static void test_x86_rep_movsb(void) +{ + unsigned char buf[] = {0xF3, 0xA4}; /* F3 A4: REP MOVSB */ + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 2); +} + +static void test_x86_cpuid(void) +{ + unsigned char buf[] = {0x0F, 0xA2}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 2); +} + +static void test_x86_ud2(void) +{ + unsigned char buf[] = {0x0F, 0x0B}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 2); +} + +static void test_x86_66_prefix_imm16(void) +{ + /* 66 05 01 02: ADD AX, imm16 */ + unsigned char buf[] = {0x66, 0x05, 0x01, 0x02}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 4); +} + +static void test_x86_multi_byte_nop(void) +{ + /* 0F 1F 00: NOP DWORD [RAX] */ + unsigned char nop3[] = {0x0F, 0x1F, 0x00}; + ASSERT_EQ(kbox_x86_insn_length(nop3, sizeof(nop3)), 3); + + /* 0F 1F 40 00: NOP DWORD [RAX+0] */ + unsigned char nop4[] = {0x0F, 0x1F, 0x40, 0x00}; + ASSERT_EQ(kbox_x86_insn_length(nop4, sizeof(nop4)), 4); + + /* 66 0F 1F 44 00 00: 6-byte NOP */ + unsigned char nop6[] = {0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00}; + ASSERT_EQ(kbox_x86_insn_length(nop6, sizeof(nop6)), 6); +} + +static void test_x86_truncated(void) +{ + unsigned char buf[] = {0x0F}; /* lone 0F escape byte */ + ASSERT_EQ(kbox_x86_insn_length(buf, 1), 0); /* 0 signals undecodable input */ +} + +static void test_x86_null_input(void) +{ + ASSERT_EQ(kbox_x86_insn_length(NULL, 10), 0); +} + +static void test_x86_zero_len(void) +{ + unsigned char buf[] = {0x90}; + ASSERT_EQ(kbox_x86_insn_length(buf, 0), 0); +} + +/* Critical: bytes 0F 05 inside an immediate must NOT look like syscall. 
*/ +static void test_x86_false_positive_syscall_in_imm(void) +{ + /* MOV EAX, 0x0000050F: B8 0F 05 00 00 */ + unsigned char mov[] = {0xB8, 0x0F, 0x05, 0x00, 0x00}; + ASSERT_EQ(kbox_x86_insn_length(mov, sizeof(mov)), 5); +} + +static void test_x86_false_positive_syscall_in_disp(void) +{ + /* MOV byte [rsp+0x0f], 5: C6 44 24 0F 05 */ + unsigned char mov[] = {0xC6, 0x44, 0x24, 0x0F, 0x05}; + ASSERT_EQ(kbox_x86_insn_length(mov, sizeof(mov)), 5); +} + +static void test_x86_3byte_escape_0f38(void) +{ + /* 66 0F 38 00 C1: PSHUFB XMM0, XMM1 */ + unsigned char buf[] = {0x66, 0x0F, 0x38, 0x00, 0xC1}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 5); +} + +static void test_x86_3byte_escape_0f3a(void) +{ + /* 66 0F 3A 0F C1 04: PALIGNR XMM0, XMM1, 4 */ + unsigned char buf[] = {0x66, 0x0F, 0x3A, 0x0F, 0xC1, 0x04}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 6); +} + +static void test_x86_moffs64(void) +{ + /* A1 + 8-byte address */ + unsigned char buf[] = {0xA1, 1, 2, 3, 4, 5, 6, 7, 8}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 9); +} + +static void test_x86_moffs32_67(void) +{ + /* 67 A1 + 4-byte address */ + unsigned char buf[] = {0x67, 0xA1, 1, 2, 3, 4}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 6); +} + +static void test_x86_bswap(void) +{ + unsigned char bswap32[] = {0x0F, 0xC8}; + unsigned char bswap64[] = {0x48, 0x0F, 0xC8}; + ASSERT_EQ(kbox_x86_insn_length(bswap32, sizeof(bswap32)), 2); + ASSERT_EQ(kbox_x86_insn_length(bswap64, sizeof(bswap64)), 3); +} + +static void test_x86_sib_disp32_no_base(void) +{ + /* MOV EAX, [disp32+RSI*4]: 8B 04 B5 00 10 20 30 */ + unsigned char buf[] = {0x8B, 0x04, 0xB5, 0x00, 0x10, 0x20, 0x30}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 7); +} + +static void test_x86_cmovcc(void) +{ + /* 48 0F 44 C1: CMOVE RAX, RCX */ + unsigned char buf[] = {0x48, 0x0F, 0x44, 0xC1}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 4); +} + +static void test_x86_bt_imm8(void) +{ + /* 0F BA E0 34: BT EAX, 0x34 
(byte 34 could look like sysenter) */ + unsigned char buf[] = {0x0F, 0xBA, 0xE0, 0x34}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 4); +} + +/* REX followed by legacy prefix: REX is invalidated. */ +static void test_x86_rex_then_lock(void) +{ + /* 48 F0 87 03: ignored-REX, LOCK XCHG [RBX], EAX (4 bytes) */ + unsigned char buf[] = {0x48, 0xF0, 0x87, 0x03}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 4); +} + +static void test_x86_rex_then_66(void) +{ + /* 48 66 89 C0: ignored-REX, 66 MOV AX, AX (4 bytes) */ + unsigned char buf[] = {0x48, 0x66, 0x89, 0xC0}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 4); +} + +static void test_x86_stray_rex_at_end(void) +{ + unsigned char buf[] = {0x48}; /* REX prefix with no opcode byte */ + ASSERT_EQ(kbox_x86_insn_length(buf, 1), 0); +} + +static void test_x86_66_group1_imm16(void) +{ + /* 66 81 C0 01 02: ADD AX, imm16 (5 bytes with 66h) */ + unsigned char buf[] = {0x66, 0x81, 0xC0, 0x01, 0x02}; + ASSERT_EQ(kbox_x86_insn_length(buf, sizeof(buf)), 5); +} + +void test_x86_decode_init(void) /* registers every decoder test with the runner */ +{ + TEST_REGISTER(test_x86_nop); + TEST_REGISTER(test_x86_ret); + TEST_REGISTER(test_x86_push_pop); + TEST_REGISTER(test_x86_rex_push); + TEST_REGISTER(test_x86_syscall); + TEST_REGISTER(test_x86_sysenter); + TEST_REGISTER(test_x86_mov_eax_imm32); + TEST_REGISTER(test_x86_mov_al_imm8); + TEST_REGISTER(test_x86_movabs_rax_imm64); + TEST_REGISTER(test_x86_movabs_r11_imm64); + TEST_REGISTER(test_x86_xor_rax_rax); + TEST_REGISTER(test_x86_mov_rbp_disp8); + TEST_REGISTER(test_x86_mov_rsp_sib_disp8); + TEST_REGISTER(test_x86_lea_rip_relative); + TEST_REGISTER(test_x86_jcc_rel8); + TEST_REGISTER(test_x86_jcc_rel32); + TEST_REGISTER(test_x86_call_rel32); + TEST_REGISTER(test_x86_jmp_rel32); + TEST_REGISTER(test_x86_jmp_rel8); + TEST_REGISTER(test_x86_group1_imm8); + TEST_REGISTER(test_x86_group1_imm32); + TEST_REGISTER(test_x86_enter); + TEST_REGISTER(test_x86_ret_imm16); + TEST_REGISTER(test_x86_group3_test_imm8); + TEST_REGISTER(test_x86_group3_test_imm32); + 
TEST_REGISTER(test_x86_group3_not); + TEST_REGISTER(test_x86_call_rax); + TEST_REGISTER(test_x86_jmp_r10); + TEST_REGISTER(test_x86_lock_xchg); + TEST_REGISTER(test_x86_rep_movsb); + TEST_REGISTER(test_x86_cpuid); + TEST_REGISTER(test_x86_ud2); + TEST_REGISTER(test_x86_66_prefix_imm16); + TEST_REGISTER(test_x86_multi_byte_nop); + TEST_REGISTER(test_x86_truncated); + TEST_REGISTER(test_x86_null_input); + TEST_REGISTER(test_x86_zero_len); + TEST_REGISTER(test_x86_false_positive_syscall_in_imm); + TEST_REGISTER(test_x86_false_positive_syscall_in_disp); + TEST_REGISTER(test_x86_3byte_escape_0f38); + TEST_REGISTER(test_x86_3byte_escape_0f3a); + TEST_REGISTER(test_x86_moffs64); + TEST_REGISTER(test_x86_moffs32_67); + TEST_REGISTER(test_x86_bswap); + TEST_REGISTER(test_x86_sib_disp32_no_base); + TEST_REGISTER(test_x86_cmovcc); + TEST_REGISTER(test_x86_bt_imm8); + TEST_REGISTER(test_x86_rex_then_lock); + TEST_REGISTER(test_x86_rex_then_66); + TEST_REGISTER(test_x86_stray_rex_at_end); + TEST_REGISTER(test_x86_66_group1_imm16); +}