From 51adfb7297cc256db605170380be10b431933986 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Tue, 12 May 2026 11:58:32 +0800 Subject: [PATCH] Harden runtime and tests for Apple Silicon hosts runtime_set_process_title rewrote argv[0] through the end of envp using memcpy and memset. Apple libc on M-series can lower those into cache-line-aligned stp ladders and DC ZVA stores that step past the explicit byte count when the destination tail sits at the host stack ceiling under a small RLIMIT_STACK. The overshoot crossed the guard page and delivered SIGSEGV before main() could even reach the vCPU run loop, so the binary appeared to crash with no syscall trace. Walk only the contiguous argv block (stop at the first non-contiguous argv[i] or first NULL) and overwrite it through a volatile char pointer one byte at a time, so the compiler cannot fold the loop into memset. envp is no longer touched, which also removes the need to duplicate the environment block. The argc bounds checks are unchanged; a title that does not fit in the argv block is now truncated to avail - 1 bytes instead of being skipped. main.c now reads DCZID_EL0 at startup and aborts if BS != 4. The shim emulates each trapped DC ZVA by zeroing exactly 64 bytes, while guest libc reads DCZID_EL0 directly (no MRS trap) and uses its value as the stride for memset(0). Every Apple M1 through M4 reports BS=4 (64 bytes), but a future host that bumps the granule would cause silent partial-zero corruption of guest memory. Fail closed at startup with a pointer to src/core/shim.S instead of letting the mismatch surface as data corruption later. DZP=1 hosts are also rejected. mk/shim.mk now inspects the output of OBJCOPY -O binary for residual Mach-O magics (MH_MAGIC / MH_MAGIC_64 / FAT, both byte orders). Apple ships an objcopy that leaves the Mach-O container in place; only GNU binutils objcopy reliably extracts the raw aarch64 section. The check deletes the bogus shim.bin and prints a recovery hint instead of silently embedding a Mach-O header into shim_blob.h. 
tests/lib/test-runner.sh now resolves timeout(1) at source time. macOS does not ship timeout, so the runner falls back to gtimeout on PATH or the Homebrew opt symlinks under both arm64 and x86_64 prefixes, and exposes the resolved binary through a shell function so callers keep using the bare name. TIMEOUT_BIN overrides the search. tests/test-busybox.sh probes busybox --list under elfuse once at start and exposes the applet set through BB_APPLETS. test_skip_missing_tool is overridden to consult that list, so reduced busybox builds (Debian busybox-static drops a handful of applets) yield SKIP instead of FAIL. The probe hard-fails if --list itself fails so a broken elfuse cannot silently degrade the whole suite to SKIP. tests/test-proctitle-low-stack.sh exercises the proctitle rewrite path under a capped stack and wires into make check via mk/tests.mk. The test runs through elfuse busybox echo and verifies both that the run completes and that the expected output is preserved. Close #12 --- mk/shim.mk | 9 ++++ mk/tests.mk | 11 ++++ src/main.c | 35 +++++++++++++ src/runtime/proctitle.c | 83 ++++++++++--------------------- tests/lib/test-runner.sh | 35 +++++++++++++ tests/test-busybox.sh | 41 +++++++++++++++ tests/test-proctitle-low-stack.sh | 55 ++++++++++++++++++++ 7 files changed, 213 insertions(+), 56 deletions(-) create mode 100644 tests/test-proctitle-low-stack.sh diff --git a/mk/shim.mk b/mk/shim.mk index 32ffb8a..d14ad23 100644 --- a/mk/shim.mk +++ b/mk/shim.mk @@ -9,6 +9,15 @@ $(BUILD_DIR)/shim.o: src/core/shim.S | $(BUILD_DIR) $(BUILD_DIR)/shim.bin: $(BUILD_DIR)/shim.o @echo " OBJCOPY $@" $(Q)$(OBJCOPY) -O binary $< $@ + $(Q)magic=$$(od -An -N4 -tx1 $@ | tr -d '[:space:]'); \ + case "$$magic" in \ + cffaedfe|cefaedfe|feedface|feedfacf|cafebabe|bebafeca|cafebabf|bfbafeca) \ + echo "ERROR: $@ still has a Mach-O header (magic $$magic)."; \ + echo " $(OBJCOPY) does not strip Mach-O containers in -O binary mode."; \ + echo " Install GNU binutils (brew install 
binutils) and rebuild, or"; \ + echo " set OBJCOPY=/opt/homebrew/opt/binutils/bin/objcopy."; \ + rm -f $@; exit 1;; \ + esac $(BUILD_DIR)/shim_blob.h: $(BUILD_DIR)/shim.bin @echo " GEN $@" diff --git a/mk/tests.mk b/mk/tests.mk index 2268273..cd86dc3 100644 --- a/mk/tests.mk +++ b/mk/tests.mk @@ -6,6 +6,7 @@ test-glibc-coreutils test-perf \ test-matrix test-matrix-elfuse-aarch64 test-matrix-qemu-aarch64 \ test-full test-multi-vcpu test-rwx test-sysroot-rename \ + test-proctitle-low-stack \ test-sysroot-procfs-exec test-timeout-disable \ test-sysroot-nofollow test-sysroot-chdir perf @@ -17,6 +18,8 @@ test-hello: $(ELFUSE_BIN) $(TEST_HELLO_DEP) ## Run the unit test suite plus busybox applet validation check: $(ELFUSE_BIN) $(TEST_DEPS) @bash tests/driver.sh -e $(ELFUSE_BIN) -d $(TEST_DIR) -v + @printf "\n$(BLUE)━━━ proctitle low-stack regression ━━━$(RESET)\n" + @$(MAKE) --no-print-directory test-proctitle-low-stack @printf "\n$(BLUE)━━━ busybox applet validation ━━━$(RESET)\n" @$(MAKE) --no-print-directory test-busybox @printf "\n$(BLUE)━━━ sysroot procfs exec validation ━━━$(RESET)\n" @@ -205,6 +208,14 @@ test-busybox: $(ELFUSE_BIN) $(BUSYBOX_DEPS) fi @bash tests/test-busybox.sh $(ELFUSE_BIN) $(BUSYBOX_BIN) +## Run the low-stack argv rewrite regression on busybox startup +test-proctitle-low-stack: $(ELFUSE_BIN) $(BUSYBOX_DEPS) + @if [ ! 
-x "$(BUSYBOX_BIN)" ]; then \ + printf "$(RED)✗ Busybox not found.$(RESET) Set BUSYBOX_BIN=/path/to/busybox.\n"; \ + exit 1; \ + fi + @bash tests/test-proctitle-low-stack.sh $(ELFUSE_BIN) $(BUSYBOX_BIN) + # ── Static binary integration tests ────────────────────────────── ifdef GUEST_STATIC_BINS diff --git a/src/main.c b/src/main.c index a4966ff..f82ca45 100644 --- a/src/main.c +++ b/src/main.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -75,6 +76,37 @@ static void cleanup_main_resources(guest_t *g, /* Build-time version string (generated by make into build/version.h) */ #include "version.h" +/* Verify the host CPU's DC ZVA granule matches the shim's hardcoded value. + * + * DCZID_EL0 is readable from EL0 without trapping, so guest libc reads the + * host's value directly and uses it as the stride for memset(0) loops. The + * shim emulates each trapped DC ZVA by zeroing exactly 64 bytes + * (src/core/shim.S). Apple Silicon M1..M4 report DCZID_EL0.BS=4 (64 bytes); + * any future host that advertises a different granule would cause silent + * partial-zero corruption of guest memory. Abort here so the mismatch + * surfaces at startup instead of as data corruption later. + */ +static int host_dc_zva_assert(void) +{ + uint64_t dczid; + __asm__ volatile("mrs %0, DCZID_EL0" : "=r"(dczid)); + if (dczid & (1ULL << 4)) { + log_error( + "host CPU prohibits DC ZVA (DCZID_EL0.DZP=1); cannot run " + "guests that depend on it"); + return -1; + } + unsigned bs = (unsigned) (dczid & 0xF); + if (bs != 4) { + log_error( + "host DCZID_EL0.BS=%u (%u-byte DC ZVA block) but the shim " + "emulates 64 bytes; update src/core/shim.S before running", + bs, 1u << (bs + 2)); + return -1; + } + return 0; +} + int main(int argc, char **argv) { log_init(); @@ -114,6 +146,9 @@ int main(int argc, char **argv) } } + if (host_dc_zva_assert() < 0) + return 1; + /* Parse elfuse options until the first guest argv element. 
*/ while (arg_start < argc && argv[arg_start][0] == '-') { if (!strcmp(argv[arg_start], "--verbose") || diff --git a/src/runtime/proctitle.c b/src/runtime/proctitle.c index 4e296dc..61be8c2 100644 --- a/src/runtime/proctitle.c +++ b/src/runtime/proctitle.c @@ -15,68 +15,35 @@ #include "runtime/proctitle.h" -static char *runtime_find_argv_environ_end(int argc, char **argv, char **envp) +/* Return the contiguous argv block size starting at argv[0]. + * + * Stop at the first non-contiguous argv entry and exclude the environment block + * entirely. Rewriting through envp is unsafe on Apple Silicon because libc's + * optimized memset may zero in cache-line chunks and step past the top of the + * stack when argv/env reach the stack ceiling under a small RLIMIT_STACK. + */ +static size_t runtime_argv_block_size(int argc, char **argv) { - char *end = argv[0]; + char *next = argv[0]; for (int i = 0; i < argc; i++) { - if (!argv[i]) - continue; - - char *next = argv[i] + strlen(argv[i]) + 1; - if (next > end) - end = next; - } - - for (int i = 0; envp[i]; i++) { - char *next = envp[i] + strlen(envp[i]) + 1; - if (next > end) - end = next; - } - - return end; -} - -static bool runtime_duplicate_environment(char ***out_envp) -{ - extern char **environ; - int env_count = 0; - - while (environ[env_count]) - env_count++; - - char **new_environ = - (char **) malloc((size_t) (env_count + 1) * sizeof(char *)); - if (!new_environ) - return false; - - for (int i = 0; i < env_count; i++) { - new_environ[i] = strdup(environ[i]); - if (new_environ[i]) - continue; - - for (int j = 0; j < i; j++) - free(new_environ[j]); - free(new_environ); - return false; + if (!argv[i] || argv[i] != next) + break; + next = argv[i] + strlen(argv[i]) + 1; } - new_environ[env_count] = NULL; - *out_envp = new_environ; - return true; + return (size_t) (next - argv[0]); } void runtime_set_process_title(int argc, char **argv, const char *elf_path) { - extern char **environ; - char **new_environ = NULL; size_t 
avail; const char *arch = "aarch64"; char title[256]; char thread_name[64]; size_t title_len; - if (argc <= 0 || !argv || !argv[0] || !elf_path || !environ) + if (argc <= 0 || !argv || !argv[0] || !elf_path) return; const char *slash = strrchr(elf_path, '/'); @@ -90,19 +57,23 @@ void runtime_set_process_title(int argc, char **argv, const char *elf_path) snprintf(thread_name, sizeof(thread_name), "%s (%s-linux)", bin, arch); pthread_setname_np(thread_name); - avail = - (size_t) (runtime_find_argv_environ_end(argc, argv, environ) - argv[0]); + avail = runtime_argv_block_size(argc, argv); if (avail == 0) return; - if (!runtime_duplicate_environment(&new_environ)) - return; - environ = new_environ; + /* Write the argv block with explicit byte stores through a volatile + * destination. The libc memcpy/memset on Apple Silicon are free to use + * cache-line-aligned stp/DC ZVA stores; using single-byte STRB removes + * any chance of touching the byte past avail, which on a Linux-style + * initial stack is the first character of envp[0]. + */ + size_t copy = title_len < avail ? title_len : avail - 1; + volatile char *dst = (volatile char *) argv[0]; + for (size_t i = 0; i < copy; i++) + dst[i] = title[i]; + for (size_t i = copy; i < avail; i++) + dst[i] = '\0'; - if (title_len < avail) { - memcpy(argv[0], title, title_len); - memset(argv[0] + title_len, '\0', avail - title_len); - } for (int i = 1; i < argc; i++) argv[i] = NULL; } diff --git a/tests/lib/test-runner.sh b/tests/lib/test-runner.sh index b5322e1..aa6ef6e 100644 --- a/tests/lib/test-runner.sh +++ b/tests/lib/test-runner.sh @@ -10,6 +10,41 @@ : "${TEST_LABEL_WIDTH:=14}" : "${TEST_TIMEOUT:=10}" +# Resolve a working `timeout` binary. macOS doesn't ship one, so fall back to +# GNU coreutils' gtimeout. Wrap as a function so callers keep using the bare +# name `timeout`. 
Resolution order: TIMEOUT_BIN env override, `timeout` on +# PATH, `gtimeout` on PATH, then Homebrew's stable opt symlinks for ARM and +# Intel macOS (the install prefix differs between the two). +if [ -n "${TIMEOUT_BIN:-}" ]; then + timeout() + { + "$TIMEOUT_BIN" "$@" + } +elif ! command -v timeout > /dev/null 2>&1; then + _timeout_bin= + if command -v gtimeout > /dev/null 2>&1; then + _timeout_bin=gtimeout + else + for _candidate in /opt/homebrew/opt/coreutils/bin/gtimeout \ + /usr/local/opt/coreutils/bin/gtimeout; do + if [ -x "$_candidate" ]; then + _timeout_bin="$_candidate" + break + fi + done + fi + if [ -n "$_timeout_bin" ]; then + # shellcheck disable=SC2317 # Invoked indirectly via `timeout` callers. + eval "timeout() { \"$_timeout_bin\" \"\$@\"; }" + else + echo "test-runner: no 'timeout' or 'gtimeout' in PATH." >&2 + echo " Install GNU coreutils (brew install coreutils), put gtimeout" >&2 + echo " on PATH, or set TIMEOUT_BIN=/path/to/timeout." >&2 + exit 127 + fi + unset _timeout_bin _candidate +fi + if [ -t 1 ]; then # Use ANSI-C quoting so the variables hold real ESC bytes, not the literal # 4-char "\033" sequence. Without this, callers that pass colors as printf diff --git a/tests/test-busybox.sh b/tests/test-busybox.sh index e5dbe07..9969912 100755 --- a/tests/test-busybox.sh +++ b/tests/test-busybox.sh @@ -37,9 +37,50 @@ test_tool_path() printf "%s" "$1" } +# Probe which applets this busybox binary actually carries. The Debian +# busybox-static drops a handful of applets (e.g. comm) compared to a +# full build, and tests for them must skip rather than fail. Hard-fail +# the whole suite if the probe itself fails so a broken elfuse/busybox +# does not silently degrade to "all SKIP". +if ! 
_bb_list=$(timeout "$TEST_TIMEOUT" "$ELFUSE" "$BB" --list 2>&1); then + printf "test-busybox: probing '%s --list' under elfuse failed:\n%s\n" \ + "$BB" "$_bb_list" >&2 + exit 1 +fi +BB_APPLETS=" $(printf '%s\n' "$_bb_list" | tr '\n' ' ') " +# Sanity: a usable busybox should expose at least one of these common +# applets. A reduced build may legitimately omit sh, so accept any of +# the small universal set; only fail if --list produced nothing usable. +case "$BB_APPLETS" in + *" sh "* | *" echo "* | *" cat "* | *" ls "* | *" true "*) ;; + *) + printf "test-busybox: applet list from '%s --list' looks empty or malformed:\n%s\n" \ + "$BB" "$_bb_list" >&2 + exit 1 + ;; +esac +unset _bb_list + +# Override: skip if the requested applet isn't compiled into this busybox. +# shellcheck disable=SC2329 # Invoked indirectly by tests/lib/test-runner.sh. +test_skip_missing_tool() +{ + local tool="$1" + case "$BB_APPLETS" in + *" $tool "*) return 1 ;; + esac + run_skip "$tool" "applet not in this busybox build" + return 0 +} + run_nc_http_check() { local applet="nc" output rc server_pid port_file port + + if test_skip_missing_tool "$applet"; then + return + fi + port_file=$(mktemp "${TMPDIR}/nc-http-port.XXXXXX") || { test_report skip "$applet" " (failed to create port file)" skip=$((skip + 1)) diff --git a/tests/test-proctitle-low-stack.sh b/tests/test-proctitle-low-stack.sh new file mode 100644 index 0000000..276dc70 --- /dev/null +++ b/tests/test-proctitle-low-stack.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# test-proctitle-low-stack.sh — Regress Apple Silicon argv/env stack overwrite +# +# Copyright 2026 elfuse contributors +# SPDX-License-Identifier: Apache-2.0 +# +# Usage: tests/test-proctitle-low-stack.sh + +set -euo pipefail + +ELFUSE="${1:?Usage: $0 }" +BB="${2:?Usage: $0 }" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +# shellcheck disable=SC2034 # Consumed by tests/lib/test-runner.sh. 
+TEST_TIMEOUT="${TEST_TIMEOUT:-10}" +# shellcheck source=tests/lib/test-runner.sh +source "$SCRIPT_DIR/lib/test-runner.sh" + +output= +if output="$( + # shellcheck disable=SC2016 # Positional params are expanded by the child shell. + timeout "$TEST_TIMEOUT" sh -c ' + current_stack=$(ulimit -S -s) + case "$current_stack" in + unlimited) ulimit -S -s 8192 ;; + "" | *[!0-9]*) ;; + *) + if [ "$current_stack" -gt 8192 ]; then + ulimit -S -s 8192 + fi + ;; + esac + exec "$1" "$2" echo hello + ' sh "$ELFUSE" "$BB" +)"; then + : +else + rc=$? + if [ "$rc" -eq 124 ]; then + printf "test-proctitle-low-stack: elfuse hung under low stack (timeout after %ss)\n" \ + "$TEST_TIMEOUT" >&2 + exit 1 + fi + printf "test-proctitle-low-stack: elfuse failed under low stack (rc=%d)\n" \ + "$rc" >&2 + exit "$rc" +fi + +if [ "$output" != "hello" ]; then + printf "test-proctitle-low-stack: unexpected output under low stack: %s\n" \ + "$output" >&2 + exit 1 +fi + +printf "test-proctitle-low-stack: PASS\n"