From 40a759e18d107b8b7ef78776da33a820cef97649 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 10 May 2026 23:37:30 +0800 Subject: [PATCH] Move runtime infrastructure out of low user-VA The page-table pool and EL1 shim previously sat at fixed low addresses [0x10000, 0x400000), colliding with low-linked ET_EXECs. Android linker64 binaries link at 0x200000 and the loader accepted them, but sys_mprotect, sys_munmap, sys_mmap with MAP_FIXED, and rt_sigreturn then rejected any operation on the overlapping pages with a bare EINVAL as soon as the binary tried to apply RELRO to its data segment. Relocate the page-table pool, shim code, and shim data into a 4 MiB reserve placed just below g->interp_base, in the dead zone between g->mmap_limit and g->interp_base. PT_POOL_BASE, SHIM_BASE, and SHIM_DATA_BASE become runtime guest_t fields computed by compute_infra_layout from guest_size; for 36-bit IPA the reserve sits at [60 GiB - 4 MiB, 60 GiB), for 40-bit IPA at [1020 GiB - 4 MiB, 1020 GiB). Two helpers, guest_range_hits_infra and guest_addr_in_infra, retarget the four infra guards to the new range without weakening their security intent. The 64 KiB null-guard slot at the bottom of the reserve is covered too, so guest mmap state cannot semantically reserve it either. Bump fork IPC to v9 to carry elf_load_min so nested forks from low-linked ET_EXECs see the actual load address rather than the legacy ELF_DEFAULT_BASE constant. Validate hdr.ipa_bits, hdr.guest_size, and the page-aligned in-pool location of hdr.pt_pool_next and hdr.ttbr0 in the child path before any size-derived arithmetic so a malformed header cannot underflow interp_base or misalign the page-table walker. Plumb guest_t through thread_alloc_sp_el1 and record the slot index in thread_entry_t so thread_free_sp_el1_locked can clear the bitmap from teardown contexts (thread_{deactivate,destroy_all_vcpus,ptrace_wait}) that lack a guest_t reference. Add tests/test-fork-lowbase.c, a static ET_EXEC linked at 0x200000 that exercises a nested fork. The grandchild only completes when the intermediate child preserves elf_load_min across the IPC handoff. --- Makefile | 7 ++ src/core/bootstrap.c | 28 +++++--- src/core/guest.c | 130 ++++++++++++++++++++++------------ src/core/guest.h | 144 ++++++++++++++++++++++++++++---------- src/core/shim.S | 44 ++++++------ src/runtime/fork-state.h | 3 +- src/runtime/forkipc.c | 58 ++++++++++++--- src/runtime/thread.c | 42 ++++++----- src/runtime/thread.h | 24 +++++-- src/syscall/exec.c | 21 +++--- src/syscall/mem.c | 11 +-- src/syscall/signal.c | 4 +- tests/manifest.txt | 1 + tests/test-fork-lowbase.c | 95 +++++++++++++++++++++++++ 14 files changed, 448 insertions(+), 164 deletions(-) create mode 100644 tests/test-fork-lowbase.c diff --git a/Makefile b/Makefile index 78c96bf..276b933 100644 --- a/Makefile +++ b/Makefile @@ -164,6 +164,13 @@ $(BUILD_DIR)/test-futex-waitv: tests/test-futex-waitv.c | $(BUILD_DIR) @echo " CROSS $< (with -lpthread)" $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread +# test-fork-lowbase must be a non-PIE ET_EXEC linked below ELF_DEFAULT_BASE so +# nested forks exercise elf_load_min preservation across fork IPC. 
+$(BUILD_DIR)/test-fork-lowbase: tests/test-fork-lowbase.c | $(BUILD_DIR) + @echo " CROSS $< (low-base ET_EXEC)" + $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -no-pie \ + -Wl,-Ttext-segment=0x200000 -o $@ $< + endif include mk/tests.mk diff --git a/src/core/bootstrap.c b/src/core/bootstrap.c index e7912fb..5bdd386 100644 --- a/src/core/bootstrap.c +++ b/src/core/bootstrap.c @@ -134,10 +134,10 @@ static bool build_boot_regions(mem_region_t *regions, * to the vDSO page when splitting the block; otherwise vdso_build cannot * write into it through guest_ptr. */ - if (!append_boot_region(regions, nregions, SHIM_BASE, - SHIM_BASE + shim_bin_len, MEM_PERM_RX) || - !append_boot_region(regions, nregions, SHIM_DATA_BASE, - SHIM_DATA_BASE + BLOCK_2MIB, MEM_PERM_RW) || + if (!append_boot_region(regions, nregions, g->shim_base, + g->shim_base + shim_bin_len, MEM_PERM_RX) || + !append_boot_region(regions, nregions, g->shim_data_base, + g->shim_data_base + BLOCK_2MIB, MEM_PERM_RW) || !append_boot_region(regions, nregions, VDSO_BASE, VDSO_BASE + VDSO_SIZE, MEM_PERM_RX)) { return false; @@ -234,6 +234,11 @@ int guest_bootstrap_prepare(guest_t *g, return -1; } + /* Track the lowest loaded ELF address so the legacy fork IPC path + * copies low-linked ET_EXECs (e.g. linked at 0x200000) in full. + */ + g->elf_load_min = boot->elf_info.load_min + boot->elf_load_base; + g->brk_base = PAGE_ALIGN_UP(boot->elf_info.load_max + boot->elf_load_base); if (g->brk_base < BRK_BASE_DEFAULT) g->brk_base = BRK_BASE_DEFAULT; @@ -252,15 +257,16 @@ int guest_bootstrap_prepare(guest_t *g, return -1; } - memcpy((uint8_t *) g->host_base + SHIM_BASE, shim_bin, shim_bin_len); + memcpy((uint8_t *) g->host_base + g->shim_base, shim_bin, shim_bin_len); log_debug("shim loaded at offset 0x%llx (%zu bytes)", - (unsigned long long) SHIM_BASE, shim_bin_len); + (unsigned long long) g->shim_base, shim_bin_len); invalidate_exec_segments(&boot->elf_info, g->host_base, boot->elf_load_base); invalidate_exec_segments(&boot->interp_info, g->host_base, boot->interp_base); - sys_icache_invalidate((uint8_t *) g->host_base + SHIM_BASE, shim_bin_len); + sys_icache_invalidate((uint8_t *) g->host_base + g->shim_base, + shim_bin_len); if (!build_boot_regions(regions, &nregions, g, boot, shim_bin_len)) { log_error("too many memory regions (%d >= %d)", nregions, @@ -275,10 +281,10 @@ int guest_bootstrap_prepare(guest_t *g, } g->need_tlbi = true; - guest_region_add(g, SHIM_BASE, SHIM_BASE + shim_bin_len, + guest_region_add(g, g->shim_base, g->shim_base + shim_bin_len, LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0, "[shim]"); - guest_region_add(g, SHIM_DATA_BASE, SHIM_DATA_BASE + BLOCK_2MIB, + guest_region_add(g, g->shim_data_base, g->shim_data_base + BLOCK_2MIB, LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0, "[shim-data]"); @@ -380,10 +386,10 @@ int guest_bootstrap_create_vcpu(guest_t *g, uint64_t sctlr; uint64_t sctlr_with_mmu; uint64_t tcr_value = TCR_EL1_VALUE; - uint64_t shim_ipa = guest_ipa(g, SHIM_BASE); + uint64_t shim_ipa = guest_ipa(g, g->shim_base); uint64_t entry_ipa = guest_ipa(g, boot->entry_point); uint64_t sp_ipa = guest_ipa(g, boot->stack_pointer); - uint64_t el1_sp = guest_ipa(g, SHIM_DATA_BASE + BLOCK_2MIB); + uint64_t el1_sp = guest_ipa(g, g->shim_data_base + BLOCK_2MIB); hv_vcpu_t vcpu; hv_vcpu_exit_t *vexit; diff --git a/src/core/guest.c b/src/core/guest.c index 853ab13..e9af7f8 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -118,6 +118,36 @@ static void guest_region_clip_overlay(guest_region_t *r) 
r->overlay_end = overlay_end; } +/* Compute infra reserve placement from guest_size and store derived fields in + * @g. Called from guest_init and guest_init_from_shm. + * + * Layout: a 4MiB region anchored at [interp_base - INFRA_RESERVE, interp_base) + * sits in the dead zone between mmap_limit and interp_base. PT pool, shim, and + * shim data fall at fixed offsets within the reserve (see guest.h). + * + * Returns 0 on success, -1 if the layout cannot be derived (interp_base too + * small to fit the reserve). Today guest_init enforces a 64GiB minimum so the + * underflow path is unreachable, but the explicit check guards future + * configurations and any IPC restore that bypasses size selection. + */ +static int compute_infra_layout(guest_t *g) +{ + if (g->interp_base < INFRA_RESERVE) { + log_error( + "guest: interp_base 0x%llx smaller than INFRA_RESERVE (0x%llx); " + "guest_size too small", + (unsigned long long) g->interp_base, + (unsigned long long) INFRA_RESERVE); + return -1; + } + uint64_t infra_base = g->interp_base - INFRA_RESERVE; + g->pt_pool_base = infra_base + INFRA_PT_POOL_OFF; + g->pt_pool_end = infra_base + INFRA_PT_POOL_END_OFF; + g->shim_base = infra_base + INFRA_SHIM_OFF; + g->shim_data_base = infra_base + INFRA_SHIM_DATA_OFF; + return 0; +} + /* Allocate a zeroed 4KiB page from the page table pool. * Returns GPA of the page, or 0 on pool exhaustion. * Acquires pt_lock internally. Caller typically holds mmap_lock. @@ -125,12 +155,12 @@ static void guest_region_clip_overlay(guest_region_t *r) static uint64_t pt_alloc_page(guest_t *g) { pthread_mutex_lock(&pt_lock); - if (g->pt_pool_next + PAGE_SIZE > PT_POOL_END) { + if (g->pt_pool_next + PAGE_SIZE > g->pt_pool_end) { log_error( "guest: page table pool exhausted " "(used %llu / %llu bytes)", - (unsigned long long) (g->pt_pool_next - PT_POOL_BASE), - (unsigned long long) (PT_POOL_END - PT_POOL_BASE)); + (unsigned long long) (g->pt_pool_next - g->pt_pool_base), + (unsigned long long) (g->pt_pool_end - g->pt_pool_base)); pthread_mutex_unlock(&pt_lock); return 0; } @@ -138,8 +168,8 @@ static uint64_t pt_alloc_page(guest_t *g) g->pt_pool_next += PAGE_SIZE; /* Warn at 80% pool usage so users can anticipate exhaustion */ - uint64_t used = gpa + PAGE_SIZE - PT_POOL_BASE; - uint64_t total = PT_POOL_END - PT_POOL_BASE; + uint64_t used = gpa + PAGE_SIZE - g->pt_pool_base; + uint64_t total = g->pt_pool_end - g->pt_pool_base; if (!pt_pool_warned && used > (total * 4 / 5)) { log_debug( "guest: page table pool at %llu%% " @@ -149,8 +179,8 @@ static uint64_t pt_alloc_page(guest_t *g) pt_pool_warned = true; } - /* Zero the page while still holding the lock so no other thread can - * observe a partially-zeroed page table page. + /* Zero the page while still holding the lock so no other thread can observe + * a partially-zeroed page table page. */ memset((uint8_t *) g->host_base + gpa, 0, PAGE_SIZE); pthread_mutex_unlock(&pt_lock); @@ -170,15 +200,15 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) memset(g, 0, sizeof(*g)); g->shm_fd = -1; g->ipa_base = GUEST_IPA_BASE; - g->pt_pool_next = PT_POOL_BASE; + g->elf_load_min = ELF_DEFAULT_BASE; g->brk_base = BRK_BASE_DEFAULT; g->brk_current = BRK_BASE_DEFAULT; g->mmap_next = MMAP_BASE; g->mmap_rx_next = MMAP_RX_BASE; - /* Query the maximum IPA size supported by the hardware/kernel. macOS 15+ - * on Apple Silicon reports 40 bits (1TiB). Older versions or fallback - * yields 36 bits (64GiB). + /* Query the maximum IPA size supported by the hardware/kernel. 
macOS 15+ on + * Apple Silicon reports 40 bits (1TiB). Older versions or fallback yields + * 36 bits (64GiB). */ uint32_t max_ipa = 0; hv_vm_config_get_max_ipa_size(&max_ipa); @@ -216,6 +246,9 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) */ g->interp_base = g->guest_size - 0x100000000ULL; g->mmap_limit = g->guest_size - 0x200000000ULL; + if (compute_infra_layout(g) < 0) + return -1; + g->pt_pool_next = g->pt_pool_base; /* Reserve primary address space via mmap(MAP_ANON). macOS demand-pages * this: physical pages are allocated only on first touch, so reserving up @@ -238,12 +271,12 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) * The child maps it MAP_PRIVATE, giving it an instant copy-on-write * clone of all guest memory. * - * macOS rejects MAP_PRIVATE on shm_open objects (EINVAL), but regular - * file fds support MAP_SHARED, MAP_PRIVATE, and MAP_PRIVATE|MAP_FIXED - * correctly. The file is unlinked immediately; the fd keeps it alive. - * macOS demand-pages file mappings, so untouched pages cost nothing. - * If any step fails, guest memory silently keeps the MAP_ANON mapping and - * falls back to the IPC region-copy path on fork. + * macOS rejects MAP_PRIVATE on shm_open objects (EINVAL), but regular file + * fds support MAP_SHARED, MAP_PRIVATE, and MAP_PRIVATE|MAP_FIXED correctly. + * The file is unlinked immediately; the fd keeps it alive. macOS + * demand-pages file mappings, so untouched pages cost nothing. If any step + * fails, guest memory silently keeps the MAP_ANON mapping and falls back to + * the IPC region-copy path on fork. */ { char tmppath[] = "/tmp/elfuse-XXXXXX"; @@ -322,6 +355,11 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) g->guest_size = size; g->interp_base = size - 0x100000000ULL; g->mmap_limit = size - 0x200000000ULL; + if (compute_infra_layout(g) < 0) { + hv_vm_destroy(); + return -1; + } + g->pt_pool_next = g->pt_pool_base; g->host_base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); if (g->host_base == MAP_FAILED) { @@ -362,7 +400,7 @@ int guest_init_from_shm(guest_t *g, memset(g, 0, sizeof(*g)); g->shm_fd = -1; /* Child does not own the shm */ g->ipa_base = GUEST_IPA_BASE; - g->pt_pool_next = PT_POOL_BASE; + g->elf_load_min = ELF_DEFAULT_BASE; g->brk_base = BRK_BASE_DEFAULT; g->brk_current = BRK_BASE_DEFAULT; g->mmap_next = MMAP_BASE; @@ -373,6 +411,9 @@ int guest_init_from_shm(guest_t *g, /* Compute layout limits (same formula as guest_init) */ g->interp_base = size - 0x100000000ULL; g->mmap_limit = size - 0x200000000ULL; + if (compute_infra_layout(g) < 0) + return -1; + g->pt_pool_next = g->pt_pool_base; /* Map the shm fd MAP_PRIVATE: copy-on-write semantics. Reads see * the parent's frozen snapshot; writes are private to this process. 
@@ -836,21 +877,21 @@ void guest_reset(guest_t *g) } /* Zero page table pool (not tracked in region array) */ - if (g->pt_pool_next > PT_POOL_BASE) - memset((uint8_t *) g->host_base + PT_POOL_BASE, 0, - g->pt_pool_next - PT_POOL_BASE); + if (g->pt_pool_next > g->pt_pool_base) + memset((uint8_t *) g->host_base + g->pt_pool_base, 0, + g->pt_pool_next - g->pt_pool_base); /* Zero shim code + data (not tracked in region array by guest_reset * callers; shim regions are added AFTER reset by the exec path) */ - memset((uint8_t *) g->host_base + SHIM_BASE, 0, - SHIM_DATA_BASE + BLOCK_2MIB - SHIM_BASE); + memset((uint8_t *) g->host_base + g->shim_base, 0, + g->shim_data_base + BLOCK_2MIB - g->shim_base); /* Reset allocation state */ guest_pt_gen_bump(g); guest_tlb_flush(); __atomic_store_n(&pt_pool_warned, false, __ATOMIC_RELAXED); - g->pt_pool_next = PT_POOL_BASE; + g->pt_pool_next = g->pt_pool_base; g->brk_base = BRK_BASE_DEFAULT; g->brk_current = BRK_BASE_DEFAULT; g->mmap_next = MMAP_BASE; @@ -861,6 +902,7 @@ void guest_reset(guest_t *g) g->mmap_rx_gap_hint = 0; g->ttbr0 = 0; g->need_tlbi = false; + g->elf_load_min = ELF_DEFAULT_BASE; /* Clear semantic region tracking (will be re-populated after exec) */ guest_region_clear(g); @@ -875,34 +917,36 @@ int guest_get_used_regions(const guest_t *g, { int n = 0; - /* Page table pool */ - if (n < max && g->pt_pool_next > PT_POOL_BASE) { - out[n].offset = PT_POOL_BASE; - out[n].size = g->pt_pool_next - PT_POOL_BASE; + /* Page table pool (high IPA, just below interp_base) */ + if (n < max && g->pt_pool_next > g->pt_pool_base) { + out[n].offset = g->pt_pool_base; + out[n].size = g->pt_pool_next - g->pt_pool_base; n++; } - /* Shim code */ + /* Shim code (high IPA) */ if (n < max && shim_size > 0) { - out[n].offset = SHIM_BASE; + out[n].offset = g->shim_base; out[n].size = shim_size; n++; } - /* Shim data/stack (full 2MiB block) */ + /* Shim data/stack (full 2MiB block, high IPA) */ if (n < max) { - out[n].offset = SHIM_DATA_BASE; + out[n].offset = g->shim_data_base; out[n].size = BLOCK_2MIB; n++; } - /* ELF + brk region: from ELF_DEFAULT_BASE to brk_current. - * guest memory does not track the exact ELF load range, but static musl - * binaries always load at or above ELF_DEFAULT_BASE (0x400000). + /* ELF + brk region: from elf_load_min (set by ELF loader) to brk_current. + * The lower bound is the actual ELF load address, not ELF_DEFAULT_BASE: + * ET_EXECs linked below 0x400000 (e.g. at 0x200000) have segments below the + * legacy default and would otherwise be silently dropped from the legacy + * fork-IPC copy. */ - if (n < max && g->brk_current > ELF_DEFAULT_BASE) { - out[n].offset = ELF_DEFAULT_BASE; - out[n].size = g->brk_current - ELF_DEFAULT_BASE; + if (n < max && g->brk_current > g->elf_load_min) { + out[n].offset = g->elf_load_min; + out[n].size = g->brk_current - g->elf_load_min; n++; } @@ -1167,9 +1211,9 @@ void guest_region_remove(guest_t *g, uint64_t start, uint64_t end) right->offset += (end - r->start); right->start = end; if (r->backing_fd >= 0) { - /* A dup failure leaves backing_fd=-1, silently converting - * this half to anonymous semantics (msync and MADV_DONTNEED - * skip regions with backing_fd<0). Propagating the error would + /* A dup failure leaves backing_fd=-1, silently converting this + * half to anonymous semantics (msync and MADV_DONTNEED skip + * regions with backing_fd<0). Propagating the error would * require making all region split callers (mprotect, munmap) * fallible. 
*/ @@ -1513,8 +1557,8 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n) uint64_t *l0 = pt_at(g, l0_gpa); /* For each region, determine which 2MiB blocks need mapping. - * Identity-mapped: VA == GPA, so L0/L1/L2 indices and the block - * descriptor output address are both derived from gpa_start + ipa_base. + * Identity-mapped: VA == GPA, so L0/L1/L2 indices and the block descriptor + * output address are both derived from gpa_start + ipa_base. */ for (int r = 0; r < n; r++) { uint64_t gpa_start = ALIGN_2MIB_DOWN(regions[r].gpa_start); diff --git a/src/core/guest.h b/src/core/guest.h index 7425d01..c087d89 100644 --- a/src/core/guest.h +++ b/src/core/guest.h @@ -26,16 +26,35 @@ /* Memory layout constants. * - * Guest memory size is determined dynamically from the VM's IPA width - * (36-bit = 64GiB on M2, 40-bit = 1TiB on M3+). See guest.c for the - * runtime probe that selects the correct size. - */ - -#define PT_POOL_BASE 0x00010000ULL /* Page table pool start */ -#define PT_POOL_END 0x00100000ULL /* Page table pool end (960KiB) */ -#define SHIM_BASE 0x00100000ULL /* Shim code (2MiB block, RX) */ -#define SHIM_DATA_BASE 0x00200000ULL /* Shim stack/data (2MiB block, RW) */ -#define ELF_DEFAULT_BASE 0x00400000ULL /* Typical ELF load base */ + * Guest memory size is determined dynamically from the VM's IPA width (36-bit + * = 64GiB on M2, 40-bit = 1TiB on M3+). See guest.c for the runtime probe that + * selects the correct size. + * + * Infrastructure layout (page-table pool, shim code, shim data): a 4MiB reserve + * placed just below g->interp_base, in the dead zone between g->mmap_limit and + * g->interp_base. The exact base is computed at guest_init time and stored in + * guest_t.pt_pool_base / pt_pool_end / shim_base / shim_data_base. EL0 user + * binaries are therefore free to load at low addresses (down to 64KiB) without + * colliding with the runtime. + * + * Internal layout within the 4MiB reserve: + * +0x000000 .. +0x010000 unused (64KiB null guard) + * +0x010000 .. +0x100000 page-table pool (960KiB, RW) + * +0x100000 .. +0x200000 shim code slot (1MiB, RX). Sits in the same + * 2MiB L2 block as the PT pool, so that block + * is split into 4KiB L3 pages (mixed RX/RW). + * +0x200000 .. +0x400000 shim data + EL1 stack (full 2MiB L2 block, RW) + */ + +/* Total size of the runtime infrastructure reserve. Shifted to + * [g->interp_base - INFRA_RESERVE, g->interp_base) at guest_init. + */ +#define INFRA_RESERVE 0x00400000ULL /* 4MiB */ +#define INFRA_PT_POOL_OFF 0x00010000ULL /* offset of PT pool */ +#define INFRA_PT_POOL_END_OFF 0x00100000ULL /* PT pool end (960KiB) */ +#define INFRA_SHIM_OFF 0x00100000ULL /* offset of shim code slot */ +#define INFRA_SHIM_DATA_OFF 0x00200000ULL /* offset of shim data slot */ +#define ELF_DEFAULT_BASE 0x00400000ULL /* Typical ELF load base */ #define PIE_LOAD_BASE 0x00400000ULL /* PIE (ET_DYN) executable base (4MiB) */ #define BRK_BASE_DEFAULT 0x01000000ULL /* Default brk start (16MiB) */ @@ -46,36 +65,36 @@ #define STACK_TOP_DEFAULT 0x08000000ULL #define STACK_GUARD_SIZE 0x00001000ULL /* 4KiB guard at stack bottom */ -/* mmap RX region for PROT_EXEC; placed below 8GiB to leave the high mmap - * region clear for runtimes that demand a specific minimum heap address. +/* mmap RX region for PROT_EXEC; placed below 8GiB to leave the high mmap region + * clear for runtimes that demand a specific minimum heap address. */ #define MMAP_RX_BASE 0x10000000ULL -/* Initial pre-mapped mmap RX end. 
Only covers the first 2MiB block; - * additional pages are mapped lazily by guest_extend_page_tables() - * when sys_mmap needs more PROT_EXEC space. Reduces startup time - * and memory pressure for small binaries that never call mmap. +/* Initial pre-mapped mmap RX end. Only covers the first 2MiB block; additional + * pages are mapped lazily by guest_extend_page_tables() when sys_mmap needs + * more PROT_EXEC space. Reduces startup time and memory pressure for small + * binaries that never call mmap. */ #define MMAP_RX_INITIAL_END (MMAP_RX_BASE + 0x200000ULL) /* +2MiB */ /* mmap RW region starts at 8GiB to match real Linux address layouts. */ #define MMAP_BASE 0x200000000ULL -/* Initial pre-mapped mmap RW end. Only covers the first 2MiB block; - * additional pages are mapped lazily by guest_extend_page_tables(). +/* Initial pre-mapped mmap RW end. Only covers the first 2MiB block; additional + * pages are mapped lazily by guest_extend_page_tables(). */ #define MMAP_INITIAL_END (MMAP_BASE + 0x200000ULL) /* +2MiB */ -/* mmap_limit and interp_base are computed dynamically from guest_size - * in main.c and stored in guest_t. +/* mmap_limit and interp_base are computed dynamically from guest_size in main.c + * and stored in guest_t. */ #define BLOCK_2MIB (2ULL * 1024 * 1024) /* IPA base: guest memory is mapped at this IPA in the hypervisor. * All guest physical addresses = GUEST_IPA_BASE + offset. - * Must be 0 so that guest virtual addresses match ELF link addresses - * (e.g. 0x400000). A non-zero IPA base would require all ELF binaries - * to be linked at IPA_BASE+vaddr, which is impractical. + * Must be 0 so that guest virtual addresses match ELF link addresses (e.g. + * 0x400000). A non-zero IPA base would require all ELF binaries to be linked at + * IPA_BASE+vaddr, which is impractical. */ #define GUEST_IPA_BASE 0x0ULL @@ -100,10 +119,10 @@ typedef struct { /* Maximum number of tracked memory regions (heap/stack/mmap/ELF/etc.). * Adjacent anonymous regions with matching permissions are automatically - * coalesced (see regions_mergeable in core/guest.c). Threaded runtimes - * create many thread stacks with guard pages; with coalescing, typical - * workloads use ~50 regions. 4096 provides ample headroom for edge cases - * (many interleaved guard pages, file-backed mappings, etc.). + * coalesced (see regions_mergeable in core/guest.c). Threaded runtimes create + * many thread stacks with guard pages; with coalescing, typical workloads use + * ~50 regions. 4096 provides ample headroom for edge cases (many interleaved + * guard pages, file-backed mappings, etc.). */ #define GUEST_MAX_REGIONS 4096 @@ -141,14 +160,16 @@ typedef struct { bool shared; /* MAP_SHARED (writes should propagate) */ bool noreserve; /* MAP_NORESERVE: PTEs deferred until fault */ bool overlay_active; /* Region has a live host MAP_FIXED|MAP_SHARED overlay - * of backing_fd at host_base+start. The kernel's - * page cache keeps it coherent with the file and - * with peer overlays of the same file, so msync - * skips the snapshot-style pwrite-the-diff and - * refresh-from-file paths for these regions. */ + * of backing_fd at host_base+start. The kernel's page + * cache keeps it coherent with the file and with peer + * overlays of the same file, so msync skips the + * snapshot-style pwrite-the-diff and refresh-from-file + * paths for these regions. + */ uint64_t overlay_start; /* Host-page-aligned overlay start. May extend * outside [start, end) when only part of a host - * page is guest-visible. */ + * page is guest-visible. 
+ */ uint64_t overlay_end; /* Host-page-aligned overlay end (exclusive). */ char name[64]; /* Label: "[heap]", "[stack]", ELF path, etc. */ } guest_region_t; @@ -162,12 +183,30 @@ typedef struct { uint64_t ipa_base; /* IPA base for hv_vm_map (GUEST_IPA_BASE) */ uint64_t mmap_limit; /* Max mmap address (computed from guest_size) */ - uint64_t interp_base; /* Dynamic linker load base (from guest_size) */ + uint64_t interp_base; /* Dynamic linker load base (from guest_size) */ + + /* Runtime-infrastructure reserve. Computed at guest_init time and placed at + * [interp_base - INFRA_RESERVE, interp_base). All four values are derived + * from the same base, so the inequalities + * pt_pool_base < pt_pool_end <= shim_base < shim_data_base + * always hold, and shim_data_base + BLOCK_2MIB == interp_base. + */ + uint64_t pt_pool_base; /* Page-table pool start (high IPA) */ + uint64_t pt_pool_end; /* Page-table pool end (exclusive) */ + uint64_t shim_base; /* Shim code (2MiB block, RX) */ + uint64_t shim_data_base; /* Shim stack/data (2MiB block, RW) */ + uint64_t pt_pool_next; /* Next free page table page in pool */ - uint64_t brk_base; /* Initial brk (set after ELF load) */ - uint64_t brk_current; /* Current brk position */ - uint64_t stack_base; /* Bottom of stack region (dynamic, above brk) */ - uint64_t stack_top; /* Top of stack (stack grows down from here) */ + /* Lowest virtual address of the loaded ELF (executable image, not the + * dynamic linker). Set by bootstrap and re-set by execve. Used by the + * legacy fork IPC path to bound the ELF + brk copy chunk; it must cover + * ET_EXECs linked below ELF_DEFAULT_BASE (e.g. 0x200000). + */ + uint64_t elf_load_min; + uint64_t brk_base; /* Initial brk (set after ELF load) */ + uint64_t brk_current; /* Current brk position */ + uint64_t stack_base; /* Bottom of stack region (dynamic, above brk) */ + uint64_t stack_top; /* Top of stack (stack grows down from here) */ uint64_t mmap_next; /* RW mmap high-water mark for fork IPC snapshots */ uint64_t mmap_end; /* Current page-table-covered RW mmap limit */ @@ -220,6 +259,35 @@ static inline uint64_t guest_ipa(const guest_t *g, uint64_t offset) return g->ipa_base + offset; } +/* True iff [start, end) overlaps the runtime infra reserve + * [interp_base - INFRA_RESERVE, interp_base). Covers the full 4 MiB + * reserve including the 64 KiB null-guard slot at the bottom (which + * has no PT entries but must not become semantically reachable from + * guest mmap state). Used by sys_mmap (MAP_FIXED), sys_munmap, and + * sys_mprotect to reject guest attempts to touch page tables, shim + * code, or shim data through the syscall surface. + */ +static inline bool guest_range_hits_infra(const guest_t *g, + uint64_t start, + uint64_t end) +{ + uint64_t infra_lo = g->interp_base - INFRA_RESERVE; + uint64_t infra_hi = g->interp_base; + return start < infra_hi && end > infra_lo; +} + +/* True iff a single address (PC, hint, etc.) falls inside the infra reserve. + * Used by rt_sigreturn to reject forged frames that would redirect EL0 PC into + * EL1 shim or page-table memory. Covers the full 4 MiB reserve, matching + * guest_range_hits_infra. + */ +static inline bool guest_addr_in_infra(const guest_t *g, uint64_t addr) +{ + uint64_t infra_lo = g->interp_base - INFRA_RESERVE; + uint64_t infra_hi = g->interp_base; + return addr >= infra_lo && addr < infra_hi; +} + /* API */ /* Allocate guest memory, create VM, map to hypervisor. 
diff --git a/src/core/shim.S b/src/core/shim.S index fe82f9a..62328d3 100644 --- a/src/core/shim.S +++ b/src/core/shim.S @@ -4,11 +4,12 @@ * Copyright 2025 Moritz Angermann, zw3rk pte. ltd. * SPDX-License-Identifier: Apache-2.0 * - * Loaded at SHIM_BASE (0x100000). Runs at EL1. - * All system registers (VBAR, MAIR, TCR, TTBR0, SCTLR, etc.) are - * configured by the host before vCPU start. The shim entry point - * transitions to EL0 via ERET. Exception vectors handle SVC #0 - * (Linux syscall) forwarding to the host via HVC #5. + * Loaded at g->shim_base (inside the 4MiB infra reserve placed just below + * g->interp_base, computed at guest_init time). Runs at EL1. + * All system registers (VBAR, MAIR, TCR, TTBR0, SCTLR, etc.) are configured by + * the host before vCPU start. The shim entry point transitions to EL0 via ERET. + * Exception vectors handle SVC #0 (Linux syscall) forwarding to the host via + * HVC #5. * * HVC protocol: * #0 Normal exit (x0 = exit code) @@ -141,10 +142,10 @@ .globl _start _start: /* Host has configured all system registers EXCEPT SCTLR.M (MMU enable). - * Apple's Hypervisor.framework requires the MMU to be enabled DURING - * vCPU execution (via HVC #4), not before hv_vcpu_run(). Setting - * SCTLR.M=1 via hv_vcpu_set_sys_reg before start causes permission - * faults on the first instruction fetch. + * Apple's Hypervisor.framework requires the MMU to be enabled DURING vCPU + * execution (via HVC #4), not before hv_vcpu_run(). Setting SCTLR.M=1 via + * hv_vcpu_set_sys_reg before start causes permission faults on the first + * instruction fetch. * * Host passes the final SCTLR value (with M=1) in X0 before start. */ @@ -173,8 +174,8 @@ _start: * * bad_exception vectors: mov x5, #offset + b bad_exception * X5 carries the vector offset for host-side debugging. - * This is safe because bad_exception halts, so no register - * preservation needed. + * This is safe because bad_exception halts, so no register preservation + * needed. * * svc_handler vectors: b svc_handler (NO mov x5!) * These MUST NOT clobber any GPR before svc_handler saves them. The Linux @@ -327,14 +328,14 @@ handle_sysreg_trap: b 3f 4: /* System instruction: trapped cache/data operations. - * DC ZVA (Data Cache Zero by VA) is trapped when HCR_EL2.TDZ=1 - * despite SCTLR.DZE=1. DC ZVA zeroes a cache-line-sized block (64 bytes on - * Apple Silicon). JIT translators and libc use it as fast memset(0). + * DC ZVA (Data Cache Zero by VA) is trapped when HCR_EL2.TDZ=1 despite + * SCTLR.DZE=1. DC ZVA zeroes a cache-line-sized block (64 bytes on Apple + * Silicon). JIT translators and libc use it as fast memset(0). * Failure to emulate leaves stale data. * - * DC CVAU, IC IVAU, and other cache maintenance instructions trap - * here despite SCTLR.UCI=1 (HVF sets HCR_EL2.TPU=1). The shim executes - * IC IALLU as safety net for I-cache coherency after any of these. + * DC CVAU, IC IVAU, and other cache maintenance instructions trap here + * despite SCTLR.UCI=1 (HVF sets HCR_EL2.TPU=1). The shim executes IC IALLU + * as safety net for I-cache coherency after any of these. * * ISS encoding for system instructions: * Op0[21:20] Op2[19:17] Op1[16:14] CRn[13:10] Rt[9:5] CRm[4:1] Dir[0] @@ -372,9 +373,9 @@ handle_sysreg_trap: stp xzr, xzr, [x11, #48] 6: /* Forward to host for counting, then I-cache maintenance */ -5: /* Pass the source register value to the host. Cache maintenance - * instructions ignore it, but MSR writes (notably TPIDR_EL0 for - * userspace TLS setup) need the original Xt value. 
+5: /* Pass the source register value to host. Cache maintenance instructions + * ignore it, but MSR writes (notably TPIDR_EL0 for userspace TLS setup) + * need the original Xt value. */ ubfx x10, x9, #5, #5 /* Rt = ISS[9:5] */ cmp x10, #31 @@ -400,7 +401,6 @@ handle_sysreg_trap: * 1. Permission fault (IFSC[5:2]=0x03): W^X demand toggle for JIT. * Code pages are initially RW; execution triggers permission fault. * Ask host to flip to RX via HVC #9, flush TLB, retry. - * * 2. Translation fault (IFSC[5:2]=0x01) or other non-permission fault: Address * not mapped in page tables. A real Linux kernel delivers SIGSEGV * (si_code=SEGV_MAPERR) to the process. JIT translators may use SIGSEGV for @@ -430,11 +430,9 @@ handle_inst_abort: * Three cases: * 1. Write permission fault (WnR=1, DFSC[5:2]=0x03): W^X toggle. * Code page is RX; JIT write triggers permission fault. Flip to RW. - * * 2. Read/write translation fault (DFSC[5:2]=0x01): Address not mapped. * Deliver SIGSEGV (SEGV_MAPERR) via host. Programs may use SIGSEGV * handlers for lazy allocation or guard pages. - * * 3. Read permission fault (WnR=0, DFSC[5:2]=0x03): Permission denied. * Deliver SIGSEGV (SEGV_ACCERR) via host. */ diff --git a/src/runtime/fork-state.h b/src/runtime/fork-state.h index c8721fb..4b45c61 100644 --- a/src/runtime/fork-state.h +++ b/src/runtime/fork-state.h @@ -18,7 +18,7 @@ /* Magic values for IPC frame delimiters */ #define IPC_MAGIC_HEADER 0x454C464BU /* "ELFK" */ #define IPC_MAGIC_SENTINEL 0x454C4F4BU /* "ELOK" */ -#define IPC_VERSION 8 /* v8: session / process group state */ +#define IPC_VERSION 9 /* v9: preserve elf_load_min */ typedef struct { uint32_t magic; @@ -27,6 +27,7 @@ typedef struct { uint32_t has_shm; int64_t child_pid, parent_pid; uint64_t guest_size; + uint64_t elf_load_min; uint64_t brk_base, brk_current; uint64_t stack_base; uint64_t stack_top; diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index 0803503..d115cb3 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -110,6 +110,32 @@ int fork_child_main(int ipc_fd, absock_set_namespace_id(hdr.absock_namespace_id); proc_set_session(hdr.sid, hdr.pgid); + /* Validate header layout fields before any size-derived arithmetic. + * guest_init / guest_init_from_shm derive interp_base, mmap_limit, and + * the high-IPA infra reserve from these inputs; underflow on tiny or + * malformed values would place pt_pool_base and friends near UINT64_MAX, + * which then feeds unchecked host-buffer offsets in pt_alloc_page and + * pt_at. Reject impossible layouts up front. + * + * Lower bound: guest_size must leave room for both mmap_limit + * (size - 8 GiB) and interp_base (size - 4 GiB) plus the 4 MiB infra + * reserve below it. 8 GiB satisfies all three with margin. + * Upper bound: guest_size must fit in the negotiated IPA width. + * IPA bits: 36 (Apple M2) and 40 (M3+) are the supported widths. + */ + if (hdr.ipa_bits < 36 || hdr.ipa_bits > 40) { + log_error("fork-child: invalid ipa_bits %u", (unsigned) hdr.ipa_bits); + close(ipc_fd); + return 1; + } + if (hdr.guest_size < 0x200000000ULL || + hdr.guest_size > (1ULL << hdr.ipa_bits)) { + log_error("fork-child: invalid guest_size 0x%llx (ipa_bits=%u)", + (unsigned long long) hdr.guest_size, (unsigned) hdr.ipa_bits); + close(ipc_fd); + return 1; + } + /* Create guest memory before receiving state so all incoming offsets can be * bounds-checked against the negotiated guest size. 
*/ @@ -144,11 +170,20 @@ int fork_child_main(int ipc_fd, } /* Restore allocator/page-table cursors before mmap/brk can run in child. - * Validate pt_pool_next and ttbr0: both must reside within the page table - * pool [PT_POOL_BASE, PT_POOL_END). Accepting out-of-range values from IPC - * would corrupt page table allocation or translation walks. + * Validate pt_pool_next and ttbr0 against the child's own page-table + * pool, which the child just computed from hdr.guest_size + + * hdr.ipa_bits via compute_infra_layout. + * + * Range alone is not enough: pt_alloc_page advances pt_pool_next in + * GUEST_PAGE_SIZE quanta, and pt_at converts page-table GPAs straight + * into host-buffer pointers. An unaligned value passes the [base, end) + * gate but then misaligns the walker. Require: + * - pt_pool_next page-aligned relative to pt_pool_base + * - ttbr0 strictly inside the in-use pool [pt_pool_base, pt_pool_next) + * (parent must have allocated the L0 page) and page-aligned. */ - if (hdr.pt_pool_next < PT_POOL_BASE || hdr.pt_pool_next > PT_POOL_END) { + if (hdr.pt_pool_next < g.pt_pool_base || hdr.pt_pool_next > g.pt_pool_end || + ((hdr.pt_pool_next - g.pt_pool_base) % GUEST_PAGE_SIZE) != 0) { log_error("fork-child: invalid pt_pool_next 0x%llx", (unsigned long long) hdr.pt_pool_next); guest_destroy(&g); @@ -156,7 +191,8 @@ int fork_child_main(int ipc_fd, return 1; } uint64_t ttbr0_off = hdr.ttbr0 - g.ipa_base; - if (ttbr0_off < PT_POOL_BASE || ttbr0_off >= PT_POOL_END) { + if (ttbr0_off < g.pt_pool_base || ttbr0_off >= hdr.pt_pool_next || + ((ttbr0_off - g.pt_pool_base) % GUEST_PAGE_SIZE) != 0) { log_error("fork-child: invalid ttbr0 0x%llx", (unsigned long long) hdr.ttbr0); guest_destroy(&g); @@ -165,6 +201,7 @@ int fork_child_main(int ipc_fd, } g.brk_base = hdr.brk_base; g.brk_current = hdr.brk_current; + g.elf_load_min = hdr.elf_load_min; g.stack_base = hdr.stack_base; g.stack_top = hdr.stack_top; g.mmap_next = hdr.mmap_next; @@ -379,13 +416,12 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu, if (current_thread) t->blocked = current_thread->blocked; - /* Allocate per-thread EL1 stack */ - uint64_t child_sp_el1 = thread_alloc_sp_el1(); + /* Allocate per-thread EL1 stack (records both sp and slot in t). */ + uint64_t child_sp_el1 = thread_alloc_sp_el1(g, t); if (child_sp_el1 == 0) { thread_deactivate(t); return -LINUX_ENOMEM; } - t->sp_el1 = child_sp_el1; /* Capture parent register state before spawning worker. * HVF binds vCPU to the creating thread, so the worker must call @@ -656,13 +692,12 @@ static int64_t sys_clone_vm(hv_vcpu_t parent_vcpu, if (current_thread) t->blocked = current_thread->blocked; - /* Allocate per-thread EL1 stack */ - uint64_t child_sp_el1 = thread_alloc_sp_el1(); + /* Allocate per-thread EL1 stack (records both sp and slot in t). 
*/ + uint64_t child_sp_el1 = thread_alloc_sp_el1(g, t); if (child_sp_el1 == 0) { thread_deactivate(t); return -LINUX_ENOMEM; } - t->sp_el1 = child_sp_el1; /* Capture parent register state */ uint64_t parent_elr = vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_ELR_EL1); @@ -1085,6 +1120,7 @@ int64_t sys_clone(hv_vcpu_t vcpu, .child_pid = child_guest_pid, .parent_pid = proc_get_pid(), .guest_size = g->guest_size, + .elf_load_min = g->elf_load_min, .brk_base = g->brk_base, .brk_current = g->brk_current, .stack_base = g->stack_base, diff --git a/src/runtime/thread.c b/src/runtime/thread.c index 36d6a2d..e024ec8 100644 --- a/src/runtime/thread.c +++ b/src/runtime/thread.c @@ -21,7 +21,7 @@ #include "runtime/thread.h" #include "debug/log.h" -#include "core/guest.h" /* SHIM_DATA_BASE, BLOCK_2MIB, GUEST_IPA_BASE */ +#include "core/guest.h" /* guest_t (shim_data_base/ipa_base), BLOCK_2MIB */ #include "hvutil.h" /* vcpu_get_gpr, vcpu_get_sysreg */ /* From syscall/signal.h, included here directly to avoid pulling in @@ -38,8 +38,15 @@ static int thread_can_add_deferred_unmap_locked(thread_entry_t *t, uint64_t start, uint64_t end); -/* Top of the EL1 exception stack region (one 4KiB slot per thread) */ -#define SP_EL1_TOP (GUEST_IPA_BASE + SHIM_DATA_BASE + BLOCK_2MIB) +/* Top of the EL1 exception stack region (one 4KiB slot per thread). + * The shim data block sits at high IPA, computed at guest_init time and + * stored in g->shim_data_base; the top of the EL1 stacks is the next + * 2MiB boundary above that. Caller must hold a guest_t reference. + */ +static inline uint64_t sp_el1_top(const guest_t *g) +{ + return g->ipa_base + g->shim_data_base + BLOCK_2MIB; +} /* Thread table. */ @@ -103,6 +110,7 @@ void thread_register_main(hv_vcpu_t vcpu, t->host_thread = pthread_self(); t->clear_child_tid = 0; t->sp_el1 = sp_el1; + t->sp_el1_slot = 0; /* Main thread always owns slot 0 */ t->active = 1; t->altstack_flags = LINUX_SS_DISABLE; t->on_altstack = false; @@ -138,6 +146,7 @@ thread_entry_t *thread_alloc(int64_t tid, pthread_cond_destroy(&t->resume_cond); } memset(t, 0, sizeof(*t)); + t->sp_el1_slot = -1; /* No SP_EL1 yet; thread_alloc_sp_el1 fills this */ t->guest_tid = tid; if (stack_start < stack_end) { t->stack_map_start = stack_start; @@ -156,18 +165,15 @@ thread_entry_t *thread_alloc(int64_t tid, } /* Free an SP_EL1 slot for reuse. Must be called with thread_lock held. - * Derives the slot index from the IPA and clears the bitmask bit. + * Reads the slot index recorded at allocation time and clears the bit. 
*/ -static void thread_free_sp_el1_locked(uint64_t sp) +static void thread_free_sp_el1_locked(thread_entry_t *t) { - if (sp == 0) - return; - uint64_t top = SP_EL1_TOP; - if (sp > top) - return; - int slot = (int) ((top - sp) / 4096); + int slot = t->sp_el1_slot; if (RANGE_CHECK(slot, 0, MAX_THREADS)) sp_el1_allocated &= ~BIT64(slot); + t->sp_el1 = 0; + t->sp_el1_slot = -1; } static void thread_ptrace_cleanup_locked(thread_entry_t *t) @@ -205,7 +211,7 @@ void thread_deactivate(thread_entry_t *t) } /* Free SP_EL1 slot so it can be reused by future threads */ - thread_free_sp_el1_locked(t->sp_el1); + thread_free_sp_el1_locked(t); t->active = 0; atomic_fetch_sub(&active_thread_count, 1); @@ -272,7 +278,7 @@ int thread_count_active_vm_clones(void) return count; } -uint64_t thread_alloc_sp_el1(void) +uint64_t thread_alloc_sp_el1(const guest_t *g, thread_entry_t *t) { uint64_t sp = 0; @@ -284,12 +290,14 @@ log_error("thread: SP_EL1 slots exhausted"); } else { int slot = bit_ctz64(free_mask); - /* Main thread's SP_EL1 = IPA_BASE + SHIM_DATA_BASE + 2MiB. + /* Main thread's SP_EL1 sits at the top of the shim data block. * Each subsequent thread is 4KiB below. */ - uint64_t top = SP_EL1_TOP; + uint64_t top = sp_el1_top(g); sp = top - (uint64_t) slot * 4096; sp_el1_allocated |= BIT64(slot); + t->sp_el1 = sp; + t->sp_el1_slot = slot; } pthread_mutex_unlock(&thread_lock); @@ -357,7 +365,7 @@ void thread_destroy_all_vcpus(void) continue; hv_vcpu_destroy(t->vcpu); t->vcpu = 0; - thread_free_sp_el1_locked(t->sp_el1); + thread_free_sp_el1_locked(t); t->active = 0; /* Do NOT destroy condvars. Same race as thread_deactivate: a waiter * woken by an earlier broadcast may still reference the condvar. */ @@ -892,7 +900,7 @@ int64_t thread_ptrace_wait(int64_t tracer_tid, /* Destroy condvars after the last waiter returns from * pthread_cond_wait(). */ - thread_free_sp_el1_locked(t->sp_el1); + thread_free_sp_el1_locked(t); t->active = 0; atomic_fetch_sub(&active_thread_count, 1); t->ptrace_cleanup_pending = true; diff --git a/src/runtime/thread.h b/src/runtime/thread.h index 25ab5ff..a8d35ab 100644 --- a/src/runtime/thread.h +++ b/src/runtime/thread.h @@ -10,8 +10,8 @@ * O(1) access to the current thread's entry from any syscall handler. * * SP_EL1 allocation: each thread gets a 4KiB EL1 exception stack carved from - * the shim data region (SHIM_DATA_BASE + 2MiB). Thread 0 (main) gets the top, - * thread N gets offset -(N * 4096). + * the shim data region (g->shim_data_base + 2MiB). Thread 0 (main) gets the + * top, thread N gets offset -(N * 4096). */ #pragma once @@ -20,6 +20,8 @@ #include <pthread.h> #include <stdbool.h> #include <stdint.h> + +#include "core/guest.h" /* guest_t (for thread_alloc_sp_el1) */ #include "syscall/abi.h" /* linux_user_pt_regs_t */ /* Maximum number of concurrent guest threads in one VM. */ @@ -34,6 +36,11 @@ typedef struct { pthread_t host_thread; /* macOS host thread running this vCPU */ uint64_t clear_child_tid; /* GVA for CLONE_CHILD_CLEARTID (0=none) */ uint64_t sp_el1; /* Per-thread EL1 stack top (IPA) */ + int sp_el1_slot; /* Slot index in sp_el1_allocated (-1 = none). + * Stored at alloc time so the free path does + * not need to recompute (top - sp) / 4096; the + * shim data block is now at high IPA and only + * known via guest_t. */ int active; /* Non-zero while thread is running. 
* Stays int (not bool) because lock-free paths in thread.c * use __atomic_load_n on this field; the 32-bit width keeps @@ -181,12 +188,15 @@ int thread_active_count(void); /* Fast path: return non-zero when exactly one guest thread is active. */ int thread_is_single_active(void); -/* Allocate a per-thread SP_EL1 value. Thread N gets the Nth 4KiB slot counting - * down from the top of the shim data region. The IPA base (GUEST_IPA_BASE + - * SHIM_DATA_BASE + 2MiB) is the main thread's SP_EL1; each subsequent thread - * subtracts 4KiB. Returns the IPA, or 0 on failure. +/* Allocate a per-thread SP_EL1 stack and record both the IPA and the slot + * index into t. Thread N gets the Nth 4KiB slot counting down from the top + * of the shim data block (g->shim_data_base + 2MiB). The shim block lives + * at high IPA computed by guest_init, so callers must pass g; the slot + * index is stored in t->sp_el1_slot so the free path (which is reached + * from teardown contexts that lack g) can clear the bitmask directly. + * Returns the SP_EL1 IPA, or 0 on slot exhaustion. */ -uint64_t thread_alloc_sp_el1(void); +uint64_t thread_alloc_sp_el1(const guest_t *g, thread_entry_t *t); /* Iterate over all active threads, calling fn(entry, ctx) for each. * Holds the thread table lock during iteration. diff --git a/src/syscall/exec.c b/src/syscall/exec.c index b52366c..ac0e9cc 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -488,7 +488,7 @@ int64_t sys_execve(hv_vcpu_t vcpu, const unsigned char *shim_ptr = proc_get_shim_blob(); unsigned int shim_size = proc_get_shim_size(); if (shim_ptr && shim_size > 0) { - memcpy((uint8_t *) g->host_base + SHIM_BASE, shim_ptr, shim_size); + memcpy((uint8_t *) g->host_base + g->shim_base, shim_ptr, shim_size); } /* Load the executable image that was validated before guest_reset(). */ @@ -501,6 +501,11 @@ int64_t sys_execve(hv_vcpu_t vcpu, exit(128); } + /* Track lowest loaded ELF address for the legacy fork IPC path + * after exec replaces the previous image (see guest_get_used_regions). + */ + g->elf_load_min = elf_info.load_min + elf_load_base; + /* If PT_INTERP was present, map the already-validated interpreter at the * exec-time interp_base. */ @@ -540,7 +545,7 @@ int64_t sys_execve(hv_vcpu_t vcpu, sys_icache_invalidate(host_addr, interp_info.segments[i].memsz); } } - sys_icache_invalidate((uint8_t *) g->host_base + SHIM_BASE, shim_size); + sys_icache_invalidate((uint8_t *) g->host_base + g->shim_base, shim_size); /* Reset brk to the first page after loaded executable data. */ uint64_t brk_start = PAGE_ALIGN_UP(elf_info.load_max + elf_load_base); @@ -574,16 +579,16 @@ int64_t sys_execve(hv_vcpu_t vcpu, /* Keep the shim executable-only; HVF faults on merged RWX mappings. */ if (nregions >= MAX_REGIONS) goto too_many_regions; - regions[nregions++] = (mem_region_t) {.gpa_start = SHIM_BASE, - .gpa_end = SHIM_BASE + shim_size, + regions[nregions++] = (mem_region_t) {.gpa_start = g->shim_base, + .gpa_end = g->shim_base + shim_size, .perms = MEM_PERM_RX}; /* EL1 exception handlers use this block for stack and scratch state. */ if (nregions >= MAX_REGIONS) goto too_many_regions; regions[nregions++] = - (mem_region_t) {.gpa_start = SHIM_DATA_BASE, - .gpa_end = SHIM_DATA_BASE + BLOCK_2MIB, + (mem_region_t) {.gpa_start = g->shim_data_base, + .gpa_end = g->shim_data_base + BLOCK_2MIB, .perms = MEM_PERM_RW}; /* The vDSO sits in the same 2MiB block as the shim. 
The page-table builder @@ -667,10 +672,10 @@ int64_t sys_execve(hv_vcpu_t vcpu, } /* Rebuild /proc/self/maps metadata in parallel with the new page tables. */ - guest_region_add(g, SHIM_BASE, SHIM_BASE + shim_size, + guest_region_add(g, g->shim_base, g->shim_base + shim_size, LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0, "[shim]"); - guest_region_add(g, SHIM_DATA_BASE, SHIM_DATA_BASE + BLOCK_2MIB, + guest_region_add(g, g->shim_data_base, g->shim_data_base + BLOCK_2MIB, LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0, "[shim-data]"); for (int i = 0; i < elf_info.num_segments; i++) { diff --git a/src/syscall/mem.c b/src/syscall/mem.c index 4787745..f1e3eb9 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -1185,10 +1185,13 @@ int64_t sys_mmap(guest_t *g, /* Reject MAP_FIXED targeting VM infrastructure: page table pool, * shim code, and shim data/stack regions. A guest must not be - * able to overwrite EL1 exception vectors or page tables. + * able to overwrite EL1 exception vectors or page tables. The + * reserve sits at high IPA (just below g->interp_base) so the + * range check uses the runtime fields rather than compile-time + * low-memory constants. */ uint64_t fix_end = off + length; - if (off < ELF_DEFAULT_BASE && fix_end > PT_POOL_BASE) + if (guest_range_hits_infra(g, off, fix_end)) return -LINUX_EINVAL; result_off = off; @@ -2414,7 +2417,7 @@ static int compare_range_pair(const void *a, const void *b) static int munmap_guest_range(guest_t *g, uint64_t unmap_off, uint64_t end) { /* Reject munmap targeting VM infrastructure regions. */ - if (unmap_off < ELF_DEFAULT_BASE && end > PT_POOL_BASE) + if (guest_range_hits_infra(g, unmap_off, end)) return -LINUX_EINVAL; /* Restore slab backing under any active MAP_SHARED file overlay before @@ -2565,7 +2568,7 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot) /* Reject mprotect targeting VM infrastructure (page tables, shim). * Matches the guard in sys_munmap. */ - if (mprot_off < ELF_DEFAULT_BASE && mprot_end > PT_POOL_BASE) + if (guest_range_hits_infra(g, mprot_off, mprot_end)) return -LINUX_EINVAL; guest_region_set_prot(g, mprot_off, mprot_end, prot); diff --git a/src/syscall/signal.c b/src/syscall/signal.c index 1ea952f..f38e4b2 100644 --- a/src/syscall/signal.c +++ b/src/syscall/signal.c @@ -1457,9 +1457,11 @@ int signal_rt_sigreturn(hv_vcpu_t vcpu, guest_t *g) * signal frame could redirect execution into EL1 code. * Must happen before GPR/SP/PSTATE restore so that a failed check * does not leave the vCPU with partially-attacker-controlled state. + * The infra reserve sits at high IPA (just below g->interp_base); + * use the runtime check rather than compile-time constants. 
*/ uint64_t restored_pc = frame.uc.uc_mcontext.pc; - if (restored_pc >= PT_POOL_BASE && restored_pc < ELF_DEFAULT_BASE) + if (guest_addr_in_infra(g, restored_pc)) return -LINUX_EFAULT; /* Restore all 31 GPRs */ diff --git a/tests/manifest.txt b/tests/manifest.txt index 49136eb..a54078a 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -80,6 +80,7 @@ test-signal-thread [section] Fork edge cases test-clone3 # diff=skip test-fork-exec $TESTDIR/echo-test +test-fork-lowbase [section] COW fork isolation tests test-cow-fork diff --git a/tests/test-fork-lowbase.c b/tests/test-fork-lowbase.c new file mode 100644 index 0000000..4e6e148 --- /dev/null +++ b/tests/test-fork-lowbase.c @@ -0,0 +1,95 @@ +/* Low-base nested fork regression test + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Exercises the legacy fork IPC path from a low-linked ET_EXEC. The child + * forks again, so the grandchild only runs correctly if the intermediate child + * preserved the executable's true low load address when cloning guest state. + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "test-harness.h" + +int passes = 0, fails = 0; + +static void test_binary_is_low_linked(void) +{ + TEST("binary linked below 0x400000"); + uintptr_t pc = (uintptr_t) &test_binary_is_low_linked; + EXPECT_TRUE(pc < 0x400000ULL, "test binary not linked at low address"); +} + +static void test_nested_fork_lowbase(void) +{ + TEST("nested fork from low-base ET_EXEC"); + + int pipefd[2]; + if (pipe(pipefd) != 0) { + FAIL("pipe() failed"); + return; + } + + pid_t child = fork(); + if (child < 0) { + close(pipefd[0]); + close(pipefd[1]); + FAIL("fork() failed"); + return; + } + + if (child == 0) { + close(pipefd[0]); + + pid_t grandchild = fork(); + if (grandchild < 0) + _exit(101); + + if (grandchild == 0) { + char ok = 'G'; + if (write(pipefd[1], &ok, 1) != 1) + _exit(102); + _exit(0); + } + + int status = 0; + if (waitpid(grandchild, &status, 0) != grandchild) + _exit(103); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) + _exit(104); + _exit(0); + } + + close(pipefd[1]); + + char ok = 0; + ssize_t n = read(pipefd[0], &ok, 1); + close(pipefd[0]); + + int status = 0; + if (waitpid(child, &status, 0) != child) { + FAIL("waitpid(child) failed"); + return; + } + + EXPECT_TRUE( + n == 1 && ok == 'G' && WIFEXITED(status) && WEXITSTATUS(status) == 0, + "grandchild did not complete from low-base nested fork"); +} + +int main(void) +{ + printf("test-fork-lowbase: starting\n"); + + test_binary_is_low_linked(); + test_nested_fork_lowbase(); + + SUMMARY("test-fork-lowbase"); + return fails ? 1 : 0; }
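--

Reviewer note, not part of the applied diff: the reserve-placement arithmetic in the commit message can be sanity-checked host-side with a standalone sketch. The program below recomputes interp_base for both supported IPA widths using the same size - 4 GiB formula as guest_init, copies INFRA_RESERVE from guest.h, and mirrors the guest_range_hits_infra overlap test. The file name, helper, and variables here are hypothetical, for illustration only.

/* infra-layout-check.c — standalone sketch, not part of this patch.
 * Recomputes the infra reserve bounds from the constants above and checks
 * the guard behaviour the commit message describes.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INFRA_RESERVE 0x00400000ULL /* 4 MiB, as in guest.h */

/* Mirrors guest_range_hits_infra: true iff [start, end) overlaps the
 * reserve [interp_base - INFRA_RESERVE, interp_base). */
static bool range_hits_infra(uint64_t interp_base, uint64_t start, uint64_t end)
{
    uint64_t lo = interp_base - INFRA_RESERVE;
    return start < interp_base && end > lo;
}

int main(void)
{
    /* 36-bit IPA: guest_size = 64 GiB, interp_base = size - 4 GiB = 60 GiB,
     * so the reserve sits at [60 GiB - 4 MiB, 60 GiB). */
    uint64_t interp36 = (1ULL << 36) - 0x100000000ULL;
    assert(interp36 == 60ULL << 30);

    /* 40-bit IPA: interp_base = 1 TiB - 4 GiB = 1020 GiB. */
    uint64_t interp40 = (1ULL << 40) - 0x100000000ULL;
    assert(interp40 == 1020ULL << 30);

    /* A low-linked ET_EXEC at 0x200000 no longer collides with the runtime. */
    assert(!range_hits_infra(interp36, 0x200000ULL, 0x400000ULL));

    /* MAP_FIXED into the reserve is still rejected, including the 64 KiB
     * null-guard slot at the bottom of the reserve. */
    uint64_t guard = interp36 - INFRA_RESERVE;
    assert(range_hits_infra(interp36, guard, guard + 0x10000ULL));

    printf("infra layout checks pass\n");
    return 0;
}

Building with any host C compiler (e.g. cc -o infra-layout-check infra-layout-check.c) and running it should print the success line; no hypervisor is needed, since only the address arithmetic is exercised.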