From 40a759e18d107b8b7ef78776da33a820cef97649 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 10 May 2026 23:37:30 +0800 Subject: [PATCH] Move runtime infrastructure out of low user-VA The page-table pool and EL1 shim previously sat at fixed low addresses [0x10000, 0x400000), colliding with low-linked ET_EXECs. Android linker64 binaries link at 0x200000 and the loader accepted them, but sys_mprotect, sys_munmap, sys_mmap with MAP_FIXED, and rt_sigreturn then rejected any operation on the overlapping pages with a bare EINVAL as soon as the binary tried to apply RELRO to its data segment. Relocate the page-table pool, shim code, and shim data into a 4 MiB reserve placed just below g->interp_base, in the dead zone between g->mmap_limit and g->interp_base. PT_POOL_BASE, SHIM_BASE, and SHIM_DATA_BASE become runtime guest_t fields computed by compute_infra_layout from guest_size; for 36-bit IPA the reserve sits at [60 GiB - 4 MiB, 60 GiB), for 40-bit IPA at [1020 GiB - 4 MiB, 1020 GiB). Two helpers, guest_range_hits_infra and guest_addr_in_infra, retarget the four infra guards to the new range without weakening their security intent. The 64 KiB null-guard slot at the bottom of the reserve is covered too, so guest mmap state cannot semantically reserve it either. Bump fork IPC to v9 to carry elf_load_min so nested forks from low-linked ET_EXECs see the actual load address rather than the legacy ELF_DEFAULT_BASE constant. Validate hdr.ipa_bits, hdr.guest_size, and the page-aligned in-pool location of hdr.pt_pool_next and hdr.ttbr0 in the child path before any size-derived arithmetic so a malformed header cannot underflow interp_base or misalign the page-table walker. Plumb guest_t through thread_alloc_sp_el1 and record the slot index in thread_entry_t so thread_free_sp_el1_locked can clear the bitmap from teardown contexts (thread_{deactivate,destroy_all_vcpus,ptrace_wait}) that lack a guest_t reference. Add tests/test-fork-lowbase.c, a static ET_EXEC linked at 0x200000 that exercises a nested fork. The grandchild only completes when the intermediate child preserves elf_load_min across the IPC handoff. --- Makefile | 7 ++ src/core/bootstrap.c | 28 +++++--- src/core/guest.c | 130 ++++++++++++++++++++++------------ src/core/guest.h | 144 ++++++++++++++++++++++++++++---------- src/core/shim.S | 44 ++++++------ src/runtime/fork-state.h | 3 +- src/runtime/forkipc.c | 58 ++++++++++++--- src/runtime/thread.c | 42 ++++++----- src/runtime/thread.h | 24 +++++-- src/syscall/exec.c | 21 +++--- src/syscall/mem.c | 11 +-- src/syscall/signal.c | 4 +- tests/manifest.txt | 1 + tests/test-fork-lowbase.c | 95 +++++++++++++++++++++++++ 14 files changed, 448 insertions(+), 164 deletions(-) create mode 100644 tests/test-fork-lowbase.c diff --git a/Makefile b/Makefile index 78c96bf..276b933 100644 --- a/Makefile +++ b/Makefile @@ -164,6 +164,13 @@ $(BUILD_DIR)/test-futex-waitv: tests/test-futex-waitv.c | $(BUILD_DIR) @echo " CROSS $< (with -lpthread)" $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread +# test-fork-lowbase must be a non-PIE ET_EXEC linked below ELF_DEFAULT_BASE so +# nested forks exercise elf_load_min preservation across fork IPC. 
+$(BUILD_DIR)/test-fork-lowbase: tests/test-fork-lowbase.c | $(BUILD_DIR) + @echo " CROSS $< (low-base ET_EXEC)" + $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -no-pie \ + -Wl,-Ttext-segment=0x200000 -o $@ $< + endif include mk/tests.mk diff --git a/src/core/bootstrap.c b/src/core/bootstrap.c index e7912fb..5bdd386 100644 --- a/src/core/bootstrap.c +++ b/src/core/bootstrap.c @@ -134,10 +134,10 @@ static bool build_boot_regions(mem_region_t *regions, * to the vDSO page when splitting the block; otherwise vdso_build cannot * write into it through guest_ptr. */ - if (!append_boot_region(regions, nregions, SHIM_BASE, - SHIM_BASE + shim_bin_len, MEM_PERM_RX) || - !append_boot_region(regions, nregions, SHIM_DATA_BASE, - SHIM_DATA_BASE + BLOCK_2MIB, MEM_PERM_RW) || + if (!append_boot_region(regions, nregions, g->shim_base, + g->shim_base + shim_bin_len, MEM_PERM_RX) || + !append_boot_region(regions, nregions, g->shim_data_base, + g->shim_data_base + BLOCK_2MIB, MEM_PERM_RW) || !append_boot_region(regions, nregions, VDSO_BASE, VDSO_BASE + VDSO_SIZE, MEM_PERM_RX)) { return false; @@ -234,6 +234,11 @@ int guest_bootstrap_prepare(guest_t *g, return -1; } + /* Track the lowest loaded ELF address so the legacy fork IPC path + * copies low-linked ET_EXECs (e.g. linked at 0x200000) in full. + */ + g->elf_load_min = boot->elf_info.load_min + boot->elf_load_base; + g->brk_base = PAGE_ALIGN_UP(boot->elf_info.load_max + boot->elf_load_base); if (g->brk_base < BRK_BASE_DEFAULT) g->brk_base = BRK_BASE_DEFAULT; @@ -252,15 +257,16 @@ int guest_bootstrap_prepare(guest_t *g, return -1; } - memcpy((uint8_t *) g->host_base + SHIM_BASE, shim_bin, shim_bin_len); + memcpy((uint8_t *) g->host_base + g->shim_base, shim_bin, shim_bin_len); log_debug("shim loaded at offset 0x%llx (%zu bytes)", - (unsigned long long) SHIM_BASE, shim_bin_len); + (unsigned long long) g->shim_base, shim_bin_len); invalidate_exec_segments(&boot->elf_info, g->host_base, boot->elf_load_base); invalidate_exec_segments(&boot->interp_info, g->host_base, boot->interp_base); - sys_icache_invalidate((uint8_t *) g->host_base + SHIM_BASE, shim_bin_len); + sys_icache_invalidate((uint8_t *) g->host_base + g->shim_base, + shim_bin_len); if (!build_boot_regions(regions, &nregions, g, boot, shim_bin_len)) { log_error("too many memory regions (%d >= %d)", nregions, @@ -275,10 +281,10 @@ int guest_bootstrap_prepare(guest_t *g, } g->need_tlbi = true; - guest_region_add(g, SHIM_BASE, SHIM_BASE + shim_bin_len, + guest_region_add(g, g->shim_base, g->shim_base + shim_bin_len, LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0, "[shim]"); - guest_region_add(g, SHIM_DATA_BASE, SHIM_DATA_BASE + BLOCK_2MIB, + guest_region_add(g, g->shim_data_base, g->shim_data_base + BLOCK_2MIB, LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0, "[shim-data]"); @@ -380,10 +386,10 @@ int guest_bootstrap_create_vcpu(guest_t *g, uint64_t sctlr; uint64_t sctlr_with_mmu; uint64_t tcr_value = TCR_EL1_VALUE; - uint64_t shim_ipa = guest_ipa(g, SHIM_BASE); + uint64_t shim_ipa = guest_ipa(g, g->shim_base); uint64_t entry_ipa = guest_ipa(g, boot->entry_point); uint64_t sp_ipa = guest_ipa(g, boot->stack_pointer); - uint64_t el1_sp = guest_ipa(g, SHIM_DATA_BASE + BLOCK_2MIB); + uint64_t el1_sp = guest_ipa(g, g->shim_data_base + BLOCK_2MIB); hv_vcpu_t vcpu; hv_vcpu_exit_t *vexit; diff --git a/src/core/guest.c b/src/core/guest.c index 853ab13..e9af7f8 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -118,6 +118,36 @@ static void guest_region_clip_overlay(guest_region_t *r) 
r->overlay_end = overlay_end; } +/* Compute infra reserve placement from guest_size and store derived fields in + * @g. Called from guest_init and guest_init_from_shm. + * + * Layout: a 4MiB region anchored at [interp_base - INFRA_RESERVE, interp_base) + * sits in the dead zone between mmap_limit and interp_base. PT pool, shim, and + * shim data fall at fixed offsets within the reserve (see guest.h). + * + * Returns 0 on success, -1 if the layout cannot be derived (interp_base too + * small to fit the reserve). Today guest_init enforces a 64GiB minimum so the + * underflow path is unreachable, but the explicit check guards future + * configurations and any IPC restore that bypasses size selection. + */ +static int compute_infra_layout(guest_t *g) +{ + if (g->interp_base < INFRA_RESERVE) { + log_error( + "guest: interp_base 0x%llx smaller than INFRA_RESERVE (0x%llx); " + "guest_size too small", + (unsigned long long) g->interp_base, + (unsigned long long) INFRA_RESERVE); + return -1; + } + uint64_t infra_base = g->interp_base - INFRA_RESERVE; + g->pt_pool_base = infra_base + INFRA_PT_POOL_OFF; + g->pt_pool_end = infra_base + INFRA_PT_POOL_END_OFF; + g->shim_base = infra_base + INFRA_SHIM_OFF; + g->shim_data_base = infra_base + INFRA_SHIM_DATA_OFF; + return 0; +} + /* Allocate a zeroed 4KiB page from the page table pool. * Returns GPA of the page, or 0 on pool exhaustion. * Acquires pt_lock internally. Caller typically holds mmap_lock. @@ -125,12 +155,12 @@ static void guest_region_clip_overlay(guest_region_t *r) static uint64_t pt_alloc_page(guest_t *g) { pthread_mutex_lock(&pt_lock); - if (g->pt_pool_next + PAGE_SIZE > PT_POOL_END) { + if (g->pt_pool_next + PAGE_SIZE > g->pt_pool_end) { log_error( "guest: page table pool exhausted " "(used %llu / %llu bytes)", - (unsigned long long) (g->pt_pool_next - PT_POOL_BASE), - (unsigned long long) (PT_POOL_END - PT_POOL_BASE)); + (unsigned long long) (g->pt_pool_next - g->pt_pool_base), + (unsigned long long) (g->pt_pool_end - g->pt_pool_base)); pthread_mutex_unlock(&pt_lock); return 0; } @@ -138,8 +168,8 @@ static uint64_t pt_alloc_page(guest_t *g) g->pt_pool_next += PAGE_SIZE; /* Warn at 80% pool usage so users can anticipate exhaustion */ - uint64_t used = gpa + PAGE_SIZE - PT_POOL_BASE; - uint64_t total = PT_POOL_END - PT_POOL_BASE; + uint64_t used = gpa + PAGE_SIZE - g->pt_pool_base; + uint64_t total = g->pt_pool_end - g->pt_pool_base; if (!pt_pool_warned && used > (total * 4 / 5)) { log_debug( "guest: page table pool at %llu%% " @@ -149,8 +179,8 @@ static uint64_t pt_alloc_page(guest_t *g) pt_pool_warned = true; } - /* Zero the page while still holding the lock so no other thread can - * observe a partially-zeroed page table page. + /* Zero the page while still holding the lock so no other thread can observe + * a partially-zeroed page table page. */ memset((uint8_t *) g->host_base + gpa, 0, PAGE_SIZE); pthread_mutex_unlock(&pt_lock); @@ -170,15 +200,15 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) memset(g, 0, sizeof(*g)); g->shm_fd = -1; g->ipa_base = GUEST_IPA_BASE; - g->pt_pool_next = PT_POOL_BASE; + g->elf_load_min = ELF_DEFAULT_BASE; g->brk_base = BRK_BASE_DEFAULT; g->brk_current = BRK_BASE_DEFAULT; g->mmap_next = MMAP_BASE; g->mmap_rx_next = MMAP_RX_BASE; - /* Query the maximum IPA size supported by the hardware/kernel. macOS 15+ - * on Apple Silicon reports 40 bits (1TiB). Older versions or fallback - * yields 36 bits (64GiB). + /* Query the maximum IPA size supported by the hardware/kernel. 
macOS 15+ on + * Apple Silicon reports 40 bits (1TiB). Older versions or fallback yields + * 36 bits (64GiB). */ uint32_t max_ipa = 0; hv_vm_config_get_max_ipa_size(&max_ipa); @@ -216,6 +246,9 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) */ g->interp_base = g->guest_size - 0x100000000ULL; g->mmap_limit = g->guest_size - 0x200000000ULL; + if (compute_infra_layout(g) < 0) + return -1; + g->pt_pool_next = g->pt_pool_base; /* Reserve primary address space via mmap(MAP_ANON). macOS demand-pages * this: physical pages are allocated only on first touch, so reserving up @@ -238,12 +271,12 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) * The child maps it MAP_PRIVATE, giving it an instant copy-on-write * clone of all guest memory. * - * macOS rejects MAP_PRIVATE on shm_open objects (EINVAL), but regular - * file fds support MAP_SHARED, MAP_PRIVATE, and MAP_PRIVATE|MAP_FIXED - * correctly. The file is unlinked immediately; the fd keeps it alive. - * macOS demand-pages file mappings, so untouched pages cost nothing. - * If any step fails, guest memory silently keeps the MAP_ANON mapping and - * falls back to the IPC region-copy path on fork. + * macOS rejects MAP_PRIVATE on shm_open objects (EINVAL), but regular file + * fds support MAP_SHARED, MAP_PRIVATE, and MAP_PRIVATE|MAP_FIXED correctly. + * The file is unlinked immediately; the fd keeps it alive. macOS + * demand-pages file mappings, so untouched pages cost nothing. If any step + * fails, guest memory silently keeps the MAP_ANON mapping and falls back to + * the IPC region-copy path on fork. */ { char tmppath[] = "/tmp/elfuse-XXXXXX"; @@ -322,6 +355,11 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) g->guest_size = size; g->interp_base = size - 0x100000000ULL; g->mmap_limit = size - 0x200000000ULL; + if (compute_infra_layout(g) < 0) { + hv_vm_destroy(); + return -1; + } + g->pt_pool_next = g->pt_pool_base; g->host_base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); if (g->host_base == MAP_FAILED) { @@ -362,7 +400,7 @@ int guest_init_from_shm(guest_t *g, memset(g, 0, sizeof(*g)); g->shm_fd = -1; /* Child does not own the shm */ g->ipa_base = GUEST_IPA_BASE; - g->pt_pool_next = PT_POOL_BASE; + g->elf_load_min = ELF_DEFAULT_BASE; g->brk_base = BRK_BASE_DEFAULT; g->brk_current = BRK_BASE_DEFAULT; g->mmap_next = MMAP_BASE; @@ -373,6 +411,9 @@ int guest_init_from_shm(guest_t *g, /* Compute layout limits (same formula as guest_init) */ g->interp_base = size - 0x100000000ULL; g->mmap_limit = size - 0x200000000ULL; + if (compute_infra_layout(g) < 0) + return -1; + g->pt_pool_next = g->pt_pool_base; /* Map the shm fd MAP_PRIVATE: copy-on-write semantics. Reads see * the parent's frozen snapshot; writes are private to this process. 
@@ -836,21 +877,21 @@ void guest_reset(guest_t *g) } /* Zero page table pool (not tracked in region array) */ - if (g->pt_pool_next > PT_POOL_BASE) - memset((uint8_t *) g->host_base + PT_POOL_BASE, 0, - g->pt_pool_next - PT_POOL_BASE); + if (g->pt_pool_next > g->pt_pool_base) + memset((uint8_t *) g->host_base + g->pt_pool_base, 0, + g->pt_pool_next - g->pt_pool_base); /* Zero shim code + data (not tracked in region array by guest_reset * callers; shim regions are added AFTER reset by the exec path) */ - memset((uint8_t *) g->host_base + SHIM_BASE, 0, - SHIM_DATA_BASE + BLOCK_2MIB - SHIM_BASE); + memset((uint8_t *) g->host_base + g->shim_base, 0, + g->shim_data_base + BLOCK_2MIB - g->shim_base); /* Reset allocation state */ guest_pt_gen_bump(g); guest_tlb_flush(); __atomic_store_n(&pt_pool_warned, false, __ATOMIC_RELAXED); - g->pt_pool_next = PT_POOL_BASE; + g->pt_pool_next = g->pt_pool_base; g->brk_base = BRK_BASE_DEFAULT; g->brk_current = BRK_BASE_DEFAULT; g->mmap_next = MMAP_BASE; @@ -861,6 +902,7 @@ void guest_reset(guest_t *g) g->mmap_rx_gap_hint = 0; g->ttbr0 = 0; g->need_tlbi = false; + g->elf_load_min = ELF_DEFAULT_BASE; /* Clear semantic region tracking (will be re-populated after exec) */ guest_region_clear(g); @@ -875,34 +917,36 @@ int guest_get_used_regions(const guest_t *g, { int n = 0; - /* Page table pool */ - if (n < max && g->pt_pool_next > PT_POOL_BASE) { - out[n].offset = PT_POOL_BASE; - out[n].size = g->pt_pool_next - PT_POOL_BASE; + /* Page table pool (high IPA, just below interp_base) */ + if (n < max && g->pt_pool_next > g->pt_pool_base) { + out[n].offset = g->pt_pool_base; + out[n].size = g->pt_pool_next - g->pt_pool_base; n++; } - /* Shim code */ + /* Shim code (high IPA) */ if (n < max && shim_size > 0) { - out[n].offset = SHIM_BASE; + out[n].offset = g->shim_base; out[n].size = shim_size; n++; } - /* Shim data/stack (full 2MiB block) */ + /* Shim data/stack (full 2MiB block, high IPA) */ if (n < max) { - out[n].offset = SHIM_DATA_BASE; + out[n].offset = g->shim_data_base; out[n].size = BLOCK_2MIB; n++; } - /* ELF + brk region: from ELF_DEFAULT_BASE to brk_current. - * guest memory does not track the exact ELF load range, but static musl - * binaries always load at or above ELF_DEFAULT_BASE (0x400000). + /* ELF + brk region: from elf_load_min (set by ELF loader) to brk_current. + * The lower bound is the actual ELF load address, not ELF_DEFAULT_BASE: + * ET_EXECs linked below 0x400000 (e.g. at 0x200000) have segments below the + * legacy default and would otherwise be silently dropped from the legacy + * fork-IPC copy. */ - if (n < max && g->brk_current > ELF_DEFAULT_BASE) { - out[n].offset = ELF_DEFAULT_BASE; - out[n].size = g->brk_current - ELF_DEFAULT_BASE; + if (n < max && g->brk_current > g->elf_load_min) { + out[n].offset = g->elf_load_min; + out[n].size = g->brk_current - g->elf_load_min; n++; } @@ -1167,9 +1211,9 @@ void guest_region_remove(guest_t *g, uint64_t start, uint64_t end) right->offset += (end - r->start); right->start = end; if (r->backing_fd >= 0) { - /* A dup failure leaves backing_fd=-1, silently converting - * this half to anonymous semantics (msync and MADV_DONTNEED - * skip regions with backing_fd<0). Propagating the error would + /* A dup failure leaves backing_fd=-1, silently converting this + * half to anonymous semantics (msync and MADV_DONTNEED skip + * regions with backing_fd<0). Propagating the error would * require making all region split callers (mprotect, munmap) * fallible. 
*/ @@ -1513,8 +1557,8 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n) uint64_t *l0 = pt_at(g, l0_gpa); /* For each region, determine which 2MiB blocks need mapping. - * Identity-mapped: VA == GPA, so L0/L1/L2 indices and the block - * descriptor output address are both derived from gpa_start + ipa_base. + * Identity-mapped: VA == GPA, so L0/L1/L2 indices and the block descriptor + * output address are both derived from gpa_start + ipa_base. */ for (int r = 0; r < n; r++) { uint64_t gpa_start = ALIGN_2MIB_DOWN(regions[r].gpa_start); diff --git a/src/core/guest.h b/src/core/guest.h index 7425d01..c087d89 100644 --- a/src/core/guest.h +++ b/src/core/guest.h @@ -26,16 +26,35 @@ /* Memory layout constants. * - * Guest memory size is determined dynamically from the VM's IPA width - * (36-bit = 64GiB on M2, 40-bit = 1TiB on M3+). See guest.c for the - * runtime probe that selects the correct size. - */ - -#define PT_POOL_BASE 0x00010000ULL /* Page table pool start */ -#define PT_POOL_END 0x00100000ULL /* Page table pool end (960KiB) */ -#define SHIM_BASE 0x00100000ULL /* Shim code (2MiB block, RX) */ -#define SHIM_DATA_BASE 0x00200000ULL /* Shim stack/data (2MiB block, RW) */ -#define ELF_DEFAULT_BASE 0x00400000ULL /* Typical ELF load base */ + * Guest memory size is determined dynamically from the VM's IPA width (36-bit + * = 64GiB on M2, 40-bit = 1TiB on M3+). See guest.c for the runtime probe that + * selects the correct size. + * + * Infrastructure layout (page-table pool, shim code, shim data): a 4MiB reserve + * placed just below g->interp_base, in the dead zone between g->mmap_limit and + * g->interp_base. The exact base is computed at guest_init time and stored in + * guest_t.pt_pool_base / pt_pool_end / shim_base / shim_data_base. EL0 user + * binaries are therefore free to load at low addresses (down to 64KiB) without + * colliding with the runtime. + * + * Internal layout within the 4MiB reserve: + * +0x000000 .. +0x010000 unused (64KiB null guard) + * +0x010000 .. +0x100000 page-table pool (960KiB, RW) + * +0x100000 .. +0x200000 shim code slot (1MiB, RX). Sits in the same + * 2MiB L2 block as the PT pool, so that block + * is split into 4KiB L3 pages (mixed RX/RW). + * +0x200000 .. +0x400000 shim data + EL1 stack (full 2MiB L2 block, RW) + */ + +/* Total size of the runtime infrastructure reserve. Shifted to + * [g->interp_base - INFRA_RESERVE, g->interp_base) at guest_init. + */ +#define INFRA_RESERVE 0x00400000ULL /* 4MiB */ +#define INFRA_PT_POOL_OFF 0x00010000ULL /* offset of PT pool */ +#define INFRA_PT_POOL_END_OFF 0x00100000ULL /* PT pool end (960KiB) */ +#define INFRA_SHIM_OFF 0x00100000ULL /* offset of shim code slot */ +#define INFRA_SHIM_DATA_OFF 0x00200000ULL /* offset of shim data slot */ +#define ELF_DEFAULT_BASE 0x00400000ULL /* Typical ELF load base */ #define PIE_LOAD_BASE 0x00400000ULL /* PIE (ET_DYN) executable base (4MiB) */ #define BRK_BASE_DEFAULT 0x01000000ULL /* Default brk start (16MiB) */ @@ -46,36 +65,36 @@ #define STACK_TOP_DEFAULT 0x08000000ULL #define STACK_GUARD_SIZE 0x00001000ULL /* 4KiB guard at stack bottom */ -/* mmap RX region for PROT_EXEC; placed below 8GiB to leave the high mmap - * region clear for runtimes that demand a specific minimum heap address. +/* mmap RX region for PROT_EXEC; placed below 8GiB to leave the high mmap region + * clear for runtimes that demand a specific minimum heap address. */ #define MMAP_RX_BASE 0x10000000ULL -/* Initial pre-mapped mmap RX end. 
Only covers the first 2MiB block; - * additional pages are mapped lazily by guest_extend_page_tables() - * when sys_mmap needs more PROT_EXEC space. Reduces startup time - * and memory pressure for small binaries that never call mmap. +/* Initial pre-mapped mmap RX end. Only covers the first 2MiB block; additional + * pages are mapped lazily by guest_extend_page_tables() when sys_mmap needs + * more PROT_EXEC space. Reduces startup time and memory pressure for small + * binaries that never call mmap. */ #define MMAP_RX_INITIAL_END (MMAP_RX_BASE + 0x200000ULL) /* +2MiB */ /* mmap RW region starts at 8GiB to match real Linux address layouts. */ #define MMAP_BASE 0x200000000ULL -/* Initial pre-mapped mmap RW end. Only covers the first 2MiB block; - * additional pages are mapped lazily by guest_extend_page_tables(). +/* Initial pre-mapped mmap RW end. Only covers the first 2MiB block; additional + * pages are mapped lazily by guest_extend_page_tables(). */ #define MMAP_INITIAL_END (MMAP_BASE + 0x200000ULL) /* +2MiB */ -/* mmap_limit and interp_base are computed dynamically from guest_size - * in main.c and stored in guest_t. +/* mmap_limit and interp_base are computed dynamically from guest_size in main.c + * and stored in guest_t. */ #define BLOCK_2MIB (2ULL * 1024 * 1024) /* IPA base: guest memory is mapped at this IPA in the hypervisor. * All guest physical addresses = GUEST_IPA_BASE + offset. - * Must be 0 so that guest virtual addresses match ELF link addresses - * (e.g. 0x400000). A non-zero IPA base would require all ELF binaries - * to be linked at IPA_BASE+vaddr, which is impractical. + * Must be 0 so that guest virtual addresses match ELF link addresses (e.g. + * 0x400000). A non-zero IPA base would require all ELF binaries to be linked at + * IPA_BASE+vaddr, which is impractical. */ #define GUEST_IPA_BASE 0x0ULL @@ -100,10 +119,10 @@ typedef struct { /* Maximum number of tracked memory regions (heap/stack/mmap/ELF/etc.). * Adjacent anonymous regions with matching permissions are automatically - * coalesced (see regions_mergeable in core/guest.c). Threaded runtimes - * create many thread stacks with guard pages; with coalescing, typical - * workloads use ~50 regions. 4096 provides ample headroom for edge cases - * (many interleaved guard pages, file-backed mappings, etc.). + * coalesced (see regions_mergeable in core/guest.c). Threaded runtimes create + * many thread stacks with guard pages; with coalescing, typical workloads use + * ~50 regions. 4096 provides ample headroom for edge cases (many interleaved + * guard pages, file-backed mappings, etc.). */ #define GUEST_MAX_REGIONS 4096 @@ -141,14 +160,16 @@ typedef struct { bool shared; /* MAP_SHARED (writes should propagate) */ bool noreserve; /* MAP_NORESERVE: PTEs deferred until fault */ bool overlay_active; /* Region has a live host MAP_FIXED|MAP_SHARED overlay - * of backing_fd at host_base+start. The kernel's - * page cache keeps it coherent with the file and - * with peer overlays of the same file, so msync - * skips the snapshot-style pwrite-the-diff and - * refresh-from-file paths for these regions. */ + * of backing_fd at host_base+start. The kernel's page + * cache keeps it coherent with the file and with peer + * overlays of the same file, so msync skips the + * snapshot-style pwrite-the-diff and refresh-from-file + * paths for these regions. + */ uint64_t overlay_start; /* Host-page-aligned overlay start. May extend * outside [start, end) when only part of a host - * page is guest-visible. */ + * page is guest-visible. 
+ */ uint64_t overlay_end; /* Host-page-aligned overlay end (exclusive). */ char name[64]; /* Label: "[heap]", "[stack]", ELF path, etc. */ } guest_region_t; @@ -162,12 +183,30 @@ typedef struct { uint64_t ipa_base; /* IPA base for hv_vm_map (GUEST_IPA_BASE) */ uint64_t mmap_limit; /* Max mmap address (computed from guest_size) */ - uint64_t interp_base; /* Dynamic linker load base (from guest_size) */ + uint64_t interp_base; /* Dynamic linker load base (from guest_size) */ + + /* Runtime-infrastructure reserve. Computed at guest_init time and placed at + * [interp_base - INFRA_RESERVE, interp_base). All four values are derived + * from the same base, so the inequalities + * pt_pool_base < pt_pool_end <= shim_base < shim_data_base + * always hold, and shim_data_base + BLOCK_2MIB == interp_base. + */ + uint64_t pt_pool_base; /* Page-table pool start (high IPA) */ + uint64_t pt_pool_end; /* Page-table pool end (exclusive) */ + uint64_t shim_base; /* Shim code (2MiB block, RX) */ + uint64_t shim_data_base; /* Shim stack/data (2MiB block, RW) */ + uint64_t pt_pool_next; /* Next free page table page in pool */ - uint64_t brk_base; /* Initial brk (set after ELF load) */ - uint64_t brk_current; /* Current brk position */ - uint64_t stack_base; /* Bottom of stack region (dynamic, above brk) */ - uint64_t stack_top; /* Top of stack (stack grows down from here) */ + /* Lowest virtual address of the loaded ELF (executable image, not the + * dynamic linker). Set by bootstrap and re-set by execve. Used by the + * legacy fork IPC path to bound the ELF + brk copy chunk; it must cover + * ET_EXECs linked below ELF_DEFAULT_BASE (e.g. 0x200000). + */ + uint64_t elf_load_min; + uint64_t brk_base; /* Initial brk (set after ELF load) */ + uint64_t brk_current; /* Current brk position */ + uint64_t stack_base; /* Bottom of stack region (dynamic, above brk) */ + uint64_t stack_top; /* Top of stack (stack grows down from here) */ uint64_t mmap_next; /* RW mmap high-water mark for fork IPC snapshots */ uint64_t mmap_end; /* Current page-table-covered RW mmap limit */ @@ -220,6 +259,35 @@ static inline uint64_t guest_ipa(const guest_t *g, uint64_t offset) return g->ipa_base + offset; } +/* True iff [start, end) overlaps the runtime infra reserve + * [interp_base - INFRA_RESERVE, interp_base). Covers the full 4 MiB + * reserve including the 64 KiB null-guard slot at the bottom (which + * has no PT entries but must not become semantically reachable from + * guest mmap state). Used by sys_mmap (MAP_FIXED), sys_munmap, and + * sys_mprotect to reject guest attempts to touch page tables, shim + * code, or shim data through the syscall surface. + */ +static inline bool guest_range_hits_infra(const guest_t *g, + uint64_t start, + uint64_t end) +{ + uint64_t infra_lo = g->interp_base - INFRA_RESERVE; + uint64_t infra_hi = g->interp_base; + return start < infra_hi && end > infra_lo; +} + +/* True iff a single address (PC, hint, etc.) falls inside the infra reserve. + * Used by rt_sigreturn to reject forged frames that would redirect EL0 PC into + * EL1 shim or page-table memory. Covers the full 4 MiB reserve, matching + * guest_range_hits_infra. + */ +static inline bool guest_addr_in_infra(const guest_t *g, uint64_t addr) +{ + uint64_t infra_lo = g->interp_base - INFRA_RESERVE; + uint64_t infra_hi = g->interp_base; + return addr >= infra_lo && addr < infra_hi; +} + /* API */ /* Allocate guest memory, create VM, map to hypervisor. 
diff --git a/src/core/shim.S b/src/core/shim.S index fe82f9a..62328d3 100644 --- a/src/core/shim.S +++ b/src/core/shim.S @@ -4,11 +4,12 @@ * Copyright 2025 Moritz Angermann, zw3rk pte. ltd. * SPDX-License-Identifier: Apache-2.0 * - * Loaded at SHIM_BASE (0x100000). Runs at EL1. - * All system registers (VBAR, MAIR, TCR, TTBR0, SCTLR, etc.) are - * configured by the host before vCPU start. The shim entry point - * transitions to EL0 via ERET. Exception vectors handle SVC #0 - * (Linux syscall) forwarding to the host via HVC #5. + * Loaded at g->shim_base (inside the 4MiB infra reserve placed just below + * g->interp_base, computed at guest_init time). Runs at EL1. + * All system registers (VBAR, MAIR, TCR, TTBR0, SCTLR, etc.) are configured by + * the host before vCPU start. The shim entry point transitions to EL0 via ERET. + * Exception vectors handle SVC #0 (Linux syscall) forwarding to the host via + * HVC #5. * * HVC protocol: * #0 Normal exit (x0 = exit code) @@ -141,10 +142,10 @@ .globl _start _start: /* Host has configured all system registers EXCEPT SCTLR.M (MMU enable). - * Apple's Hypervisor.framework requires the MMU to be enabled DURING - * vCPU execution (via HVC #4), not before hv_vcpu_run(). Setting - * SCTLR.M=1 via hv_vcpu_set_sys_reg before start causes permission - * faults on the first instruction fetch. + * Apple's Hypervisor.framework requires the MMU to be enabled DURING vCPU + * execution (via HVC #4), not before hv_vcpu_run(). Setting SCTLR.M=1 via + * hv_vcpu_set_sys_reg before start causes permission faults on the first + * instruction fetch. * * Host passes the final SCTLR value (with M=1) in X0 before start. */ @@ -173,8 +174,8 @@ _start: * * bad_exception vectors: mov x5, #offset + b bad_exception * X5 carries the vector offset for host-side debugging. - * This is safe because bad_exception halts, so no register - * preservation needed. + * This is safe because bad_exception halts, so no register preservation + * needed. * * svc_handler vectors: b svc_handler (NO mov x5!) * These MUST NOT clobber any GPR before svc_handler saves them. The Linux @@ -327,14 +328,14 @@ handle_sysreg_trap: b 3f 4: /* System instruction: trapped cache/data operations. - * DC ZVA (Data Cache Zero by VA) is trapped when HCR_EL2.TDZ=1 - * despite SCTLR.DZE=1. DC ZVA zeroes a cache-line-sized block (64 bytes on - * Apple Silicon). JIT translators and libc use it as fast memset(0). + * DC ZVA (Data Cache Zero by VA) is trapped when HCR_EL2.TDZ=1 despite + * SCTLR.DZE=1. DC ZVA zeroes a cache-line-sized block (64 bytes on Apple + * Silicon). JIT translators and libc use it as fast memset(0). * Failure to emulate leaves stale data. * - * DC CVAU, IC IVAU, and other cache maintenance instructions trap - * here despite SCTLR.UCI=1 (HVF sets HCR_EL2.TPU=1). The shim executes - * IC IALLU as safety net for I-cache coherency after any of these. + * DC CVAU, IC IVAU, and other cache maintenance instructions trap here + * despite SCTLR.UCI=1 (HVF sets HCR_EL2.TPU=1). The shim executes IC IALLU + * as safety net for I-cache coherency after any of these. * * ISS encoding for system instructions: * Op0[21:20] Op2[19:17] Op1[16:14] CRn[13:10] Rt[9:5] CRm[4:1] Dir[0] @@ -372,9 +373,9 @@ handle_sysreg_trap: stp xzr, xzr, [x11, #48] 6: /* Forward to host for counting, then I-cache maintenance */ -5: /* Pass the source register value to the host. Cache maintenance - * instructions ignore it, but MSR writes (notably TPIDR_EL0 for - * userspace TLS setup) need the original Xt value. 
+5: /* Pass the source register value to host. Cache maintenance instructions + * ignore it, but MSR writes (notably TPIDR_EL0 for userspace TLS setup) + * need the original Xt value. */ ubfx x10, x9, #5, #5 /* Rt = ISS[9:5] */ cmp x10, #31 @@ -400,7 +401,6 @@ handle_sysreg_trap: * 1. Permission fault (IFSC[5:2]=0x03): W^X demand toggle for JIT. * Code pages are initially RW; execution triggers permission fault. * Ask host to flip to RX via HVC #9, flush TLB, retry. - * * 2. Translation fault (IFSC[5:2]=0x01) or other non-permission fault: Address * not mapped in page tables. A real Linux kernel delivers SIGSEGV * (si_code=SEGV_MAPERR) to the process. JIT translators may use SIGSEGV for @@ -430,11 +430,9 @@ handle_inst_abort: * Three cases: * 1. Write permission fault (WnR=1, DFSC[5:2]=0x03): W^X toggle. * Code page is RX; JIT write triggers permission fault. Flip to RW. - * * 2. Read/write translation fault (DFSC[5:2]=0x01): Address not mapped. * Deliver SIGSEGV (SEGV_MAPERR) via host. Programs may use SIGSEGV * handlers for lazy allocation or guard pages. - * * 3. Read permission fault (WnR=0, DFSC[5:2]=0x03): Permission denied. * Deliver SIGSEGV (SEGV_ACCERR) via host. */ diff --git a/src/runtime/fork-state.h b/src/runtime/fork-state.h index c8721fb..4b45c61 100644 --- a/src/runtime/fork-state.h +++ b/src/runtime/fork-state.h @@ -18,7 +18,7 @@ /* Magic values for IPC frame delimiters */ #define IPC_MAGIC_HEADER 0x454C464BU /* "ELFK" */ #define IPC_MAGIC_SENTINEL 0x454C4F4BU /* "ELOK" */ -#define IPC_VERSION 8 /* v8: session / process group state */ +#define IPC_VERSION 9 /* v9: preserve elf_load_min */ typedef struct { uint32_t magic; @@ -27,6 +27,7 @@ typedef struct { uint32_t has_shm; int64_t child_pid, parent_pid; uint64_t guest_size; + uint64_t elf_load_min; uint64_t brk_base, brk_current; uint64_t stack_base; uint64_t stack_top; diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index 0803503..d115cb3 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -110,6 +110,32 @@ int fork_child_main(int ipc_fd, absock_set_namespace_id(hdr.absock_namespace_id); proc_set_session(hdr.sid, hdr.pgid); + /* Validate header layout fields before any size-derived arithmetic. + * guest_init / guest_init_from_shm derive interp_base, mmap_limit, and + * the high-IPA infra reserve from these inputs; underflow on tiny or + * malformed values would place pt_pool_base and friends near UINT64_MAX, + * which then feeds unchecked host-buffer offsets in pt_alloc_page and + * pt_at. Reject impossible layouts up front. + * + * Lower bound: guest_size must leave room for both mmap_limit + * (size - 8 GiB) and interp_base (size - 4 GiB) plus the 4 MiB infra + * reserve below it. 8 GiB satisfies all three with margin. + * Upper bound: guest_size must fit in the negotiated IPA width. + * IPA bits: 36 (Apple M2) and 40 (M3+) are the supported widths. + */ + if (hdr.ipa_bits < 36 || hdr.ipa_bits > 40) { + log_error("fork-child: invalid ipa_bits %u", (unsigned) hdr.ipa_bits); + close(ipc_fd); + return 1; + } + if (hdr.guest_size < 0x200000000ULL || + hdr.guest_size > (1ULL << hdr.ipa_bits)) { + log_error("fork-child: invalid guest_size 0x%llx (ipa_bits=%u)", + (unsigned long long) hdr.guest_size, (unsigned) hdr.ipa_bits); + close(ipc_fd); + return 1; + } + /* Create guest memory before receiving state so all incoming offsets can be * bounds-checked against the negotiated guest size. 
*/ @@ -144,11 +170,20 @@ int fork_child_main(int ipc_fd, } /* Restore allocator/page-table cursors before mmap/brk can run in child. - * Validate pt_pool_next and ttbr0: both must reside within the page table - * pool [PT_POOL_BASE, PT_POOL_END). Accepting out-of-range values from IPC - * would corrupt page table allocation or translation walks. + * Validate pt_pool_next and ttbr0 against the child's own page-table + * pool, which the child just computed from hdr.guest_size + + * hdr.ipa_bits via compute_infra_layout. + * + * Range alone is not enough: pt_alloc_page advances pt_pool_next in + * GUEST_PAGE_SIZE quanta, and pt_at converts page-table GPAs straight + * into host-buffer pointers. An unaligned value passes the [base, end) + * gate but then misaligns the walker. Require: + * - pt_pool_next page-aligned relative to pt_pool_base + * - ttbr0 strictly inside the in-use pool [pt_pool_base, pt_pool_next) + * (parent must have allocated the L0 page) and page-aligned. */ - if (hdr.pt_pool_next < PT_POOL_BASE || hdr.pt_pool_next > PT_POOL_END) { + if (hdr.pt_pool_next < g.pt_pool_base || hdr.pt_pool_next > g.pt_pool_end || + ((hdr.pt_pool_next - g.pt_pool_base) % GUEST_PAGE_SIZE) != 0) { log_error("fork-child: invalid pt_pool_next 0x%llx", (unsigned long long) hdr.pt_pool_next); guest_destroy(&g); @@ -156,7 +191,8 @@ int fork_child_main(int ipc_fd, return 1; } uint64_t ttbr0_off = hdr.ttbr0 - g.ipa_base; - if (ttbr0_off < PT_POOL_BASE || ttbr0_off >= PT_POOL_END) { + if (ttbr0_off < g.pt_pool_base || ttbr0_off >= hdr.pt_pool_next || + ((ttbr0_off - g.pt_pool_base) % GUEST_PAGE_SIZE) != 0) { log_error("fork-child: invalid ttbr0 0x%llx", (unsigned long long) hdr.ttbr0); guest_destroy(&g); @@ -165,6 +201,7 @@ int fork_child_main(int ipc_fd, } g.brk_base = hdr.brk_base; g.brk_current = hdr.brk_current; + g.elf_load_min = hdr.elf_load_min; g.stack_base = hdr.stack_base; g.stack_top = hdr.stack_top; g.mmap_next = hdr.mmap_next; @@ -379,13 +416,12 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu, if (current_thread) t->blocked = current_thread->blocked; - /* Allocate per-thread EL1 stack */ - uint64_t child_sp_el1 = thread_alloc_sp_el1(); + /* Allocate per-thread EL1 stack (records both sp and slot in t). */ + uint64_t child_sp_el1 = thread_alloc_sp_el1(g, t); if (child_sp_el1 == 0) { thread_deactivate(t); return -LINUX_ENOMEM; } - t->sp_el1 = child_sp_el1; /* Capture parent register state before spawning worker. * HVF binds vCPU to the creating thread, so the worker must call @@ -656,13 +692,12 @@ static int64_t sys_clone_vm(hv_vcpu_t parent_vcpu, if (current_thread) t->blocked = current_thread->blocked; - /* Allocate per-thread EL1 stack */ - uint64_t child_sp_el1 = thread_alloc_sp_el1(); + /* Allocate per-thread EL1 stack (records both sp and slot in t). 
*/ + uint64_t child_sp_el1 = thread_alloc_sp_el1(g, t); if (child_sp_el1 == 0) { thread_deactivate(t); return -LINUX_ENOMEM; } - t->sp_el1 = child_sp_el1; /* Capture parent register state */ uint64_t parent_elr = vcpu_get_sysreg(parent_vcpu, HV_SYS_REG_ELR_EL1); @@ -1085,6 +1120,7 @@ int64_t sys_clone(hv_vcpu_t vcpu, .child_pid = child_guest_pid, .parent_pid = proc_get_pid(), .guest_size = g->guest_size, + .elf_load_min = g->elf_load_min, .brk_base = g->brk_base, .brk_current = g->brk_current, .stack_base = g->stack_base, diff --git a/src/runtime/thread.c b/src/runtime/thread.c index 36d6a2d..e024ec8 100644 --- a/src/runtime/thread.c +++ b/src/runtime/thread.c @@ -21,7 +21,7 @@ #include "runtime/thread.h" #include "debug/log.h" -#include "core/guest.h" /* SHIM_DATA_BASE, BLOCK_2MIB, GUEST_IPA_BASE */ +#include "core/guest.h" /* guest_t (shim_data_base/ipa_base), BLOCK_2MIB */ #include "hvutil.h" /* vcpu_get_gpr, vcpu_get_sysreg */ /* From syscall/signal.h, included here directly to avoid pulling in @@ -38,8 +38,15 @@ static int thread_can_add_deferred_unmap_locked(thread_entry_t *t, uint64_t start, uint64_t end); -/* Top of the EL1 exception stack region (one 4KiB slot per thread) */ -#define SP_EL1_TOP (GUEST_IPA_BASE + SHIM_DATA_BASE + BLOCK_2MIB) +/* Top of the EL1 exception stack region (one 4KiB slot per thread). + * The shim data block sits at high IPA, computed at guest_init time and + * stored in g->shim_data_base; the top of the EL1 stacks is the next + * 2MiB boundary above that. Caller must hold a guest_t reference. + */ +static inline uint64_t sp_el1_top(const guest_t *g) +{ + return g->ipa_base + g->shim_data_base + BLOCK_2MIB; +} /* Thread table. */ @@ -103,6 +110,7 @@ void thread_register_main(hv_vcpu_t vcpu, t->host_thread = pthread_self(); t->clear_child_tid = 0; t->sp_el1 = sp_el1; + t->sp_el1_slot = 0; /* Main thread always owns slot 0 */ t->active = 1; t->altstack_flags = LINUX_SS_DISABLE; t->on_altstack = false; @@ -138,6 +146,7 @@ thread_entry_t *thread_alloc(int64_t tid, pthread_cond_destroy(&t->resume_cond); } memset(t, 0, sizeof(*t)); + t->sp_el1_slot = -1; /* No SP_EL1 yet; thread_alloc_sp_el1 fills this */ t->guest_tid = tid; if (stack_start < stack_end) { t->stack_map_start = stack_start; @@ -156,18 +165,15 @@ thread_entry_t *thread_alloc(int64_t tid, } /* Free an SP_EL1 slot for reuse. Must be called with thread_lock held. - * Derives the slot index from the IPA and clears the bitmask bit. + * Reads the slot index recorded at allocation time and clears the bit. 
*/ -static void thread_free_sp_el1_locked(uint64_t sp) +static void thread_free_sp_el1_locked(thread_entry_t *t) { - if (sp == 0) - return; - uint64_t top = SP_EL1_TOP; - if (sp > top) - return; - int slot = (int) ((top - sp) / 4096); + int slot = t->sp_el1_slot; if (RANGE_CHECK(slot, 0, MAX_THREADS)) sp_el1_allocated &= ~BIT64(slot); + t->sp_el1 = 0; + t->sp_el1_slot = -1; } static void thread_ptrace_cleanup_locked(thread_entry_t *t) @@ -205,7 +211,7 @@ void thread_deactivate(thread_entry_t *t) } /* Free SP_EL1 slot so it can be reused by future threads */ - thread_free_sp_el1_locked(t->sp_el1); + thread_free_sp_el1_locked(t); t->active = 0; atomic_fetch_sub(&active_thread_count, 1); @@ -272,7 +278,7 @@ int thread_count_active_vm_clones(void) return count; } -uint64_t thread_alloc_sp_el1(void) +uint64_t thread_alloc_sp_el1(const guest_t *g, thread_entry_t *t) { uint64_t sp = 0; @@ -284,12 +290,14 @@ log_error("thread: SP_EL1 slots exhausted"); } else { int slot = bit_ctz64(free_mask); - /* Main thread's SP_EL1 = IPA_BASE + SHIM_DATA_BASE + 2MiB. + /* Main thread's SP_EL1 sits at the top of the shim data block. * Each subsequent thread is 4KiB below. */ - uint64_t top = SP_EL1_TOP; + uint64_t top = sp_el1_top(g); sp = top - (uint64_t) slot * 4096; sp_el1_allocated |= BIT64(slot); + t->sp_el1 = sp; + t->sp_el1_slot = slot; } pthread_mutex_unlock(&thread_lock); @@ -357,7 +365,7 @@ void thread_destroy_all_vcpus(void) continue; hv_vcpu_destroy(t->vcpu); t->vcpu = 0; - thread_free_sp_el1_locked(t->sp_el1); + thread_free_sp_el1_locked(t); t->active = 0; /* Do NOT destroy condvars. Same race as thread_deactivate: a waiter * woken by an earlier broadcast may still reference the condvar. */ @@ -892,7 +900,7 @@ int64_t thread_ptrace_wait(int64_t tracer_tid, /* Destroy condvars after the last waiter returns from * pthread_cond_wait(). */ - thread_free_sp_el1_locked(t->sp_el1); + thread_free_sp_el1_locked(t); t->active = 0; atomic_fetch_sub(&active_thread_count, 1); t->ptrace_cleanup_pending = true; diff --git a/src/runtime/thread.h b/src/runtime/thread.h index 25ab5ff..a8d35ab 100644 --- a/src/runtime/thread.h +++ b/src/runtime/thread.h @@ -10,8 +10,8 @@ * O(1) access to the current thread's entry from any syscall handler. * * SP_EL1 allocation: each thread gets a 4KiB EL1 exception stack carved from - * the shim data region (SHIM_DATA_BASE + 2MiB). Thread 0 (main) gets the top, - * thread N gets offset -(N * 4096). + * the shim data region (g->shim_data_base + 2MiB). Thread 0 (main) gets the + * top, thread N gets offset -(N * 4096). */ #pragma once @@ -20,6 +20,8 @@ #include <pthread.h> #include <stdbool.h> #include <stdint.h> + +#include "core/guest.h" /* guest_t (for thread_alloc_sp_el1) */ #include "syscall/abi.h" /* linux_user_pt_regs_t */ /* Maximum number of concurrent guest threads in one VM. */ @@ -34,6 +36,11 @@ typedef struct { pthread_t host_thread; /* macOS host thread running this vCPU */ uint64_t clear_child_tid; /* GVA for CLONE_CHILD_CLEARTID (0=none) */ uint64_t sp_el1; /* Per-thread EL1 stack top (IPA) */ + int sp_el1_slot; /* Slot index in sp_el1_allocated (-1 = none). + * Stored at alloc time so the free path does + * not need to recompute (top - sp) / 4096; the + * shim data block is now at high IPA and only + * known via guest_t. */ int active; /* Non-zero while thread is running. 
* Stays int (not bool) because lock-free paths in thread.c * use __atomic_load_n on this field; the 32-bit width keeps @@ -181,12 +188,15 @@ int thread_active_count(void); /* Fast path: return non-zero when exactly one guest thread is active. */ int thread_is_single_active(void); -/* Allocate a per-thread SP_EL1 value. Thread N gets the Nth 4KiB slot counting - * down from the top of the shim data region. The IPA base (GUEST_IPA_BASE + - * SHIM_DATA_BASE + 2MiB) is the main thread's SP_EL1; each subsequent thread - * subtracts 4KiB. Returns the IPA, or 0 on failure. +/* Allocate a per-thread SP_EL1 stack and record both the IPA and the slot + * index into t. Thread N gets the Nth 4KiB slot counting down from the top + * of the shim data block (g->shim_data_base + 2MiB). The shim block lives + * at high IPA computed by guest_init, so callers must pass g; the slot + * index is stored in t->sp_el1_slot so the free path (which is reached + * from teardown contexts that lack g) can clear the bitmask directly. + * Returns the SP_EL1 IPA, or 0 on slot exhaustion. */ -uint64_t thread_alloc_sp_el1(void); +uint64_t thread_alloc_sp_el1(const guest_t *g, thread_entry_t *t); /* Iterate over all active threads, calling fn(entry, ctx) for each. * Holds the thread table lock during iteration. diff --git a/src/syscall/exec.c b/src/syscall/exec.c index b52366c..ac0e9cc 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -488,7 +488,7 @@ int64_t sys_execve(hv_vcpu_t vcpu, const unsigned char *shim_ptr = proc_get_shim_blob(); unsigned int shim_size = proc_get_shim_size(); if (shim_ptr && shim_size > 0) { - memcpy((uint8_t *) g->host_base + SHIM_BASE, shim_ptr, shim_size); + memcpy((uint8_t *) g->host_base + g->shim_base, shim_ptr, shim_size); } /* Load the executable image that was validated before guest_reset(). */ @@ -501,6 +501,11 @@ int64_t sys_execve(hv_vcpu_t vcpu, exit(128); } + /* Track lowest loaded ELF address for the legacy fork IPC path + * after exec replaces the previous image (see guest_get_used_regions). + */ + g->elf_load_min = elf_info.load_min + elf_load_base; + /* If PT_INTERP was present, map the already-validated interpreter at the * exec-time interp_base. */ @@ -540,7 +545,7 @@ int64_t sys_execve(hv_vcpu_t vcpu, sys_icache_invalidate(host_addr, interp_info.segments[i].memsz); } } - sys_icache_invalidate((uint8_t *) g->host_base + SHIM_BASE, shim_size); + sys_icache_invalidate((uint8_t *) g->host_base + g->shim_base, shim_size); /* Reset brk to the first page after loaded executable data. */ uint64_t brk_start = PAGE_ALIGN_UP(elf_info.load_max + elf_load_base); @@ -574,16 +579,16 @@ int64_t sys_execve(hv_vcpu_t vcpu, /* Keep the shim executable-only; HVF faults on merged RWX mappings. */ if (nregions >= MAX_REGIONS) goto too_many_regions; - regions[nregions++] = (mem_region_t) {.gpa_start = SHIM_BASE, - .gpa_end = SHIM_BASE + shim_size, + regions[nregions++] = (mem_region_t) {.gpa_start = g->shim_base, + .gpa_end = g->shim_base + shim_size, .perms = MEM_PERM_RX}; /* EL1 exception handlers use this block for stack and scratch state. */ if (nregions >= MAX_REGIONS) goto too_many_regions; regions[nregions++] = - (mem_region_t) {.gpa_start = SHIM_DATA_BASE, - .gpa_end = SHIM_DATA_BASE + BLOCK_2MIB, + (mem_region_t) {.gpa_start = g->shim_data_base, + .gpa_end = g->shim_data_base + BLOCK_2MIB, .perms = MEM_PERM_RW}; /* The vDSO sits in the same 2MiB block as the shim. 
The page-table builder @@ -667,10 +672,10 @@ int64_t sys_execve(hv_vcpu_t vcpu, } /* Rebuild /proc/self/maps metadata in parallel with the new page tables. */ - guest_region_add(g, SHIM_BASE, SHIM_BASE + shim_size, + guest_region_add(g, g->shim_base, g->shim_base + shim_size, LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0, "[shim]"); - guest_region_add(g, SHIM_DATA_BASE, SHIM_DATA_BASE + BLOCK_2MIB, + guest_region_add(g, g->shim_data_base, g->shim_data_base + BLOCK_2MIB, LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0, "[shim-data]"); for (int i = 0; i < elf_info.num_segments; i++) { diff --git a/src/syscall/mem.c b/src/syscall/mem.c index 4787745..f1e3eb9 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -1185,10 +1185,13 @@ int64_t sys_mmap(guest_t *g, /* Reject MAP_FIXED targeting VM infrastructure: page table pool, * shim code, and shim data/stack regions. A guest must not be - * able to overwrite EL1 exception vectors or page tables. + * able to overwrite EL1 exception vectors or page tables. The + * reserve sits at high IPA (just below g->interp_base) so the + * range check uses the runtime fields rather than compile-time + * low-memory constants. */ uint64_t fix_end = off + length; - if (off < ELF_DEFAULT_BASE && fix_end > PT_POOL_BASE) + if (guest_range_hits_infra(g, off, fix_end)) return -LINUX_EINVAL; result_off = off; @@ -2414,7 +2417,7 @@ static int compare_range_pair(const void *a, const void *b) static int munmap_guest_range(guest_t *g, uint64_t unmap_off, uint64_t end) { /* Reject munmap targeting VM infrastructure regions. */ - if (unmap_off < ELF_DEFAULT_BASE && end > PT_POOL_BASE) + if (guest_range_hits_infra(g, unmap_off, end)) return -LINUX_EINVAL; /* Restore slab backing under any active MAP_SHARED file overlay before @@ -2565,7 +2568,7 @@ int64_t sys_mprotect(guest_t *g, uint64_t addr, uint64_t length, int prot) /* Reject mprotect targeting VM infrastructure (page tables, shim). * Matches the guard in sys_munmap. */ - if (mprot_off < ELF_DEFAULT_BASE && mprot_end > PT_POOL_BASE) + if (guest_range_hits_infra(g, mprot_off, mprot_end)) return -LINUX_EINVAL; guest_region_set_prot(g, mprot_off, mprot_end, prot); diff --git a/src/syscall/signal.c b/src/syscall/signal.c index 1ea952f..f38e4b2 100644 --- a/src/syscall/signal.c +++ b/src/syscall/signal.c @@ -1457,9 +1457,11 @@ int signal_rt_sigreturn(hv_vcpu_t vcpu, guest_t *g) * signal frame could redirect execution into EL1 code. * Must happen before GPR/SP/PSTATE restore so that a failed check * does not leave the vCPU with partially-attacker-controlled state. + * The infra reserve sits at high IPA (just below g->interp_base); + * use the runtime check rather than compile-time constants. 
*/ uint64_t restored_pc = frame.uc.uc_mcontext.pc; - if (restored_pc >= PT_POOL_BASE && restored_pc < ELF_DEFAULT_BASE) + if (guest_addr_in_infra(g, restored_pc)) return -LINUX_EFAULT; /* Restore all 31 GPRs */ diff --git a/tests/manifest.txt b/tests/manifest.txt index 49136eb..a54078a 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -80,6 +80,7 @@ test-signal-thread [section] Fork edge cases test-clone3 # diff=skip test-fork-exec $TESTDIR/echo-test +test-fork-lowbase [section] COW fork isolation tests test-cow-fork diff --git a/tests/test-fork-lowbase.c b/tests/test-fork-lowbase.c new file mode 100644 index 0000000..4e6e148 --- /dev/null +++ b/tests/test-fork-lowbase.c @@ -0,0 +1,95 @@ +/* Low-base nested fork regression test + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Exercises the legacy fork IPC path from a low-linked ET_EXEC. The child + * forks again, so the grandchild only runs correctly if the intermediate child + * preserved the executable's true low load address when cloning guest state. + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "test-harness.h" + +int passes = 0, fails = 0; + +static void test_binary_is_low_linked(void) +{ + TEST("binary linked below 0x400000"); + uintptr_t pc = (uintptr_t) &test_binary_is_low_linked; + EXPECT_TRUE(pc < 0x400000ULL, "test binary not linked at low address"); +} + +static void test_nested_fork_lowbase(void) +{ + TEST("nested fork from low-base ET_EXEC"); + + int pipefd[2]; + if (pipe(pipefd) != 0) { + FAIL("pipe() failed"); + return; + } + + pid_t child = fork(); + if (child < 0) { + close(pipefd[0]); + close(pipefd[1]); + FAIL("fork() failed"); + return; + } + + if (child == 0) { + close(pipefd[0]); + + pid_t grandchild = fork(); + if (grandchild < 0) + _exit(101); + + if (grandchild == 0) { + char ok = 'G'; + if (write(pipefd[1], &ok, 1) != 1) + _exit(102); + _exit(0); + } + + int status = 0; + if (waitpid(grandchild, &status, 0) != grandchild) + _exit(103); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) + _exit(104); + _exit(0); + } + + close(pipefd[1]); + + char ok = 0; + ssize_t n = read(pipefd[0], &ok, 1); + close(pipefd[0]); + + int status = 0; + if (waitpid(child, &status, 0) != child) { + FAIL("waitpid(child) failed"); + return; + } + + EXPECT_TRUE( + n == 1 && ok == 'G' && WIFEXITED(status) && WEXITSTATUS(status) == 0, + "grandchild did not complete from low-base nested fork"); +} + +int main(void) +{ + printf("test-fork-lowbase: starting\n"); + + test_binary_is_low_linked(); + test_nested_fork_lowbase(); + + SUMMARY("test-fork-lowbase"); + return fails ? 1 : 0; }
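--

Reviewer note, not part of the applied diff: the reserve-placement arithmetic in the commit message can be sanity-checked host-side with a standalone sketch. The program below recomputes interp_base for both supported IPA widths using the same size - 4 GiB formula as guest_init, copies INFRA_RESERVE from guest.h, and mirrors the guest_range_hits_infra overlap test. The file name, helper, and variables here are hypothetical, for illustration only.

/* infra-layout-check.c — standalone sketch, not part of this patch.
 * Recomputes the infra reserve bounds from the constants above and checks
 * the guard behaviour the commit message describes.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INFRA_RESERVE 0x00400000ULL /* 4 MiB, as in guest.h */

/* Mirrors guest_range_hits_infra: true iff [start, end) overlaps the
 * reserve [interp_base - INFRA_RESERVE, interp_base). */
static bool range_hits_infra(uint64_t interp_base, uint64_t start, uint64_t end)
{
    uint64_t lo = interp_base - INFRA_RESERVE;
    return start < interp_base && end > lo;
}

int main(void)
{
    /* 36-bit IPA: guest_size = 64 GiB, interp_base = size - 4 GiB = 60 GiB,
     * so the reserve sits at [60 GiB - 4 MiB, 60 GiB). */
    uint64_t interp36 = (1ULL << 36) - 0x100000000ULL;
    assert(interp36 == 60ULL << 30);

    /* 40-bit IPA: interp_base = 1 TiB - 4 GiB = 1020 GiB. */
    uint64_t interp40 = (1ULL << 40) - 0x100000000ULL;
    assert(interp40 == 1020ULL << 30);

    /* A low-linked ET_EXEC at 0x200000 no longer collides with the runtime. */
    assert(!range_hits_infra(interp36, 0x200000ULL, 0x400000ULL));

    /* MAP_FIXED into the reserve is still rejected, including the 64 KiB
     * null-guard slot at the bottom of the reserve. */
    uint64_t guard = interp36 - INFRA_RESERVE;
    assert(range_hits_infra(interp36, guard, guard + 0x10000ULL));

    printf("infra layout checks pass\n");
    return 0;
}

Building with any host C compiler (e.g. cc -o infra-layout-check infra-layout-check.c) and running it should print the success line; no hypervisor is needed, since only the address arithmetic is exercised.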