7 changes: 7 additions & 0 deletions Makefile
@@ -164,6 +164,13 @@ $(BUILD_DIR)/test-futex-waitv: tests/test-futex-waitv.c | $(BUILD_DIR)
@echo " CROSS $< (with -lpthread)"
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread

# test-fork-lowbase must be a non-PIE ET_EXEC linked below ELF_DEFAULT_BASE so
# nested forks exercise elf_load_min preservation across fork IPC.
$(BUILD_DIR)/test-fork-lowbase: tests/test-fork-lowbase.c | $(BUILD_DIR)
@echo " CROSS $< (low-base ET_EXEC)"
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -no-pie \
-Wl,-Ttext-segment=0x200000 -o $@ $<

endif

include mk/tests.mk
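The test source itself is not part of this diff. As a hedged sketch of what tests/test-fork-lowbase.c presumably does (names and exit codes here are illustrative; only the Makefile rule above is real): linked at 0x200000 by -Wl,-Ttext-segment, its data sits below ELF_DEFAULT_BASE, and the nested fork forces the legacy fork IPC copy to run again from a child whose elf_load_min must have survived the first fork.

    #include <stdio.h>
    #include <string.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static char low_data[64] = "parent"; /* .data lands below 0x400000 */

    int main(void)
    {
        pid_t c1 = fork();
        if (c1 < 0)
            return 1;
        if (c1 == 0) {
            /* Child: the fork IPC copy must have included the low segments. */
            if (strcmp(low_data, "parent") != 0)
                _exit(2);
            pid_t c2 = fork(); /* nested fork re-runs the copy from the child */
            if (c2 < 0)
                _exit(3);
            if (c2 == 0)
                _exit(strcmp(low_data, "parent") != 0 ? 4 : 0);
            int st;
            waitpid(c2, &st, 0);
            _exit(WIFEXITED(st) ? WEXITSTATUS(st) : 5);
        }
        int st;
        waitpid(c1, &st, 0);
        int ok = WIFEXITED(st) && WEXITSTATUS(st) == 0;
        printf("test-fork-lowbase: %s\n", ok ? "PASS" : "FAIL");
        return !ok;
    }

Built -static -no-pie as in the rule above, readelf -h should report type EXEC with an entry point near 0x200000.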
28 changes: 17 additions & 11 deletions src/core/bootstrap.c
@@ -134,10 +134,10 @@ static bool build_boot_regions(mem_region_t *regions,
* to the vDSO page when splitting the block; otherwise vdso_build cannot
* write into it through guest_ptr.
*/
if (!append_boot_region(regions, nregions, SHIM_BASE,
SHIM_BASE + shim_bin_len, MEM_PERM_RX) ||
!append_boot_region(regions, nregions, SHIM_DATA_BASE,
SHIM_DATA_BASE + BLOCK_2MIB, MEM_PERM_RW) ||
if (!append_boot_region(regions, nregions, g->shim_base,
g->shim_base + shim_bin_len, MEM_PERM_RX) ||
!append_boot_region(regions, nregions, g->shim_data_base,
g->shim_data_base + BLOCK_2MIB, MEM_PERM_RW) ||
!append_boot_region(regions, nregions, VDSO_BASE, VDSO_BASE + VDSO_SIZE,
MEM_PERM_RX)) {
return false;
@@ -234,6 +234,11 @@ int guest_bootstrap_prepare(guest_t *g,
return -1;
}

/* Track the lowest loaded ELF address so the legacy fork IPC path
* copies low-linked ET_EXECs (e.g. linked at 0x200000) in full.
*/
g->elf_load_min = boot->elf_info.load_min + boot->elf_load_base;

g->brk_base = PAGE_ALIGN_UP(boot->elf_info.load_max + boot->elf_load_base);
if (g->brk_base < BRK_BASE_DEFAULT)
g->brk_base = BRK_BASE_DEFAULT;
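/* Worked example (values illustrative): a non-PIE ET_EXEC linked at
 * 0x200000 loads with elf_load_base = 0 and elf_info.load_min = 0x200000,
 * so elf_load_min = 0x200000, below the old fixed ELF_DEFAULT_BASE of
 * 0x400000 that the fork IPC copy previously assumed. A PIE with
 * load_min = 0 relocated to a nonzero elf_load_base ends up at that base,
 * as before.
 */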
@@ -252,15 +257,16 @@ int guest_bootstrap_prepare(guest_t *g,
return -1;
}

memcpy((uint8_t *) g->host_base + SHIM_BASE, shim_bin, shim_bin_len);
memcpy((uint8_t *) g->host_base + g->shim_base, shim_bin, shim_bin_len);
log_debug("shim loaded at offset 0x%llx (%zu bytes)",
(unsigned long long) SHIM_BASE, shim_bin_len);
(unsigned long long) g->shim_base, shim_bin_len);

invalidate_exec_segments(&boot->elf_info, g->host_base,
boot->elf_load_base);
invalidate_exec_segments(&boot->interp_info, g->host_base,
boot->interp_base);
sys_icache_invalidate((uint8_t *) g->host_base + SHIM_BASE, shim_bin_len);
sys_icache_invalidate((uint8_t *) g->host_base + g->shim_base,
shim_bin_len);

if (!build_boot_regions(regions, &nregions, g, boot, shim_bin_len)) {
log_error("too many memory regions (%d >= %d)", nregions,
@@ -275,10 +281,10 @@ }
}
g->need_tlbi = true;

guest_region_add(g, SHIM_BASE, SHIM_BASE + shim_bin_len,
guest_region_add(g, g->shim_base, g->shim_base + shim_bin_len,
LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0,
"[shim]");
guest_region_add(g, SHIM_DATA_BASE, SHIM_DATA_BASE + BLOCK_2MIB,
guest_region_add(g, g->shim_data_base, g->shim_data_base + BLOCK_2MIB,
LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0,
"[shim-data]");

Expand Down Expand Up @@ -380,10 +386,10 @@ int guest_bootstrap_create_vcpu(guest_t *g,
uint64_t sctlr;
uint64_t sctlr_with_mmu;
uint64_t tcr_value = TCR_EL1_VALUE;
uint64_t shim_ipa = guest_ipa(g, SHIM_BASE);
uint64_t shim_ipa = guest_ipa(g, g->shim_base);
uint64_t entry_ipa = guest_ipa(g, boot->entry_point);
uint64_t sp_ipa = guest_ipa(g, boot->stack_pointer);
uint64_t el1_sp = guest_ipa(g, SHIM_DATA_BASE + BLOCK_2MIB);
uint64_t el1_sp = guest_ipa(g, g->shim_data_base + BLOCK_2MIB);
hv_vcpu_t vcpu;
hv_vcpu_exit_t *vexit;

130 changes: 87 additions & 43 deletions src/core/guest.c
@@ -118,28 +118,58 @@ static void guest_region_clip_overlay(guest_region_t *r)
r->overlay_end = overlay_end;
}

/* Compute infra reserve placement from guest_size and store derived fields in
* @g. Called from guest_init and guest_init_from_shm.
*
* Layout: a 4MiB region anchored at [interp_base - INFRA_RESERVE, interp_base)
* sits in the dead zone between mmap_limit and interp_base. PT pool, shim, and
* shim data fall at fixed offsets within the reserve (see guest.h).
*
* Returns 0 on success, -1 if the layout cannot be derived (interp_base too
* small to fit the reserve). Today guest_init enforces a 64GiB minimum so the
* underflow path is unreachable, but the explicit check guards future
* configurations and any IPC restore that bypasses size selection.
*/
static int compute_infra_layout(guest_t *g)
{
if (g->interp_base < INFRA_RESERVE) {
log_error(
"guest: interp_base 0x%llx smaller than INFRA_RESERVE (0x%llx); "
"guest_size too small",
(unsigned long long) g->interp_base,
(unsigned long long) INFRA_RESERVE);
return -1;
}
uint64_t infra_base = g->interp_base - INFRA_RESERVE;
g->pt_pool_base = infra_base + INFRA_PT_POOL_OFF;
g->pt_pool_end = infra_base + INFRA_PT_POOL_END_OFF;
g->shim_base = infra_base + INFRA_SHIM_OFF;
g->shim_data_base = infra_base + INFRA_SHIM_DATA_OFF;
return 0;
}
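/* Worked example (the INFRA_* offsets live in guest.h and are not shown in
 * this diff): with guest_size = 64GiB = 0x1000000000,
 *
 *   interp_base = 0x1000000000 - 0x100000000 = 0xF00000000
 *   infra_base  = 0xF00000000 - INFRA_RESERVE (4MiB) = 0xEFFC00000
 *
 * pt_pool_base, shim_base, and shim_data_base then sit at fixed offsets
 * inside [infra_base, interp_base). mmap_limit is 4GiB below interp_base,
 * so guest mmaps can never collide with the reserve.
 */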

/* Allocate a zeroed 4KiB page from the page table pool.
* Returns GPA of the page, or 0 on pool exhaustion.
* Acquires pt_lock internally. Caller typically holds mmap_lock.
*/
static uint64_t pt_alloc_page(guest_t *g)
{
pthread_mutex_lock(&pt_lock);
if (g->pt_pool_next + PAGE_SIZE > PT_POOL_END) {
if (g->pt_pool_next + PAGE_SIZE > g->pt_pool_end) {
log_error(
"guest: page table pool exhausted "
"(used %llu / %llu bytes)",
(unsigned long long) (g->pt_pool_next - PT_POOL_BASE),
(unsigned long long) (PT_POOL_END - PT_POOL_BASE));
(unsigned long long) (g->pt_pool_next - g->pt_pool_base),
(unsigned long long) (g->pt_pool_end - g->pt_pool_base));
pthread_mutex_unlock(&pt_lock);
return 0;
}
uint64_t gpa = g->pt_pool_next;
g->pt_pool_next += PAGE_SIZE;

/* Warn at 80% pool usage so users can anticipate exhaustion */
uint64_t used = gpa + PAGE_SIZE - PT_POOL_BASE;
uint64_t total = PT_POOL_END - PT_POOL_BASE;
uint64_t used = gpa + PAGE_SIZE - g->pt_pool_base;
uint64_t total = g->pt_pool_end - g->pt_pool_base;
if (!pt_pool_warned && used > (total * 4 / 5)) {
log_debug(
"guest: page table pool at %llu%% "
@@ -149,8 +179,8 @@ static uint64_t pt_alloc_page(guest_t *g)
pt_pool_warned = true;
}

/* Zero the page while still holding the lock so no other thread can
* observe a partially-zeroed page table page.
/* Zero the page while still holding the lock so no other thread can observe
* a partially-zeroed page table page.
*/
memset((uint8_t *) g->host_base + gpa, 0, PAGE_SIZE);
pthread_mutex_unlock(&pt_lock);
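/* Typical caller shape (editorial sketch; PT_DESC_TABLE stands in for
 * whatever table-descriptor bits guest_build_page_tables actually uses, and
 * l0/l0_idx come from the surrounding walk):
 *
 *   uint64_t l1_gpa = pt_alloc_page(g);
 *   if (!l1_gpa)
 *       return 0;                     // pool exhausted, abort the walk
 *   l0[l0_idx] = l1_gpa | PT_DESC_TABLE;
 *   uint64_t *l1 = pt_at(g, l1_gpa);  // host pointer to the new table
 */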
@@ -170,15 +200,15 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
memset(g, 0, sizeof(*g));
g->shm_fd = -1;
g->ipa_base = GUEST_IPA_BASE;
g->pt_pool_next = PT_POOL_BASE;
g->elf_load_min = ELF_DEFAULT_BASE;
g->brk_base = BRK_BASE_DEFAULT;
g->brk_current = BRK_BASE_DEFAULT;
g->mmap_next = MMAP_BASE;
g->mmap_rx_next = MMAP_RX_BASE;

/* Query the maximum IPA size supported by the hardware/kernel. macOS 15+
* on Apple Silicon reports 40 bits (1TiB). Older versions or fallback
* yields 36 bits (64GiB).
/* Query the maximum IPA size supported by the hardware/kernel. macOS 15+ on
* Apple Silicon reports 40 bits (1TiB). Older versions or fallback yields
* 36 bits (64GiB).
*/
uint32_t max_ipa = 0;
hv_vm_config_get_max_ipa_size(&max_ipa);
@@ -216,6 +246,9 @@
*/
g->interp_base = g->guest_size - 0x100000000ULL;
g->mmap_limit = g->guest_size - 0x200000000ULL;
if (compute_infra_layout(g) < 0)
return -1;
g->pt_pool_next = g->pt_pool_base;

/* Reserve primary address space via mmap(MAP_ANON). macOS demand-pages
* this: physical pages are allocated only on first touch, so reserving up
@@ -238,12 +271,12 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
* The child maps it MAP_PRIVATE, giving it an instant copy-on-write
* clone of all guest memory.
*
* macOS rejects MAP_PRIVATE on shm_open objects (EINVAL), but regular
* file fds support MAP_SHARED, MAP_PRIVATE, and MAP_PRIVATE|MAP_FIXED
* correctly. The file is unlinked immediately; the fd keeps it alive.
* macOS demand-pages file mappings, so untouched pages cost nothing.
* If any step fails, guest memory silently keeps the MAP_ANON mapping and
* falls back to the IPC region-copy path on fork.
* macOS rejects MAP_PRIVATE on shm_open objects (EINVAL), but regular file
* fds support MAP_SHARED, MAP_PRIVATE, and MAP_PRIVATE|MAP_FIXED correctly.
* The file is unlinked immediately; the fd keeps it alive. macOS
* demand-pages file mappings, so untouched pages cost nothing. If any step
* fails, guest memory silently keeps the MAP_ANON mapping and falls back to
* the IPC region-copy path on fork.
*/
{
char tmppath[] = "/tmp/elfuse-XXXXXX";
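/* Editorial sketch of the mechanism described above, in isolation (hedged:
 * mkstemp is implied by the XXXXXX template; error handling and the
 * MAP_FIXED remap of the existing reservation are elided):
 *
 *   int fd = mkstemp(tmppath);      // regular file fd, not shm_open
 *   unlink(tmppath);                // name gone; the fd keeps it alive
 *   ftruncate(fd, size);
 *   // Parent: MAP_SHARED so guest writes reach the file pages.
 *   mmap(base, size, PROT_READ | PROT_WRITE,
 *        MAP_SHARED | MAP_FIXED, fd, 0);
 *   // Child (fd inherited across fork): MAP_PRIVATE gives an instant
 *   // copy-on-write clone of the parent's snapshot.
 *   mmap(base, size, PROT_READ | PROT_WRITE,
 *        MAP_PRIVATE | MAP_FIXED, fd, 0);
 */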
@@ -322,6 +355,11 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
g->guest_size = size;
g->interp_base = size - 0x100000000ULL;
g->mmap_limit = size - 0x200000000ULL;
if (compute_infra_layout(g) < 0) {
hv_vm_destroy();
return -1;
}
g->pt_pool_next = g->pt_pool_base;
g->host_base = mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_ANON | MAP_PRIVATE, -1, 0);
if (g->host_base == MAP_FAILED) {
@@ -362,7 +400,7 @@ int guest_init_from_shm(guest_t *g,
memset(g, 0, sizeof(*g));
g->shm_fd = -1; /* Child does not own the shm */
g->ipa_base = GUEST_IPA_BASE;
g->pt_pool_next = PT_POOL_BASE;
g->elf_load_min = ELF_DEFAULT_BASE;
g->brk_base = BRK_BASE_DEFAULT;
g->brk_current = BRK_BASE_DEFAULT;
g->mmap_next = MMAP_BASE;
@@ -373,6 +411,9 @@
/* Compute layout limits (same formula as guest_init) */
g->interp_base = size - 0x100000000ULL;
g->mmap_limit = size - 0x200000000ULL;
if (compute_infra_layout(g) < 0)
return -1;
g->pt_pool_next = g->pt_pool_base;

/* Map the shm fd MAP_PRIVATE: copy-on-write semantics. Reads see
* the parent's frozen snapshot; writes are private to this process.
@@ -836,21 +877,21 @@ void guest_reset(guest_t *g)
}

/* Zero page table pool (not tracked in region array) */
if (g->pt_pool_next > PT_POOL_BASE)
memset((uint8_t *) g->host_base + PT_POOL_BASE, 0,
g->pt_pool_next - PT_POOL_BASE);
if (g->pt_pool_next > g->pt_pool_base)
memset((uint8_t *) g->host_base + g->pt_pool_base, 0,
g->pt_pool_next - g->pt_pool_base);

/* Zero shim code + data (not tracked in region array by guest_reset
* callers; shim regions are added AFTER reset by the exec path)
*/
memset((uint8_t *) g->host_base + SHIM_BASE, 0,
SHIM_DATA_BASE + BLOCK_2MIB - SHIM_BASE);
memset((uint8_t *) g->host_base + g->shim_base, 0,
g->shim_data_base + BLOCK_2MIB - g->shim_base);

/* Reset allocation state */
guest_pt_gen_bump(g);
guest_tlb_flush();
__atomic_store_n(&pt_pool_warned, false, __ATOMIC_RELAXED);
g->pt_pool_next = PT_POOL_BASE;
g->pt_pool_next = g->pt_pool_base;
g->brk_base = BRK_BASE_DEFAULT;
g->brk_current = BRK_BASE_DEFAULT;
g->mmap_next = MMAP_BASE;
@@ -861,6 +902,7 @@ void guest_reset(guest_t *g)
g->mmap_rx_gap_hint = 0;
g->ttbr0 = 0;
g->need_tlbi = false;
g->elf_load_min = ELF_DEFAULT_BASE;

/* Clear semantic region tracking (will be re-populated after exec) */
guest_region_clear(g);
@@ -875,34 +917,36 @@ int guest_get_used_regions(const guest_t *g,
{
int n = 0;

/* Page table pool */
if (n < max && g->pt_pool_next > PT_POOL_BASE) {
out[n].offset = PT_POOL_BASE;
out[n].size = g->pt_pool_next - PT_POOL_BASE;
/* Page table pool (high IPA, just below interp_base) */
if (n < max && g->pt_pool_next > g->pt_pool_base) {
out[n].offset = g->pt_pool_base;
out[n].size = g->pt_pool_next - g->pt_pool_base;
n++;
}

/* Shim code */
/* Shim code (high IPA) */
if (n < max && shim_size > 0) {
out[n].offset = SHIM_BASE;
out[n].offset = g->shim_base;
out[n].size = shim_size;
n++;
}

/* Shim data/stack (full 2MiB block) */
/* Shim data/stack (full 2MiB block, high IPA) */
if (n < max) {
out[n].offset = SHIM_DATA_BASE;
out[n].offset = g->shim_data_base;
out[n].size = BLOCK_2MIB;
n++;
}

/* ELF + brk region: from ELF_DEFAULT_BASE to brk_current.
* guest memory does not track the exact ELF load range, but static musl
* binaries always load at or above ELF_DEFAULT_BASE (0x400000).
/* ELF + brk region: from elf_load_min (set by ELF loader) to brk_current.
* The lower bound is the actual ELF load address, not ELF_DEFAULT_BASE:
* ET_EXECs linked below 0x400000 (e.g. at 0x200000) have segments below the
* legacy default and would otherwise be silently dropped from the legacy
* fork-IPC copy.
*/
if (n < max && g->brk_current > ELF_DEFAULT_BASE) {
out[n].offset = ELF_DEFAULT_BASE;
out[n].size = g->brk_current - ELF_DEFAULT_BASE;
if (n < max && g->brk_current > g->elf_load_min) {
out[n].offset = g->elf_load_min;
out[n].size = g->brk_current - g->elf_load_min;
n++;
}

@@ -1167,9 +1211,9 @@ void guest_region_remove(guest_t *g, uint64_t start, uint64_t end)
right->offset += (end - r->start);
right->start = end;
if (r->backing_fd >= 0) {
/* A dup failure leaves backing_fd=-1, silently converting
* this half to anonymous semantics (msync and MADV_DONTNEED
* skip regions with backing_fd<0). Propagating the error would
/* A dup failure leaves backing_fd=-1, silently converting this
* half to anonymous semantics (msync and MADV_DONTNEED skip
* regions with backing_fd<0). Propagating the error would
* require making all region split callers (mprotect, munmap)
* fallible.
*/
@@ -1513,8 +1557,8 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n)
uint64_t *l0 = pt_at(g, l0_gpa);

/* For each region, determine which 2MiB blocks need mapping.
* Identity-mapped: VA == GPA, so L0/L1/L2 indices and the block
* descriptor output address are both derived from gpa_start + ipa_base.
* Identity-mapped: VA == GPA, so L0/L1/L2 indices and the block descriptor
* output address are both derived from gpa_start + ipa_base.
*/
for (int r = 0; r < n; r++) {
uint64_t gpa_start = ALIGN_2MIB_DOWN(regions[r].gpa_start);
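For reference, the index math the identity-map loop relies on is standard AArch64 4KiB-granule translation; the sketch below is editorial, not code from this diff:

    #include <stdint.h>

    /* Illustrative only: table-index extraction for a 48-bit VA, 4KiB
     * granule, with 2MiB L2 block mappings as used by
     * guest_build_page_tables.
     */
    static inline void pt_indices(uint64_t va,
                                  unsigned *l0, unsigned *l1, unsigned *l2)
    {
        *l0 = (va >> 39) & 0x1ff; /* bits 47:39, 512GiB per L0 entry */
        *l1 = (va >> 30) & 0x1ff; /* bits 38:30, 1GiB per L1 entry */
        *l2 = (va >> 21) & 0x1ff; /* bits 29:21, 2MiB per L2 block */
    }

Because VA == GPA here, gpa_start + ipa_base feeds both the index extraction and the block descriptor's output address, which is why the loop needs no separate VA bookkeeping.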