Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 74 additions & 19 deletions README.md

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions docs/gdb-workflow.md
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,9 @@ ASAN_OPTIONS=detect_leaks=0 gdb --args ./kbox image -S alpine.ext4 -c /bin/sh
## Coordinated Syscall Tracing

The `kbox-syscall-trace` command sets breakpoints on three points:
1. `kbox_dispatch_syscall` -- seccomp notification entry
2. `lkl_syscall` -- LKL kernel entry
3. `lkl_syscall6` -- LKL wrapper
1. `kbox_dispatch_syscall`: seccomp dispatch entry
2. `lkl_syscall`: LKL kernel entry
3. `lkl_syscall6`: LKL wrapper

On each hit, it prints the syscall number, decoded name, arguments,
virtual FD translation (if applicable), and LKL parameters:
Expand Down
12 changes: 7 additions & 5 deletions docs/syscall-parity-spec.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@ Acceptance test definition for the C rewrite. For each syscall in the MVP
set, documents: arguments, return value, errno, side effects, and any
deviation from the Rust implementation.

Status: all syscalls below are implemented in seccomp_dispatch.c.
Status: all syscalls below are implemented in seccomp-dispatch.c via
kbox_dispatch_request(). The same dispatch engine handles all three
interception tiers (seccomp-unotify, SIGSYS trap, binary rewriting).

## Notation

- `vfd`: virtual file descriptor (4096+), mapped to LKL-internal fd
- `vfd`: virtual file descriptor (32768+), mapped to LKL-internal fd
- `LKL(...)`: forwarded to LKL via lkl_syscall6()
- `CONTINUE`: seccomp_notif_resp with FLAG_CONTINUE (host kernel handles)
- `RETURN(val)`: seccomp_notif_resp with injected return value
- `ERRNO(e)`: seccomp_notif_resp with error = e
- `CONTINUE`: host kernel handles (seccomp: FLAG_CONTINUE; trap: re-issue via asm trampoline)
- `RETURN(val)`: injected return value
- `ERRNO(e)`: return with error = e

---

Expand Down
8 changes: 8 additions & 0 deletions include/kbox/cli.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ enum kbox_mode {
KBOX_MODE_IMAGE,
};

/* Syscall interception path, stored in kbox_image_args.syscall_mode and
 * chosen on the command line with --syscall-mode.  The usage text lists the
 * accepted spellings as "auto" (default), "seccomp", "trap" and "rewrite".
 * NOTE(review): the string-to-enumerator mapping lives in
 * kbox_parse_syscall_mode(), which is not declared in this header.
 */
enum kbox_syscall_mode {
KBOX_SYSCALL_MODE_SECCOMP,
KBOX_SYSCALL_MODE_TRAP,
KBOX_SYSCALL_MODE_REWRITE,
KBOX_SYSCALL_MODE_AUTO, /* value installed by image_defaults() */
};

struct kbox_image_args {
const char *root_dir; /* -r: image file path */
bool recommended; /* -R: enable recommended mounts */
Expand All @@ -34,6 +41,7 @@ struct kbox_image_args {
bool verbose; /* --forward-verbose */
bool net; /* --net: enable SLIRP networking */
enum kbox_mount_profile mount_profile; /* --mount-profile */
enum kbox_syscall_mode syscall_mode; /* --syscall-mode */
bool web; /* --web: enable web observatory */
int web_port; /* --web=PORT (default 8080) */
const char *web_bind; /* --web-bind ADDR */
Expand Down
83 changes: 83 additions & 0 deletions include/kbox/elf.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,88 @@ int kbox_find_elf_interp_loc(const unsigned char *buf,
size_t out_size,
uint64_t *offset_out,
uint64_t *filesz_out);
/* Read an ELF header window from @fd, reporting it through @buf_out and
 * @buf_len_out.
 * NOTE(review): the return convention, how large the window is, and who owns
 * (and must release) *buf_out are not visible from this header; confirm
 * against the definition in elf.c and document them here.
 */
int kbox_read_elf_header_window_fd(int fd,
unsigned char **buf_out,
size_t *buf_len_out);

/* Descriptor for one executable segment, passed by const pointer to the
 * kbox_elf_exec_segment_cb and kbox_elf_exec_segment_header_cb callbacks.
 * NOTE(review): the four fields presumably mirror the ELF program header
 * members p_offset, p_filesz, p_vaddr and p_memsz; confirm in elf.c.
 */
struct kbox_elf_exec_segment {
uint64_t file_offset;
uint64_t file_size;
uint64_t vaddr;
uint64_t mem_size;
};

#define KBOX_ELF_MAX_LOAD_SEGMENTS 16

/* One entry of kbox_elf_load_plan.segments[].
 * NOTE(review): file_offset/file_size/vaddr/mem_size/align/flags presumably
 * come straight from a PT_LOAD program header, and the map_* fields look like
 * mapping parameters derived from them using the page_size argument of
 * kbox_build_elf_load_plan(); confirm against elf.c before relying on this.
 */
struct kbox_elf_load_segment {
uint64_t file_offset;
uint64_t file_size;
uint64_t vaddr;
uint64_t mem_size;
uint64_t align;
uint64_t map_align;
uint64_t map_offset;
uint64_t map_start;
uint64_t map_size;
uint32_t flags;
};

/* Load plan for a single ELF image; kbox_build_elf_load_plan() takes a
 * pointer to one of these as its @plan argument.
 * NOTE(review): machine/type/entry/phoff/phentsize/phnum presumably copy the
 * corresponding ELF header fields, interp_offset/interp_size presumably
 * locate the interpreter path when has_interp is set, and
 * min_vaddr/max_vaddr/load_size presumably span the loadable segments;
 * none of this is established by the header alone, so confirm in elf.c.
 */
struct kbox_elf_load_plan {
uint16_t machine;
uint16_t type;
uint64_t entry;
uint64_t phoff;
uint16_t phentsize;
uint16_t phnum;
uint64_t phdr_vaddr;
uint64_t phdr_size;
uint64_t min_vaddr;
uint64_t max_vaddr;
uint64_t load_size;
uint64_t interp_offset;
uint64_t interp_size;
uint32_t stack_flags;
size_t segment_count; /* presumably the number of used segments[] entries; TODO confirm */
int has_interp;
int pie;
struct kbox_elf_load_segment segments[KBOX_ELF_MAX_LOAD_SEGMENTS]; /* fixed capacity */
};

/* Build a load plan for the ELF image held in @buf (@buf_len bytes), using
 * @page_size, and store it through @plan.
 * NOTE(review): the return convention is not visible in this header; the
 * visitor functions declared here return -1 on malformed ELF, so this likely
 * follows suit, but confirm against elf.c.
 */
int kbox_build_elf_load_plan(const unsigned char *buf,
size_t buf_len,
uint64_t page_size,
struct kbox_elf_load_plan *plan);

/* Callback type accepted by kbox_visit_elf_exec_segments(): receives the
 * segment descriptor, a read-only pointer to the segment's bytes, and the
 * caller-supplied @opaque pointer.
 * NOTE(review): how the visitor interprets the int return value is not
 * visible in this header; confirm in elf.c.
 */
typedef int (*kbox_elf_exec_segment_cb)(const struct kbox_elf_exec_segment *seg,
const unsigned char *segment_bytes,
void *opaque);
/* Callback type accepted by kbox_visit_elf_exec_segment_headers(): receives
 * only the segment descriptor and the caller-supplied @opaque pointer, with
 * no payload bytes.
 */
typedef int (*kbox_elf_exec_segment_header_cb)(
const struct kbox_elf_exec_segment *seg,
void *opaque);

/* Return the ELF machine type (e_machine) for a 64-bit little-endian image. */
int kbox_elf_machine(const unsigned char *buf,
size_t buf_len,
uint16_t *machine_out);

/* Visit every PT_LOAD|PF_X segment with file-backed bytes (read-only).
* The callback receives a const pointer to segment bytes, suitable for
* analysis/scanning but not for in-place rewriting. A mutable variant
* will be needed when actual instruction replacement is implemented.
* Returns the number of visited segments on success or -1 on malformed ELF.
*/
int kbox_visit_elf_exec_segments(const unsigned char *buf,
size_t buf_len,
kbox_elf_exec_segment_cb cb,
void *opaque);

/* Visit executable PT_LOAD segment metadata. Only the ELF header and program
* header table need to be present in @buf; the segment payload bytes are not
* dereferenced.
*/
int kbox_visit_elf_exec_segment_headers(const unsigned char *buf,
size_t buf_len,
kbox_elf_exec_segment_header_cb cb,
void *opaque);

#endif /* KBOX_ELF_H */
20 changes: 18 additions & 2 deletions include/kbox/probe.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,32 @@
#ifndef KBOX_PROBE_H
#define KBOX_PROBE_H

#include "kbox/cli.h"

/* Per-feature results of host probing; kbox_collect_probe_result() takes a
 * pointer to one of these as its @out argument.  The fields correspond to
 * the features named in the probing overview in this header: no_new_privs,
 * seccomp filter support, seccomp-unotify (listener) and process_vm_readv.
 * NOTE(review): presumably non-zero means the feature is available; how
 * probes skipped for a given mode are reported is not visible here.
 */
struct kbox_probe_result {
int no_new_privs_ok;
int seccomp_filter_ok;
int seccomp_listener_ok;
int process_vm_readv_ok;
};

/* Runtime host feature probing.
*
* Verify at startup that the host kernel supports the features kbox depends on.
* Fail fast with a clear diagnostic if any check fails.
*
* All modes require basic seccomp filter support plus no_new_privs.
* SECCOMP and AUTO additionally require seccomp-unotify + process_vm_readv
* because they run the supervisor path today. TRAP and REWRITE skip those
* supervisor-specific probes.
*/

/* Run all probes.
/* Run all probes for the given syscall mode.
* Returns 0 on success, -1 if a required feature is unavailable.
* Prints diagnostics to stderr.
*/
int kbox_probe_host_features(void);
int kbox_probe_host_features(enum kbox_syscall_mode mode);
int kbox_collect_probe_result(enum kbox_syscall_mode mode,
struct kbox_probe_result *out);

#endif /* KBOX_PROBE_H */
16 changes: 16 additions & 0 deletions include/kbox/x86-decode.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/* SPDX-License-Identifier: MIT */
#ifndef KBOX_X86_DECODE_H
#define KBOX_X86_DECODE_H

#include <stddef.h>
#include <stdint.h>

/*
* Minimal x86-64 instruction length decoder.
*
* Used by the rewrite scanner so syscall opcodes are only matched at real
* instruction boundaries, not inside immediates or other instruction bytes.
*
* @code points at the bytes to decode and @max_len bounds how many bytes are
* available.
* NOTE(review): the return value is presumably the decoded instruction
* length in bytes, with some sentinel for undecodable or truncated input;
* the exact convention is not visible in this header, so confirm it in
* x86-decode.c and state it here.
*/
int kbox_x86_insn_length(const unsigned char *code, size_t max_len);

#endif /* KBOX_X86_DECODE_H */
20 changes: 20 additions & 0 deletions mk/features.mk
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,20 @@ SRCS = $(SRC_DIR)/main.c \
$(SRC_DIR)/lkl-wrap.c \
$(SRC_DIR)/fd-table.c \
$(SRC_DIR)/procmem.c \
$(SRC_DIR)/syscall-request.c \
$(SRC_DIR)/syscall-trap.c \
$(SRC_DIR)/path.c \
$(SRC_DIR)/identity.c \
$(SRC_DIR)/elf.c \
$(SRC_DIR)/loader-entry.c \
$(SRC_DIR)/loader-handoff.c \
$(SRC_DIR)/loader-image.c \
$(SRC_DIR)/loader-layout.c \
$(SRC_DIR)/loader-launch.c \
$(SRC_DIR)/loader-stack.c \
$(SRC_DIR)/loader-transfer.c \
$(SRC_DIR)/x86-decode.c \
$(SRC_DIR)/rewrite.c \
$(SRC_DIR)/mount.c \
$(SRC_DIR)/probe.c \
$(SRC_DIR)/image.c \
Expand All @@ -34,7 +45,16 @@ ifeq ($(CONFIG_HAS_SLIRP),y)
CFLAGS += -DKBOX_HAS_SLIRP -I$(SLIRP_DIR)/src
SLIRP_SRCS = $(wildcard $(SLIRP_DIR)/src/*.c)
SLIRP_OBJS = $(SLIRP_SRCS:.c=.o)
SLIRP_CFLAGS = $(filter-out -Wpedantic -Wshadow,$(CFLAGS))
SLIRP_CFLAGS += -Wno-sign-compare -Wno-unused-variable -Wno-comment
SLIRP_CFLAGS += -Wno-return-type -Wno-pedantic
SRCS += $(SLIRP_SRCS)
# Use a directory-specific pattern rule instead of target-specific CFLAGS.
# $(SLIRP_OBJS): CFLAGS := ... would expand SLIRP_OBJS at parse time,
# before deps.mk has cloned minislirp, producing an empty target list.
$(SLIRP_DIR)/src/%.o: $(SLIRP_DIR)/src/%.c
@echo " CC $<"
$(Q)$(CC) $(SLIRP_CFLAGS) -MMD -MP -c -o $@ $<
endif

# Web observatory
Expand Down
39 changes: 36 additions & 3 deletions mk/tests.mk
Original file line number Diff line number Diff line change
@@ -1,20 +1,53 @@
# mk/tests.mk - Test targets (unit, integration, stress, guest binaries)

# Unit test files (no LKL dependency)
# Portable tests (compile on any host):
TEST_DIR = tests/unit
TEST_SRCS = $(TEST_DIR)/test-runner.c \
$(TEST_DIR)/test-fd-table.c \
$(TEST_DIR)/test-path.c \
$(TEST_DIR)/test-identity.c \
$(TEST_DIR)/test-syscall-nr.c \
$(TEST_DIR)/test-elf.c
$(TEST_DIR)/test-elf.c \
$(TEST_DIR)/test-x86-decode.c

# Linux-only tests (depend on inline asm, siginfo_t/ucontext, memfd_create):
ifeq ($(shell uname -s),Linux)
TEST_SRCS += $(TEST_DIR)/test-rewrite.c \
$(TEST_DIR)/test-procmem.c \
$(TEST_DIR)/test-syscall-request.c \
$(TEST_DIR)/test-syscall-trap.c \
$(TEST_DIR)/test-loader-entry.c \
$(TEST_DIR)/test-loader-handoff.c \
$(TEST_DIR)/test-loader-image.c \
$(TEST_DIR)/test-loader-layout.c \
$(TEST_DIR)/test-loader-launch.c \
$(TEST_DIR)/test-loader-stack.c \
$(TEST_DIR)/test-loader-transfer.c
endif

# Unit tests link only the pure-computation sources (no LKL)
TEST_SUPPORT_SRCS = $(SRC_DIR)/fd-table.c \
$(SRC_DIR)/path.c \
$(SRC_DIR)/identity.c \
$(SRC_DIR)/syscall-nr.c \
$(SRC_DIR)/elf.c
$(SRC_DIR)/elf.c \
$(SRC_DIR)/x86-decode.c

ifeq ($(shell uname -s),Linux)
TEST_SUPPORT_SRCS += $(SRC_DIR)/rewrite.c \
$(TEST_DIR)/test-seccomp-stubs.c \
$(SRC_DIR)/procmem.c \
$(SRC_DIR)/syscall-request.c \
$(SRC_DIR)/syscall-trap.c \
$(SRC_DIR)/loader-entry.c \
$(SRC_DIR)/loader-handoff.c \
$(SRC_DIR)/loader-image.c \
$(SRC_DIR)/loader-layout.c \
$(SRC_DIR)/loader-launch.c \
$(SRC_DIR)/loader-stack.c \
$(SRC_DIR)/loader-transfer.c
endif

TEST_TARGET = tests/unit/test-runner

Expand Down Expand Up @@ -43,7 +76,7 @@ check-unit: $(TEST_TARGET)
# We define LKL stubs for functions referenced by test support code.
$(TEST_TARGET): $(TEST_SRCS) $(TEST_SUPPORT_SRCS) $(wildcard .config)
@echo " LD $@"
$(Q)$(CC) $(CFLAGS) -DKBOX_UNIT_TEST -o $@ $(TEST_SRCS) $(TEST_SUPPORT_SRCS) $(LDFLAGS)
$(Q)$(CC) $(CFLAGS) -DKBOX_UNIT_TEST -o $@ $(TEST_SRCS) $(TEST_SUPPORT_SRCS) $(LDFLAGS) -lpthread

check-integration: $(TARGET) guest-bins stress-bins $(ROOTFS)
@echo " RUN check-integration"
Expand Down
6 changes: 6 additions & 0 deletions scripts/pre-commit.hook
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,20 @@ cppcheck_suppressions() {
"unusedFunction"
"syntaxError"
"constParameterPointer"
"constParameterCallback"
"constVariablePointer"
"constParameter"
"unusedStructMember"
"redundantAssignment"
"staticFunction"
"checkLevelNormal"
"variableScope"
"compareValueOutOfTypeRangeError"
"constVariable"
"knownConditionTrueFalse"
"unreadVariable"
"redundantInitialization"
"shadowVariable"
)

local out="--inline-suppr "
Expand Down
15 changes: 15 additions & 0 deletions src/cli.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <string.h>

#include "kbox/cli.h"
#include "rewrite.h"

/* Long option codes for options without short equivalents */
enum {
Expand All @@ -15,6 +16,7 @@ enum {
OPT_NET,
OPT_WEB,
OPT_WEB_BIND,
OPT_SYSCALL_MODE,
OPT_TRACE_FORMAT,
OPT_HELP,
};
Expand All @@ -38,6 +40,7 @@ static const struct option image_longopts[] = {
{"net", no_argument, NULL, OPT_NET},
{"web", optional_argument, NULL, OPT_WEB},
{"web-bind", required_argument, NULL, OPT_WEB_BIND},
{"syscall-mode", required_argument, NULL, OPT_SYSCALL_MODE},
{"trace-format", required_argument, NULL, OPT_TRACE_FORMAT},
{"help", no_argument, NULL, OPT_HELP},
{NULL, 0, NULL, 0},
Expand Down Expand Up @@ -71,6 +74,8 @@ void kbox_usage(const char *argv0)
" --forward-verbose Verbose syscall forwarding\n"
" --net Enable SLIRP user-mode networking\n"
" --mount-profile P Mount profile: full (default), minimal\n"
" --syscall-mode MODE Syscall path: auto (default), "
"seccomp, trap, rewrite\n"
" --web[=PORT] Enable web observatory (default: 8080)\n"
" --web-bind ADDR Bind address for web (default: "
"127.0.0.1)\n"
Expand All @@ -88,6 +93,7 @@ static void image_defaults(struct kbox_image_args *img)
img->command = "/bin/sh";
img->cmdline = "mem=1024M loglevel=4";
img->mount_profile = KBOX_MOUNT_FULL;
img->syscall_mode = KBOX_SYSCALL_MODE_AUTO;
}

static int parse_image_args(int argc,
Expand Down Expand Up @@ -205,6 +211,15 @@ static int parse_image_args(int argc,
return -1;
#endif
break;
case OPT_SYSCALL_MODE:
if (kbox_parse_syscall_mode(optarg, &img->syscall_mode) < 0) {
fprintf(stderr,
"unknown syscall mode: %s "
"(use 'seccomp', 'trap', 'rewrite', or 'auto')\n",
optarg);
return -1;
}
break;
case OPT_TRACE_FORMAT:
#ifdef KBOX_HAS_WEB
if (strcmp(optarg, "json") != 0) {
Expand Down
Loading
Loading