From f18e9c4fed24d454ebc220ab30d65e3d4789645b Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Mon, 11 May 2026 19:15:29 +0000 Subject: [PATCH] Add aarch64/Graviton support to PyPerf Add architecture-specific code paths for aarch64 in the PyPerf eBPF Python profiler. This removes the 'Unsupported platform' error on aarch64 and enables PyPerf to profile Python processes on ARM64/Graviton. Changes: - PyPerfBPFProgram.cc: aarch64 TLS access via TPIDR_EL0, user-mode detection via pstate, register mapping (pc/x29), no red zone - PyPerfNativeStackTrace.cc: libunwind aarch64 register handling - PyPerf.cc/PyPerfProfiler: --pthread-struct-size argument - CMakeLists.txt: conditional libunwind-aarch64 linking Tested on AWS Graviton (m9g.metal-48xl), kernel 6.17, Ubuntu 24.04, glibc 2.39, Python 3.12. Co-developed-by: Kiro (Amazon Q Developer Agent) Signed-off-by: Ali Saidi --- examples/cpp/pyperf/CMakeLists.txt | 7 +- examples/cpp/pyperf/PyPerf.cc | 4 +- examples/cpp/pyperf/PyPerfBPFProgram.cc | 69 ++++++++++++++++++- examples/cpp/pyperf/PyPerfNativeStackTrace.cc | 34 ++++++++- examples/cpp/pyperf/PyPerfProfiler.cc | 5 +- examples/cpp/pyperf/PyPerfProfiler.h | 2 +- 6 files changed, 114 insertions(+), 7 deletions(-) diff --git a/examples/cpp/pyperf/CMakeLists.txt b/examples/cpp/pyperf/CMakeLists.txt index 271ff80a6e99..35b08d7ca4e2 100644 --- a/examples/cpp/pyperf/CMakeLists.txt +++ b/examples/cpp/pyperf/CMakeLists.txt @@ -19,7 +19,12 @@ add_executable(PyPerf PyOffsets.cc PyPerfNativeStackTrace.cc ) -target_link_libraries(PyPerf pthread libunwind-ptrace.a libunwind-x86_64.a libunwind.a lzma) +target_link_libraries(PyPerf pthread libunwind-ptrace.a libunwind.a lzma) +if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + target_link_libraries(PyPerf libunwind-aarch64.a) +else() + target_link_libraries(PyPerf libunwind-x86_64.a) +endif() if(NOT CMAKE_USE_LIBBPF_PACKAGE) target_link_libraries(PyPerf bcc-static) else() diff --git 
a/examples/cpp/pyperf/PyPerf.cc b/examples/cpp/pyperf/PyPerf.cc index 2fee5e68c1c3..61fd5d72a925 100644 --- a/examples/cpp/pyperf/PyPerf.cc +++ b/examples/cpp/pyperf/PyPerf.cc @@ -118,6 +118,7 @@ int main(int argc, char** argv) { std::string output = ""; uint64_t fsOffset = 0; uint64_t stackOffset = 0; + uint64_t pthreadStructSize = 0; bool insertDsoName = false; while (true) { @@ -138,6 +139,7 @@ int main(int argc, char** argv) { found = found || parseStrArg({"-o", "--output"}, output); found = found || parseIntArg({"--fs-offset"}, fsOffset); found = found || parseIntArg({"--stack-offset"}, stackOffset); + found = found || parseIntArg({"--pthread-struct-size"}, pthreadStructSize); found = found || parseFlag({"--insert-dso-name"}, insertDsoName); if (!found) { std::fprintf(stderr, "Unexpected argument: %s\n", argv[pos]); @@ -182,7 +184,7 @@ int main(int argc, char** argv) { ebpf::pyperf::PyPerfProfiler profiler; profiler.update_interval = std::chrono::seconds{updateIntervalSecs}; - auto res = profiler.init(symbolsMapSize, eventsBufferPages, kernelStacksMapSize, userStacksPages, fsOffset, stackOffset, insertDsoName); + auto res = profiler.init(symbolsMapSize, eventsBufferPages, kernelStacksMapSize, userStacksPages, fsOffset, stackOffset, insertDsoName, pthreadStructSize); if (res != ebpf::pyperf::PyPerfProfiler::PyPerfResult::SUCCESS) { std::exit((int)res); } diff --git a/examples/cpp/pyperf/PyPerfBPFProgram.cc b/examples/cpp/pyperf/PyPerfBPFProgram.cc index cb6984fea5ac..2d9d0046b303 100644 --- a/examples/cpp/pyperf/PyPerfBPFProgram.cc +++ b/examples/cpp/pyperf/PyPerfBPFProgram.cc @@ -256,9 +256,34 @@ get_task_thread_id(struct task_struct const *task, enum pthreads_impl pthreads_i return ERROR_NONE; -#else // __x86_64__ +#elif defined(__aarch64__) + // On aarch64, the TLS base is stored in task->thread.uw.tp_value (TPIDR_EL0). + // FS_OFS here is the offset of tp_value within task_struct, discovered at runtime + // by the get_tp_offset helper. 
+ uint64_t tp_value; + bpf_probe_read_kernel(&tp_value, sizeof(tp_value), (u8*)task + FS_OFS); + + // On aarch64 glibc: pthread_self() = tp_value - sizeof(struct pthread) + // On aarch64 musl: pthread_self() = tp_value - sizeof(struct __pthread) + // PTHREAD_STRUCT_SIZE is passed as a compile-time define by the driver. + switch (pthreads_impl) { + case PTI_GLIBC: + case PTI_MUSL: + *thread_id = tp_value - PTHREAD_STRUCT_SIZE; + break; + default: + return ERROR_INVALID_PTHREADS_IMPL; + } + + if (tp_value == 0) { + return ERROR_BAD_FSBASE; + } + + return ERROR_NONE; + +#else #error "Unsupported platform" -#endif // __x86_64__ +#endif } // this function is trivial, but we need to do map lookup in separate function, @@ -320,6 +345,7 @@ on_event(struct pt_regs* ctx) { // Get raw native user stack struct pt_regs user_regs; +#ifdef __x86_64__ // ebpf doesn't allow direct access to ctx->cs, so we need to copy it int cs; bpf_probe_read_kernel(&cs, sizeof(cs), &(ctx->cs)); @@ -352,6 +378,45 @@ on_event(struct pt_regs* ctx) { // Subtract 128 from sp for x86-ABI red zone uintptr_t top_of_stack = user_regs.sp - 128; +#elif defined(__aarch64__) + // On aarch64, BCC's compat header defines an x86_64 struct pt_regs. 
+ // We need to reinterpret ctx as the actual aarch64 user_pt_regs layout: + // u64 regs[31]; // offset 0, size 248 + // u64 sp; // offset 248 + // u64 pc; // offset 256 + // u64 pstate; // offset 264 + uint64_t *raw = (uint64_t *)ctx; + uint64_t user_sp_val, user_pc_val, user_fp_val, pstate; + + // Read pstate (index 33 = offset 264) + bpf_probe_read_kernel(&pstate, sizeof(pstate), &raw[33]); + + if ((pstate & 0xf) == 0) { + // EL0 - user mode context + bpf_probe_read_kernel(&user_sp_val, sizeof(user_sp_val), &raw[31]); // sp + bpf_probe_read_kernel(&user_pc_val, sizeof(user_pc_val), &raw[32]); // pc + bpf_probe_read_kernel(&user_fp_val, sizeof(user_fp_val), &raw[29]); // x29 = FP + } + else { + // Kernel mode - read user regs from kernel stack via task_pt_regs + unsigned long stack_base; + bpf_probe_read_kernel(&stack_base, sizeof(stack_base), + (void*)((unsigned long)task + STACK_OFS)); + uint64_t *user_raw = (uint64_t *)((stack_base + THREAD_SIZE) - 272); // sizeof(pt_regs)=272 + bpf_probe_read_kernel(&user_sp_val, sizeof(user_sp_val), &user_raw[31]); + bpf_probe_read_kernel(&user_pc_val, sizeof(user_pc_val), &user_raw[32]); + bpf_probe_read_kernel(&user_fp_val, sizeof(user_fp_val), &user_raw[29]); + } + + event->user_sp = user_sp_val; + event->user_ip = user_pc_val; + event->user_bp = user_fp_val; + event->user_stack_len = 0; + + // No red zone on aarch64 Linux ABI + uintptr_t top_of_stack = user_sp_val; +#endif + // Copy one page at the time - if one fails we don't want to lose the others int i; #pragma unroll diff --git a/examples/cpp/pyperf/PyPerfNativeStackTrace.cc b/examples/cpp/pyperf/PyPerfNativeStackTrace.cc index 5b1885e77fc5..b20e4e8f76bd 100644 --- a/examples/cpp/pyperf/PyPerfNativeStackTrace.cc +++ b/examples/cpp/pyperf/PyPerfNativeStackTrace.cc @@ -159,6 +159,7 @@ NativeStackTrace::NativeStackTrace(uint32_t pid, const unsigned char *raw_stack, int NativeStackTrace::UPT_access_reg(unw_addr_space_t as, unw_regnum_t regnum, unw_word_t *valp, int 
write, void *arg) { +#if defined(__x86_64__) if (regnum == UNW_X86_64_RBP) { if (write) { logInfo(2, "Libunwind attempts to write to BP\n"); @@ -168,6 +169,33 @@ int NativeStackTrace::UPT_access_reg(unw_addr_space_t as, unw_regnum_t regnum, *valp = NativeStackTrace::bp; return 0; } +#elif defined(__aarch64__) + if (regnum == UNW_AARCH64_X29) { + if (write) { + logInfo(2, "Libunwind attempts to write to X29/FP\n"); + return -UNW_EINVAL; + } + + *valp = NativeStackTrace::bp; + return 0; + } + if (regnum == UNW_AARCH64_SP || regnum == UNW_REG_SP) { + if (write) return -UNW_EINVAL; + *valp = NativeStackTrace::sp; + return 0; + } + if (regnum == UNW_AARCH64_PC) { + if (write) return -UNW_EINVAL; + *valp = NativeStackTrace::ip; + return 0; + } + if (regnum == UNW_AARCH64_X30 || regnum == UNW_REG_IP) { + // X30 = link register = return address. Use IP as best approximation. + if (write) return -UNW_EINVAL; + *valp = NativeStackTrace::ip; + return 0; + } +#endif if (regnum == UNW_REG_SP) { if (write) { logInfo(2, "Libunwind attempts to write to SP\n"); @@ -199,8 +227,12 @@ int NativeStackTrace::UPT_access_mem(unw_addr_space_t as, unw_word_t addr, return -UNW_EINVAL; } - // Subtract 128 for x86-ABI red zone + // Subtract 128 for x86-ABI red zone (no red zone on aarch64) +#if defined(__x86_64__) const uintptr_t top_of_stack = NativeStackTrace::sp - 128; +#elif defined(__aarch64__) + const uintptr_t top_of_stack = NativeStackTrace::sp; +#endif const uintptr_t stack_start = top_of_stack & ~(getpagesize() - 1); const uintptr_t stack_end = stack_start + NativeStackTrace::stack_len; diff --git a/examples/cpp/pyperf/PyPerfProfiler.cc b/examples/cpp/pyperf/PyPerfProfiler.cc index acd053c71290..de211da82cf6 100644 --- a/examples/cpp/pyperf/PyPerfProfiler.cc +++ b/examples/cpp/pyperf/PyPerfProfiler.cc @@ -167,7 +167,7 @@ void handleLostSamplesCallback(void* cb_cookie, uint64_t lost_cnt) { PyPerfProfiler::PyPerfResult PyPerfProfiler::init(unsigned int symbolsMapSize, unsigned int 
eventsBufferPages, unsigned int kernelStacksMapSize, unsigned int userStacksPages, unsigned int fsOffset, unsigned int stackOffset, - bool insertDsoName) { + bool insertDsoName, unsigned int pthreadStructSize) { std::vector<std::string> cflags; cflags.emplace_back(kNumCpusFlag + std::to_string(::sysconf(_SC_NPROCESSORS_ONLN))); cflags.emplace_back(kSymbolsHashSizeFlag + std::to_string(symbolsMapSize)); @@ -177,6 +177,9 @@ PyPerfProfiler::PyPerfResult PyPerfProfiler::init(unsigned int symbolsMapSize, u cflags.emplace_back(kGetThreadStateProgIdxFlag + std::to_string(kGetThreadStateProgIdx)); cflags.emplace_back(kFsOffsetFlag + std::to_string(fsOffset)); cflags.emplace_back(kStackOffsetFlag + std::to_string(stackOffset)); + if (pthreadStructSize > 0) { + cflags.emplace_back("-DPTHREAD_STRUCT_SIZE=" + std::to_string(pthreadStructSize)); + } if (insertDsoName) { NativeStackTrace::enable_dso_reporting(); diff --git a/examples/cpp/pyperf/PyPerfProfiler.h b/examples/cpp/pyperf/PyPerfProfiler.h index c3273172603b..c6e03ac6e6bf 100644 --- a/examples/cpp/pyperf/PyPerfProfiler.h +++ b/examples/cpp/pyperf/PyPerfProfiler.h @@ -68,7 +68,7 @@ class PyPerfProfiler { PyPerfResult init(unsigned int symbolsMapSize, unsigned int eventsBufferPages, unsigned int kernelStacksMapSize, unsigned int userStacksPages, unsigned int fsOffset, unsigned int stackOffset, - bool insertDsoName); + bool insertDsoName, unsigned int pthreadStructSize = 0); PyPerfResult profile(int64_t sampleRate, int64_t sampleFreq, int64_t duration, PyPerfSampleProcessor* processor);