Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion examples/cpp/pyperf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@ add_executable(PyPerf
PyOffsets.cc
PyPerfNativeStackTrace.cc
)
target_link_libraries(PyPerf pthread libunwind-ptrace.a libunwind-x86_64.a libunwind.a lzma)
target_link_libraries(PyPerf pthread libunwind-ptrace.a libunwind.a lzma)
# Link the architecture-specific libunwind static archive for PyPerf.
# When CMAKE_SYSTEM_PROCESSOR is exactly "aarch64" or "arm64", the AArch64
# archive is linked; every other processor value falls through to the x86_64
# archive.
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
target_link_libraries(PyPerf libunwind-aarch64.a)
else()
# Any processor other than aarch64/arm64 links the x86_64 archive.
target_link_libraries(PyPerf libunwind-x86_64.a)
endif()
if(NOT CMAKE_USE_LIBBPF_PACKAGE)
target_link_libraries(PyPerf bcc-static)
else()
Expand Down
4 changes: 3 additions & 1 deletion examples/cpp/pyperf/PyPerf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ int main(int argc, char** argv) {
std::string output = "";
uint64_t fsOffset = 0;
uint64_t stackOffset = 0;
uint64_t pthreadStructSize = 0;
bool insertDsoName = false;

while (true) {
Expand All @@ -138,6 +139,7 @@ int main(int argc, char** argv) {
found = found || parseStrArg({"-o", "--output"}, output);
found = found || parseIntArg({"--fs-offset"}, fsOffset);
found = found || parseIntArg({"--stack-offset"}, stackOffset);
found = found || parseIntArg({"--pthread-struct-size"}, pthreadStructSize);
found = found || parseFlag({"--insert-dso-name"}, insertDsoName);
if (!found) {
std::fprintf(stderr, "Unexpected argument: %s\n", argv[pos]);
Expand Down Expand Up @@ -182,7 +184,7 @@ int main(int argc, char** argv) {
ebpf::pyperf::PyPerfProfiler profiler;
profiler.update_interval = std::chrono::seconds{updateIntervalSecs};

auto res = profiler.init(symbolsMapSize, eventsBufferPages, kernelStacksMapSize, userStacksPages, fsOffset, stackOffset, insertDsoName);
auto res = profiler.init(symbolsMapSize, eventsBufferPages, kernelStacksMapSize, userStacksPages, fsOffset, stackOffset, insertDsoName, pthreadStructSize);
if (res != ebpf::pyperf::PyPerfProfiler::PyPerfResult::SUCCESS) {
std::exit((int)res);
}
Expand Down
69 changes: 67 additions & 2 deletions examples/cpp/pyperf/PyPerfBPFProgram.cc
Original file line number Diff line number Diff line change
Expand Up @@ -256,9 +256,34 @@ get_task_thread_id(struct task_struct const *task, enum pthreads_impl pthreads_i

return ERROR_NONE;

#else // __x86_64__
#elif defined(__aarch64__)
// On aarch64, the TLS base is stored in task->thread.uw.tp_value (TPIDR_EL0).
// FS_OFS here is the offset of tp_value within task_struct, discovered at runtime
// by the get_tp_offset helper.
uint64_t tp_value;
bpf_probe_read_kernel(&tp_value, sizeof(tp_value), (u8*)task + FS_OFS);

// On aarch64 glibc: pthread_self() = tp_value - sizeof(struct pthread)
// On aarch64 musl: pthread_self() = tp_value - sizeof(struct __pthread)
// PTHREAD_STRUCT_SIZE is passed as a compile-time define by the driver.
switch (pthreads_impl) {
case PTI_GLIBC:
case PTI_MUSL:
*thread_id = tp_value - PTHREAD_STRUCT_SIZE;
break;
default:
return ERROR_INVALID_PTHREADS_IMPL;
}

if (tp_value == 0) {
return ERROR_BAD_FSBASE;
}

return ERROR_NONE;

#else
#error "Unsupported platform"
#endif // __x86_64__
#endif
}

// this function is trivial, but we need to do map lookup in separate function,
Expand Down Expand Up @@ -320,6 +345,7 @@ on_event(struct pt_regs* ctx) {
// Get raw native user stack
struct pt_regs user_regs;

#ifdef __x86_64__
// ebpf doesn't allow direct access to ctx->cs, so we need to copy it
int cs;
bpf_probe_read_kernel(&cs, sizeof(cs), &(ctx->cs));
Expand Down Expand Up @@ -352,6 +378,45 @@ on_event(struct pt_regs* ctx) {
// Subtract 128 from sp for x86-ABI red zone
uintptr_t top_of_stack = user_regs.sp - 128;

#elif defined(__aarch64__)
// On aarch64, BCC's compat header defines an x86_64 struct pt_regs.
// We need to reinterpret ctx as the actual aarch64 user_pt_regs layout:
// u64 regs[31]; // offset 0, size 248
// u64 sp; // offset 248
// u64 pc; // offset 256
// u64 pstate; // offset 264
uint64_t *raw = (uint64_t *)ctx;
uint64_t user_sp_val, user_pc_val, user_fp_val, pstate;

// Read pstate (index 33 = offset 264)
bpf_probe_read_kernel(&pstate, sizeof(pstate), &raw[33]);

if ((pstate & 0xf) == 0) {
// EL0 - user mode context
bpf_probe_read_kernel(&user_sp_val, sizeof(user_sp_val), &raw[31]); // sp
bpf_probe_read_kernel(&user_pc_val, sizeof(user_pc_val), &raw[32]); // pc
bpf_probe_read_kernel(&user_fp_val, sizeof(user_fp_val), &raw[29]); // x29 = FP
}
else {
// Kernel mode - read user regs from kernel stack via task_pt_regs
unsigned long stack_base;
bpf_probe_read_kernel(&stack_base, sizeof(stack_base),
(void*)((unsigned long)task + STACK_OFS));
uint64_t *user_raw = (uint64_t *)((stack_base + THREAD_SIZE) - 272); // sizeof(pt_regs)=272
bpf_probe_read_kernel(&user_sp_val, sizeof(user_sp_val), &user_raw[31]);
bpf_probe_read_kernel(&user_pc_val, sizeof(user_pc_val), &user_raw[32]);
bpf_probe_read_kernel(&user_fp_val, sizeof(user_fp_val), &user_raw[29]);
}

event->user_sp = user_sp_val;
event->user_ip = user_pc_val;
event->user_bp = user_fp_val;
event->user_stack_len = 0;

// No red zone on aarch64 Linux ABI
uintptr_t top_of_stack = user_sp_val;
#endif

// Copy one page at a time - if one fails we don't want to lose the others
int i;
#pragma unroll
Expand Down
34 changes: 33 additions & 1 deletion examples/cpp/pyperf/PyPerfNativeStackTrace.cc
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ NativeStackTrace::NativeStackTrace(uint32_t pid, const unsigned char *raw_stack,

int NativeStackTrace::UPT_access_reg(unw_addr_space_t as, unw_regnum_t regnum,
unw_word_t *valp, int write, void *arg) {
#if defined(__x86_64__)
if (regnum == UNW_X86_64_RBP) {
if (write) {
logInfo(2, "Libunwind attempts to write to BP\n");
Expand All @@ -168,6 +169,33 @@ int NativeStackTrace::UPT_access_reg(unw_addr_space_t as, unw_regnum_t regnum,
*valp = NativeStackTrace::bp;
return 0;
}
#elif defined(__aarch64__)
if (regnum == UNW_AARCH64_X29) {
if (write) {
logInfo(2, "Libunwind attempts to write to X29/FP\n");
return -UNW_EINVAL;
}

*valp = NativeStackTrace::bp;
return 0;
}
if (regnum == UNW_AARCH64_SP || regnum == UNW_REG_SP) {
if (write) return -UNW_EINVAL;
*valp = NativeStackTrace::sp;
return 0;
}
if (regnum == UNW_AARCH64_PC) {
if (write) return -UNW_EINVAL;
*valp = NativeStackTrace::ip;
return 0;
}
if (regnum == UNW_AARCH64_X30 || regnum == UNW_REG_IP) {
// X30 = link register = return address. Use IP as best approximation.
if (write) return -UNW_EINVAL;
*valp = NativeStackTrace::ip;
return 0;
}
#endif
if (regnum == UNW_REG_SP) {
if (write) {
logInfo(2, "Libunwind attempts to write to SP\n");
Expand Down Expand Up @@ -199,8 +227,12 @@ int NativeStackTrace::UPT_access_mem(unw_addr_space_t as, unw_word_t addr,
return -UNW_EINVAL;
}

// Subtract 128 for x86-ABI red zone
// Subtract 128 for x86-ABI red zone (no red zone on aarch64)
#if defined(__x86_64__)
const uintptr_t top_of_stack = NativeStackTrace::sp - 128;
#elif defined(__aarch64__)
const uintptr_t top_of_stack = NativeStackTrace::sp;
#endif
const uintptr_t stack_start = top_of_stack & ~(getpagesize() - 1);
const uintptr_t stack_end = stack_start + NativeStackTrace::stack_len;

Expand Down
5 changes: 4 additions & 1 deletion examples/cpp/pyperf/PyPerfProfiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ void handleLostSamplesCallback(void* cb_cookie, uint64_t lost_cnt) {
PyPerfProfiler::PyPerfResult PyPerfProfiler::init(unsigned int symbolsMapSize, unsigned int eventsBufferPages,
unsigned int kernelStacksMapSize, unsigned int userStacksPages,
unsigned int fsOffset, unsigned int stackOffset,
bool insertDsoName) {
bool insertDsoName, unsigned int pthreadStructSize) {
std::vector<std::string> cflags;
cflags.emplace_back(kNumCpusFlag + std::to_string(::sysconf(_SC_NPROCESSORS_ONLN)));
cflags.emplace_back(kSymbolsHashSizeFlag + std::to_string(symbolsMapSize));
Expand All @@ -177,6 +177,9 @@ PyPerfProfiler::PyPerfResult PyPerfProfiler::init(unsigned int symbolsMapSize, u
cflags.emplace_back(kGetThreadStateProgIdxFlag + std::to_string(kGetThreadStateProgIdx));
cflags.emplace_back(kFsOffsetFlag + std::to_string(fsOffset));
cflags.emplace_back(kStackOffsetFlag + std::to_string(stackOffset));
if (pthreadStructSize > 0) {
cflags.emplace_back("-DPTHREAD_STRUCT_SIZE=" + std::to_string(pthreadStructSize));
}

if (insertDsoName) {
NativeStackTrace::enable_dso_reporting();
Expand Down
2 changes: 1 addition & 1 deletion examples/cpp/pyperf/PyPerfProfiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class PyPerfProfiler {
PyPerfResult init(unsigned int symbolsMapSize, unsigned int eventsBufferPages,
unsigned int kernelStacksMapSize, unsigned int userStacksPages,
unsigned int fsOffset, unsigned int stackOffset,
bool insertDsoName);
bool insertDsoName, unsigned int pthreadStructSize = 0);

PyPerfResult profile(int64_t sampleRate, int64_t sampleFreq, int64_t duration,
PyPerfSampleProcessor* processor);
Expand Down