From 324eedb69473c1da5ea6f3b43573795905fd91f1 Mon Sep 17 00:00:00 2001 From: Nicolas Savoire Date: Thu, 5 Feb 2026 00:26:56 +0000 Subject: [PATCH 1/3] Remove pthread_key and use initial-exec TLS instead --- CMakeLists.txt | 4 ++++ cmake/dd_profiling.version | 2 +- include/lib/allocation_tracker.hpp | 2 -- src/lib/allocation_tracker.cc | 29 ++++++++++------------------- src/lib/loader.c | 3 +++ 5 files changed, 18 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 508619fb3..7ca7f263c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -267,6 +267,10 @@ if(BUILD_UNIVERSAL_DDPROF) endif() endif() +if(USE_LOADER) + target_compile_definitions(dd_profiling-embedded PRIVATE "DDPROF_USE_LOADER") +endif() + # Fix for link error in sanitizeddebug build mode with gcc: # ~~~ # /usr/bin/ld: ./libdd_profiling.so: undefined reference to `__dynamic_cast' diff --git a/cmake/dd_profiling.version b/cmake/dd_profiling.version index d7e1730ea..988721d24 100644 --- a/cmake/dd_profiling.version +++ b/cmake/dd_profiling.version @@ -1,4 +1,4 @@ { - global: ddprof_start_profiling; ddprof_stop_profiling; + global: ddprof_start_profiling; ddprof_stop_profiling; ddprof_lib_state; local: *; }; diff --git a/include/lib/allocation_tracker.hpp b/include/lib/allocation_tracker.hpp index ef7c9705a..3fb80d01a 100644 --- a/include/lib/allocation_tracker.hpp +++ b/include/lib/allocation_tracker.hpp @@ -120,8 +120,6 @@ class AllocationTracker { static void delete_tl_state(void *tl_state); - static void make_key(); - void track_allocation(uintptr_t addr, size_t size, TrackerThreadLocalState &tl_state, bool is_large_alloc); void track_deallocation(uintptr_t addr, TrackerThreadLocalState &tl_state, diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc index e7a2981d9..41ad0a9e5 100644 --- a/src/lib/allocation_tracker.cc +++ b/src/lib/allocation_tracker.cc @@ -35,6 +35,14 @@ pthread_key_t AllocationTracker::_tl_state_key; AllocationTracker *AllocationTracker::_instance; namespace { + +#ifdef DDPROF_USE_LOADER +extern "C" + __attribute((tls_model("initial-exec"))) __thread void *ddprof_lib_state; +#else +__attribute((tls_model("initial-exec"))) __thread void *ddprof_lib_state; +#endif + DDPROF_NOINLINE auto sleep_and_retry_reserve(MPSCRingBufferWriter &writer, size_t size, bool &timeout) { constexpr std::chrono::nanoseconds k_sleep_duration = @@ -53,13 +61,7 @@ DDPROF_NOINLINE auto sleep_and_retry_reserve(MPSCRingBufferWriter &writer, } // namespace TrackerThreadLocalState *AllocationTracker::get_tl_state() { - // In shared libraries, TLS access requires a call to tls_get_addr, - // tls_get_addr can call into malloc, which can create a recursive loop - // instead we call pthread APIs to control the creation of TLS objects - pthread_once(&_key_once, make_key); - auto *tl_state = static_cast( - pthread_getspecific(_tl_state_key)); - return tl_state; + return static_cast(ddprof_lib_state); } TrackerThreadLocalState *AllocationTracker::init_tl_state() { @@ -69,13 +71,7 @@ TrackerThreadLocalState *AllocationTracker::init_tl_state() { auto tl_state = std::make_unique(); tl_state->tid = ddprof::gettid(); tl_state->stack_bounds = retrieve_stack_bounds(); - - if (int const res = pthread_setspecific(_tl_state_key, tl_state.get()); - res != 0) { - // should return 0 - LG_DBG("Unable to store tl_state. Error %d: %s\n", res, strerror(res)); - tl_state.reset(); - } + ddprof_lib_state = tl_state.get(); return tl_state.release(); } @@ -91,11 +87,6 @@ void AllocationTracker::delete_tl_state(void *tl_state) { delete static_cast(tl_state); } -void AllocationTracker::make_key() { - // delete is called on all key objects - pthread_key_create(&_tl_state_key, delete_tl_state); -} - DDRes AllocationTracker::allocation_tracking_init( uint64_t allocation_profiling_rate, uint32_t flags, uint32_t stack_sample_size, const RingBufferInfo &ring_buffer, diff --git a/src/lib/loader.c b/src/lib/loader.c index ffd6022a9..83317f0b0 100644 --- a/src/lib/loader.c +++ b/src/lib/loader.c @@ -18,6 +18,9 @@ #include #include +__attribute__((__visibility__("default"))) +__attribute__((tls_model("initial-exec"))) __thread void *ddprof_lib_state; + /* Role of loader is to ensure that all dependencies (libdl/lim/libpthread) of * libdd_profiling-embedded.so are satisfied before dlopen'ing it. * On musl, all libc features are in libc.so and hence are available once libc From 7fd9a0d5ab3cab88ab88f224c9fa9f15ce401e0e Mon Sep 17 00:00:00 2001 From: Erwan Viollet Date: Thu, 12 Feb 2026 17:20:21 +0100 Subject: [PATCH 2/3] Replace heap-allocated void* TLS with a char[] buffer using initial-exec __thread and placement new. - Add `initialized` field to TrackerThreadLocalState - Shared C header (tls_state_storage.h) for size/alignment constants between loader.c and C++ code, with static_assert - Clean up dead code: _key_once, _tl_state_key, delete_tl_state - Add standalone fork test validating TLS zero-init, fork inheritance, and fresh-thread behavior --- include/lib/allocation_tracker.hpp | 8 -- include/lib/allocation_tracker_tls.hpp | 4 + include/lib/tls_state_storage.h | 15 +++ src/lib/allocation_tracker.cc | 44 ++++---- src/lib/loader.c | 5 +- test/CMakeLists.txt | 31 ++++++ test/allocation_tracker_fork_test.cc | 145 +++++++++++++++++++++++++ 7 files changed, 221 insertions(+), 31 deletions(-) create mode 100644 include/lib/tls_state_storage.h create mode 100644 test/allocation_tracker_fork_test.cc diff --git a/include/lib/allocation_tracker.hpp b/include/lib/allocation_tracker.hpp index 3fb80d01a..208dfdcb7 100644 --- a/include/lib/allocation_tracker.hpp +++ b/include/lib/allocation_tracker.hpp @@ -16,7 +16,6 @@ #include #include #include -#include namespace ddprof { @@ -118,8 +117,6 @@ class AllocationTracker { static AllocationTracker *create_instance(); - static void delete_tl_state(void *tl_state); - void track_allocation(uintptr_t addr, size_t size, TrackerThreadLocalState &tl_state, bool is_large_alloc); void track_deallocation(uintptr_t addr, TrackerThreadLocalState &tl_state, @@ -156,11 +153,6 @@ class AllocationTracker { AddressBitset _allocated_address_set; IntervalTimerCheck _interval_timer_check; - // These can not be tied to the internal state of the instance. - // The creation of the instance depends on this - static pthread_once_t _key_once; // ensures we call key creation a single time - static pthread_key_t _tl_state_key; - static AllocationTracker *_instance; }; diff --git a/include/lib/allocation_tracker_tls.hpp b/include/lib/allocation_tracker_tls.hpp index 25ad78aeb..fb7773bba 100644 --- a/include/lib/allocation_tracker_tls.hpp +++ b/include/lib/allocation_tracker_tls.hpp @@ -25,6 +25,10 @@ struct TrackerThreadLocalState { // should not allocate because we might already // be inside an allocation) + // Set to true by placement new in init_tl_state(). + // Zero-initialized (false) in a fresh thread's TLS before init. + bool initialized{true}; + // In the choice of random generators, this one is smaller // - smaller than mt19937 (8 vs 5K) std::minstd_rand gen{std::random_device{}()}; diff --git a/include/lib/tls_state_storage.h b/include/lib/tls_state_storage.h new file mode 100644 index 000000000..eac241daf --- /dev/null +++ b/include/lib/tls_state_storage.h @@ -0,0 +1,15 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. This product includes software +// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present +// Datadog, Inc. + +#pragma once + +// C-compatible constants for the TLS buffer that stores +// TrackerThreadLocalState. Used by loader.c (C) and allocation_tracker.cc +// (C++). Correctness enforced at compile time via static_assert in +// allocation_tracker.cc. +enum { + DDPROF_TLS_STATE_SIZE = 48, + DDPROF_TLS_STATE_ALIGN = 8, +}; diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc index 41ad0a9e5..0b3212246 100644 --- a/src/lib/allocation_tracker.cc +++ b/src/lib/allocation_tracker.cc @@ -15,6 +15,7 @@ #include "ringbuffer_utils.hpp" #include "savecontext.hpp" #include "syscalls.hpp" +#include "tls_state_storage.h" #include "tsc_clock.hpp" #include @@ -23,24 +24,27 @@ #include #include #include +#include #include namespace ddprof { -// Static declarations -pthread_once_t AllocationTracker::_key_once = PTHREAD_ONCE_INIT; - -pthread_key_t AllocationTracker::_tl_state_key; - AllocationTracker *AllocationTracker::_instance; +static_assert(sizeof(TrackerThreadLocalState) == DDPROF_TLS_STATE_SIZE, + "Update DDPROF_TLS_STATE_SIZE in tls_state_storage.h"); +static_assert(alignof(TrackerThreadLocalState) <= DDPROF_TLS_STATE_ALIGN, + "Update DDPROF_TLS_STATE_ALIGN in tls_state_storage.h"); + namespace { #ifdef DDPROF_USE_LOADER -extern "C" - __attribute((tls_model("initial-exec"))) __thread void *ddprof_lib_state; +extern "C" __attribute((tls_model( + "initial-exec"))) __thread char ddprof_lib_state[DDPROF_TLS_STATE_SIZE]; #else -__attribute((tls_model("initial-exec"))) __thread void *ddprof_lib_state; +__attribute((tls_model("initial-exec"))) +__attribute((aligned(DDPROF_TLS_STATE_ALIGN))) __thread char + ddprof_lib_state[sizeof(TrackerThreadLocalState)]; #endif DDPROF_NOINLINE auto sleep_and_retry_reserve(MPSCRingBufferWriter &writer, @@ -61,19 +65,19 @@ DDPROF_NOINLINE auto sleep_and_retry_reserve(MPSCRingBufferWriter &writer, } // namespace TrackerThreadLocalState *AllocationTracker::get_tl_state() { - return static_cast(ddprof_lib_state); + // ddprof_lib_state is zero-initialized by libc for each new thread. + // After placement new (init_tl_state), initialized is set to true. + auto *state = reinterpret_cast(ddprof_lib_state); + return state->initialized ? state : nullptr; } TrackerThreadLocalState *AllocationTracker::init_tl_state() { - // Since init_tl_state is only called in allocation_tracking_init and - // notify_thread_start, there is no danger of reentering it when doing an - // allocation. - auto tl_state = std::make_unique(); - tl_state->tid = ddprof::gettid(); - tl_state->stack_bounds = retrieve_stack_bounds(); - ddprof_lib_state = tl_state.get(); - - return tl_state.release(); + // Placement new into TLS -- no heap allocation, no cleanup needed on thread + // exit. Safe to call after fork (TLS memory is inherited by child). + auto *state = new (ddprof_lib_state) TrackerThreadLocalState{}; + state->tid = ddprof::gettid(); + state->stack_bounds = retrieve_stack_bounds(); + return state; } AllocationTracker::AllocationTracker() = default; @@ -83,10 +87,6 @@ AllocationTracker *AllocationTracker::create_instance() { return &tracker; } -void AllocationTracker::delete_tl_state(void *tl_state) { - delete static_cast(tl_state); -} - DDRes AllocationTracker::allocation_tracking_init( uint64_t allocation_profiling_rate, uint32_t flags, uint32_t stack_sample_size, const RingBufferInfo &ring_buffer, diff --git a/src/lib/loader.c b/src/lib/loader.c index 83317f0b0..cd1aa9e03 100644 --- a/src/lib/loader.c +++ b/src/lib/loader.c @@ -6,6 +6,7 @@ #include "constants.hpp" #include "dd_profiling.h" #include "lib_embedded_data.h" +#include "tls_state_storage.h" #include #include @@ -19,7 +20,9 @@ #include __attribute__((__visibility__("default"))) -__attribute__((tls_model("initial-exec"))) __thread void *ddprof_lib_state; +__attribute__((tls_model("initial-exec"))) +__attribute__((aligned(DDPROF_TLS_STATE_ALIGN))) __thread char + ddprof_lib_state[DDPROF_TLS_STATE_SIZE]; /* Role of loader is to ensure that all dependencies (libdl/lim/libpthread) of * libdd_profiling-embedded.so are satisfied before dlopen'ing it. diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 01d4e6257..76d7e93db 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -393,6 +393,37 @@ add_unit_test(address_bitset-ut address_bitset-ut.cc ../src/lib/address_bitset.c add_unit_test(lib_logger-ut ./lib_logger-ut.cc) +# Standalone fork test for allocation tracker TLS (no gtest -- fork inside gtest is fragile) +add_exe( + allocation_tracker_fork_test + allocation_tracker_fork_test.cc + ../src/lib/allocation_tracker.cc + ../src/lib/address_bitset.cc + ../src/logger.cc + ../src/lib/pthread_fixes.cc + ../src/lib/savecontext.cc + ../src/lib/saveregisters.cc + ../src/procutils.cc + ../src/ratelimiter.cc + ../src/ringbuffer_utils.cc + ../src/sys_utils.cc + ../src/tsc_clock.cc + ../src/perf_clock.cc + ../src/perf.cc + ../src/ddres_list.cc + ../src/perf_ringbuffer.cc + ../src/pevent_lib.cc + ../src/user_override.cc + LIBRARIES llvm-demangle) +target_include_directories( + allocation_tracker_fork_test PRIVATE ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/include/lib + ${CMAKE_SOURCE_DIR}/src) + +add_test( + NAME allocation_tracker_fork_test + COMMAND allocation_tracker_fork_test + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + add_unit_test( create_elf-ut create_elf-ut.cc diff --git a/test/allocation_tracker_fork_test.cc b/test/allocation_tracker_fork_test.cc new file mode 100644 index 000000000..6a0ac20e8 --- /dev/null +++ b/test/allocation_tracker_fork_test.cc @@ -0,0 +1,145 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. This product includes software +// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present +// Datadog, Inc. + +// Standalone test (no gtest) to verify TLS availability across fork(). +// Gtest is avoided because forking inside gtest is fragile. + +#include "lib/allocation_tracker.hpp" +#include "loghandle.hpp" + +#include +#include +#include + +namespace ddprof { + +#define CHECK_OR_RETURN(cond, ...) \ + do { \ + if (!(cond)) { \ + LG_ERR(__VA_ARGS__); \ + return 1; \ + } \ + } while (0) + +#define CHECK_OR_EXIT(cond, ...) \ + do { \ + if (!(cond)) { \ + LG_ERR(__VA_ARGS__); \ + _exit(1); \ + } \ + } while (0) + +void *thread_check_null_tls(void *arg) { + auto *result = static_cast(arg); + auto *state = AllocationTracker::get_tl_state(); + if (state != nullptr) { + LG_ERR("new thread expected NULL TLS, got %p", static_cast(state)); + *result = 1; + return nullptr; + } + *result = 0; + return nullptr; +} + +int run_child(void *parent_state) { + // After fork, the __thread buffer is inherited: the child's main thread + // sees the initialized state at the same virtual address. + auto *child_inherited = AllocationTracker::get_tl_state(); + CHECK_OR_EXIT(child_inherited == parent_state, + "expected inherited TLS %p, got %p", parent_state, + static_cast(child_inherited)); + + // Re-init to get a fresh state for the child + auto *child_state = AllocationTracker::init_tl_state(); + CHECK_OR_EXIT(child_state != nullptr, "init_tl_state() returned NULL"); + + auto *retrieved = AllocationTracker::get_tl_state(); + CHECK_OR_EXIT(retrieved == child_state, "get/init mismatch: %p vs %p", + static_cast(retrieved), + static_cast(child_state)); + + // A new thread in the child must start with NULL TLS (zero-initialized) + pthread_t thread; + int thread_result = -1; + CHECK_OR_EXIT(pthread_create(&thread, nullptr, thread_check_null_tls, + &thread_result) == 0, + "pthread_create failed"); + pthread_join(thread, nullptr); + CHECK_OR_EXIT(thread_result == 0, "child thread TLS was not NULL"); + + _exit(0); +} + +} // namespace ddprof + +int main() { + using namespace ddprof; + LogHandle log_handle(LL_NOTICE); + LG_NTC("allocation_tracker_fork_test starting"); + + // Before any init, main thread's TLS must be zero-initialized by libc, + // so get_tl_state() should return NULL (initialized == false). + auto *pre_init = AllocationTracker::get_tl_state(); + CHECK_OR_RETURN(pre_init == nullptr, + "main thread TLS not zero-initialized before init (got %p)", + static_cast(pre_init)); + + // Verify the same zero-initialization contract on a new thread + { + pthread_t thread; + int thread_result = -1; + CHECK_OR_RETURN(pthread_create(&thread, nullptr, thread_check_null_tls, + &thread_result) == 0, + "pthread_create failed (pre-fork thread)"); + pthread_join(thread, nullptr); + CHECK_OR_RETURN(thread_result == 0, + "pre-fork thread TLS was not zero-initialized"); + } + + // Create TLS in parent + auto *parent_state = AllocationTracker::init_tl_state(); + CHECK_OR_RETURN(parent_state != nullptr, + "parent init_tl_state() returned NULL"); + + auto *retrieved = AllocationTracker::get_tl_state(); + CHECK_OR_RETURN(retrieved == parent_state, "parent get/init mismatch"); + + fflush(stdout); + fflush(stderr); + + pid_t pid = fork(); + CHECK_OR_RETURN(pid != -1, "fork failed: %s", strerror(errno)); + + if (pid == 0) { + run_child(parent_state); + _exit(0); + } + + // Parent: wait for child + int status; + waitpid(pid, &status, 0); + + if (!WIFEXITED(status)) { + if (WIFSIGNALED(status)) { + LG_ERR("child killed by signal %d", WTERMSIG(status)); + } else { + LG_ERR("child did not exit normally"); + } + return 1; + } + + int exit_code = WEXITSTATUS(status); + CHECK_OR_RETURN(exit_code == 0, "child exited with code %d", exit_code); + + // Parent TLS should be unaffected by the fork + auto *parent_after = AllocationTracker::get_tl_state(); + CHECK_OR_RETURN(parent_after == parent_state, + "parent TLS corrupted after fork (%p vs %p)", + static_cast(parent_after), + static_cast(parent_state)); + + LG_NTC("allocation_tracker_fork_test passed"); + return 0; +} From b561539103191958bbbaecb6b4683299d936a8ed Mon Sep 17 00:00:00 2001 From: Erwan Viollet Date: Mon, 23 Feb 2026 18:07:05 +0100 Subject: [PATCH 3/3] Minor linter fixes --- include/lib/tls_state_storage.h | 4 ++++ include/loghandle.hpp | 4 ++++ src/logger.cc | 12 +++--------- test/allocation_tracker_fork_test.cc | 8 +++++--- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/include/lib/tls_state_storage.h b/include/lib/tls_state_storage.h index eac241daf..e658f2e93 100644 --- a/include/lib/tls_state_storage.h +++ b/include/lib/tls_state_storage.h @@ -9,7 +9,11 @@ // TrackerThreadLocalState. Used by loader.c (C) and allocation_tracker.cc // (C++). Correctness enforced at compile time via static_assert in // allocation_tracker.cc. +#ifdef __cplusplus +enum : unsigned char { +#else enum { +#endif DDPROF_TLS_STATE_SIZE = 48, DDPROF_TLS_STATE_ALIGN = 8, }; diff --git a/include/loghandle.hpp b/include/loghandle.hpp index 21117c120..b9e11a78c 100644 --- a/include/loghandle.hpp +++ b/include/loghandle.hpp @@ -15,5 +15,9 @@ class LogHandle { LOG_setlevel(lvl); } ~LogHandle() { LOG_close(); } + LogHandle(const LogHandle &) = delete; + LogHandle &operator=(const LogHandle &) = delete; + LogHandle(LogHandle &&) = delete; + LogHandle &operator=(LogHandle &&) = delete; }; } // namespace ddprof diff --git a/src/logger.cc b/src/logger.cc index a983786f6..9d85fbb63 100644 --- a/src/logger.cc +++ b/src/logger.cc @@ -190,15 +190,9 @@ void vlprintfln(int lvl, int fac, const char *format, va_list args) { "<%d>%s.%06ld %s[%d]: ", lvl + (fac * LL_LENGTH), tm_str, d_us.count(), name, pid); } else { - const char *levels[LL_LENGTH] = { - [LL_EMERGENCY] = "EMERGENCY", - [LL_ALERT] = "ALERT", - [LL_CRITICAL] = "CRITICAL", - [LL_ERROR] = "ERROR", - [LL_WARNING] = "WARNING", - [LL_NOTICE] = "NOTICE", - [LL_INFORMATIONAL] = "INFORMATIONAL", - [LL_DEBUG] = "DEBUG", + static constexpr const char *levels[] = { + "EMERGENCY", "ALERT", "CRITICAL", "ERROR", + "WARNING", "NOTICE", "INFORMATIONAL", "DEBUG", }; sz_h = snprintf(buf, LOG_MSG_CAP, "<%s>%s.%06lu %s[%d]: ", levels[lvl], tm_str, d_us.count(), name, pid); diff --git a/test/allocation_tracker_fork_test.cc b/test/allocation_tracker_fork_test.cc index 6a0ac20e8..2bebdfcd0 100644 --- a/test/allocation_tracker_fork_test.cc +++ b/test/allocation_tracker_fork_test.cc @@ -14,6 +14,7 @@ #include namespace ddprof { +namespace { #define CHECK_OR_RETURN(cond, ...) \ do { \ @@ -72,11 +73,12 @@ int run_child(void *parent_state) { _exit(0); } +} // namespace } // namespace ddprof int main() { using namespace ddprof; - LogHandle log_handle(LL_NOTICE); + const LogHandle log_handle(LL_NOTICE); LG_NTC("allocation_tracker_fork_test starting"); // Before any init, main thread's TLS must be zero-initialized by libc, @@ -109,7 +111,7 @@ int main() { fflush(stdout); fflush(stderr); - pid_t pid = fork(); + const pid_t pid = fork(); CHECK_OR_RETURN(pid != -1, "fork failed: %s", strerror(errno)); if (pid == 0) { @@ -130,7 +132,7 @@ int main() { return 1; } - int exit_code = WEXITSTATUS(status); + const int exit_code = WEXITSTATUS(status); CHECK_OR_RETURN(exit_code == 0, "child exited with code %d", exit_code); // Parent TLS should be unaffected by the fork