diff --git a/CMakeLists.txt b/CMakeLists.txt index 551d9884ff..31e4838f31 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -159,6 +159,12 @@ add_feature_info(SIGNED_1INDEX_TYPE TA_SIGNED_1INDEX_TYPE "Use of signed 1-index # define this as needed set(TA_MAX_SOO_RANK_METADATA 8 CACHE STRING "Specifies the max rank for which small object optimization will be used (hence, heap use avoided) for metadata objects") +# Alignment, in bytes, of element storage inside an ArenaTensor cell. Must be a +# power of two and at least sizeof(void*). Default 32 covers AVX2 YMM; raise to +# 64 for AVX-512, drop to 16 for NEON-only / Apple Silicon, raise to 128 for an +# Apple-Silicon L1 cache-line floor. See src/TiledArray/tensor/arena_tensor.h. +set(TA_ARENATENSOR_SIMD_ALIGN 32 CACHE STRING "Alignment (B) of in-arena element storage for ArenaTensor; power of two, default 32 covers AVX2 (set 64 for AVX-512, 16 for NEON-only, 128 for Apple-Silicon cache-line floor)") + option(TA_TRACE_TASKS "Enable debug tracing of MADNESS tasks in (some components of) TiledArray" OFF) add_feature_info(TASK_TRACE_DEBUG TA_TRACE_TASKS "Debug tracing of MADNESS tasks in (some components of) TiledArray") set(TILEDARRAY_ENABLE_TASK_DEBUG_TRACE ${TA_TRACE_TASKS}) diff --git a/INSTALL.md b/INSTALL.md index 1345220cf6..928eba7fa8 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -431,6 +431,7 @@ support may be added. * `TA_WERROR` -- Set to `ON` to treat compiler warnings as errors when compiling TiledArray's own translation units (the `tiledarray` library and in-tree tests/examples). Also implies `MADNESS_WERROR=ON` for the MADworld translation units built as part of TA's FetchContent tree. Does **not** propagate to consumers of the installed `tiledarray` target (i.e. `find_package(tiledarray)` users do not inherit `-Werror`). Honored on GNU/Clang/AppleClang/IntelLLVM. [Default=OFF]. 
* `TA_SIGNED_1INDEX_TYPE` -- Set to `OFF` to use unsigned 1-index coordinate type (default for TiledArray 1.0.0-alpha.2 and older). The default is `ON`, which enables the use of negative indices in coordinates. * `TA_MAX_SOO_RANK_METADATA` -- Specifies the maximum rank for which to use Small Object Optimization (hence, avoid the use of the heap) for metadata. The default is `8`. +* `TA_ARENATENSOR_SIMD_ALIGN` -- Alignment (in bytes) of element storage inside an `ArenaTensor` cell. Must be a power of two and at least `sizeof(void*)`. The default is `32` (covers AVX2 YMM). Set to `64` for AVX-512 ZMM (also matches the x86_64 cache line), `16` for NEON-only / Apple Silicon (NEON has no wider register and Apple Silicon does not implement SVE), or `128` for a two-cache-line / Apple-Silicon L1-line floor. Each `ArenaTensor` cell pads from `sizeof(Cell)` up to this alignment before its element storage, so lowering the value cuts per-cell padding at the cost of narrower vectorized loads/stores. * `TA_TENSOR_MEM_PROFILE` -- Set to `ON` to profile host memory allocations used by TA::Tensor. This causes the use of Umpire for host memory allocation. This also enables additional tracing facilities provided by Umpire; these can be controlled via [environment variable `UMPIRE_LOG_LEVEL`](https://umpire.readthedocs.io/en/develop/sphinx/features/logging_and_replay.html), but note that the default is to log Umpire info into a file rather than stdout. * `TA_TENSOR_MEM_TRACE` -- Set to `ON` to *trace* host memory allocations used by TA::Tensor. This turns on support for tracking memory used by `Tensor` objects; such tracking must be enabled programmatically. This can greatly increase memory consumption by the application and is only intended for expert developers troubleshooting memory use by TiledArray. * `TA_UT_CTEST_TIMEOUT` -- The value (in seconds) of the timeout to use for running the TA unit tests via CTest when building the `check`/`check-tiledarray` targets. The default timeout is 1500s. 
diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in index f0399b5b01..460521ecae 100644 --- a/src/TiledArray/config.h.in +++ b/src/TiledArray/config.h.in @@ -172,6 +172,9 @@ /* Specifies the rank for up to which to use Small Object Optimization for metadata (e.g. Range, Range::index, etc.) */ #cmakedefine TA_MAX_SOO_RANK_METADATA @TA_MAX_SOO_RANK_METADATA@ +/* Alignment (in bytes) of element storage inside an ArenaTensor cell; see src/TiledArray/tensor/arena_tensor.h */ +#cmakedefine TA_ARENATENSOR_SIMD_ALIGN @TA_ARENATENSOR_SIMD_ALIGN@ + /* Enables tracing MADNESS tasks in TiledArray */ #cmakedefine TILEDARRAY_ENABLE_TASK_DEBUG_TRACE 1 diff --git a/src/TiledArray/tensor/arena_tensor.h b/src/TiledArray/tensor/arena_tensor.h index b4c3d4959e..473c342125 100644 --- a/src/TiledArray/tensor/arena_tensor.h +++ b/src/TiledArray/tensor/arena_tensor.h @@ -26,18 +26,34 @@ namespace TiledArray { -/// Alignment of in-arena element storage, in bytes. Sized to cover the -/// widest common SIMD register (AVX-512 ZMM = 64 B) and a single x86_64 -/// cache line. Override at configure time by defining -/// TILEDARRAY_INNER_SIMD_ALIGN to a larger power-of-two (e.g. 128 for -/// two-cache-line floor / Apple-Silicon L1 line size). -#ifndef TILEDARRAY_INNER_SIMD_ALIGN -#define TILEDARRAY_INNER_SIMD_ALIGN 64 -#endif - -inline constexpr std::size_t kInnerSimdAlign = TILEDARRAY_INNER_SIMD_ALIGN; -static_assert((kInnerSimdAlign & (kInnerSimdAlign - 1)) == 0, - "kInnerSimdAlign must be a power of two"); +// `#cmakedefine` leaves TA_ARENATENSOR_SIMD_ALIGN undefined when the CMake +// variable is unset or false-like (0, OFF, empty); fail with a clear message. +#ifndef TA_ARENATENSOR_SIMD_ALIGN +#error "TA_ARENATENSOR_SIMD_ALIGN is undefined: configure with -DTA_ARENATENSOR_SIMD_ALIGN=<N>, where N is a nonzero power of two" +#endif + +/// Alignment of in-arena element storage, in bytes. Supplied via CMake +/// (cache variable `TA_ARENATENSOR_SIMD_ALIGN`, propagated through +/// `TiledArray/config.h`). The default (32 B) matches the SSE/AVX/AVX2 +/// family — AVX2's 256-bit YMM registers being the most common x86_64 +/// SIMD target today. 
Override at configure time with +/// `-DTA_ARENATENSOR_SIMD_ALIGN=<N>` for another power of two: +/// - 64 for AVX-512 ZMM (also matches an x86_64 cache line); +/// - 16 for NEON-only targets — NEON has no wider register (Apple +/// Silicon does not implement SVE), so 16 is sufficient there; +/// - 128 for a two-cache-line / Apple-Silicon L1-line floor (useful +/// only if cells need that as a false-sharing boundary). +/// Each ArenaTensor cell pads from `sizeof(Cell)` up to this alignment +/// before its element storage, so lowering the value cuts per-cell +/// padding at the cost of narrower vectorized loads/stores. +/// The value must be a nonzero power of two and at least `sizeof(void*)`; +/// both requirements are checked at compile time below. +inline constexpr std::size_t kArenaTensorSimdAlign = TA_ARENATENSOR_SIMD_ALIGN; +static_assert(kArenaTensorSimdAlign != 0 && + (kArenaTensorSimdAlign & (kArenaTensorSimdAlign - 1)) == 0, + "TA_ARENATENSOR_SIMD_ALIGN must be a nonzero power of two"); +static_assert(kArenaTensorSimdAlign >= sizeof(void*), + "TA_ARENATENSOR_SIMD_ALIGN must be at least sizeof(void*)"); template > class ArenaTensor; @@ -79,7 +95,8 @@ class ArenaTensor { /// arena slots must honour this so SIMD loads/stores on `data()` are /// aligned without an extra runtime check. static constexpr size_type data_alignment() noexcept { - return alignof(T) > kInnerSimdAlign ? alignof(T) : kInnerSimdAlign; + return alignof(T) > kArenaTensorSimdAlign ? alignof(T) + : kArenaTensorSimdAlign; } /// Offset (in bytes) of the first element past the cell header. diff --git a/tests/arena_tensor.cpp b/tests/arena_tensor.cpp index 9b47e1116f..cd9c792970 100644 --- a/tests/arena_tensor.cpp +++ b/tests/arena_tensor.cpp @@ -69,16 +69,16 @@ BOOST_AUTO_TEST_CASE(sizeof_invariant_across_range_parameterizations) { } BOOST_AUTO_TEST_CASE(element_data_is_simd_aligned) { - // data_alignment() should be at least kInnerSimdAlign; cell_alignment() + // data_alignment() should be at least kArenaTensorSimdAlign; cell_alignment() // should propagate that so the element pointer is SIMD-aligned. 
- BOOST_CHECK(Inner::data_alignment() >= TA::kInnerSimdAlign); - BOOST_CHECK_EQUAL(Inner::data_alignment() % TA::kInnerSimdAlign, 0u); + BOOST_CHECK(Inner::data_alignment() >= TA::kArenaTensorSimdAlign); + BOOST_CHECK_EQUAL(Inner::data_alignment() % TA::kArenaTensorSimdAlign, 0u); BOOST_CHECK(Inner::cell_alignment() >= Inner::data_alignment()); CellBuf buf(8); Inner x = TA::detail::make_arena_tensor_in(buf.aligned_ptr, TA::Range{8}); auto addr = reinterpret_cast(x.data()); - BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u); + BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u); } BOOST_AUTO_TEST_CASE(default_constructed_is_null) { diff --git a/tests/arena_tensor_kernels.cpp b/tests/arena_tensor_kernels.cpp index 2e501c66db..09eb0aa232 100644 --- a/tests/arena_tensor_kernels.cpp +++ b/tests/arena_tensor_kernels.cpp @@ -31,7 +31,7 @@ BOOST_AUTO_TEST_CASE(builds_outer_with_uniform_inners) { BOOST_CHECK(bool(inner)); BOOST_CHECK_EQUAL(inner.size(), 8u); auto addr = reinterpret_cast(inner.data()); - BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u); + BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u); } } @@ -106,7 +106,7 @@ BOOST_AUTO_TEST_CASE(jagged_inner_shapes_round_trip) { BOOST_REQUIRE(bool(inner)); BOOST_CHECK_EQUAL(inner.size(), static_cast(sizes[ord])); auto addr = reinterpret_cast(inner.data()); - BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u); + BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u); } } } @@ -320,7 +320,7 @@ BOOST_AUTO_TEST_CASE(contraction_arena_plan_reserve_and_construct_inner) { BOOST_REQUIRE(bool(inner)); BOOST_CHECK_EQUAL(inner.size(), 24u); auto addr = reinterpret_cast(inner.data()); - BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u); + BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u); } } @@ -441,7 +441,7 @@ BOOST_AUTO_TEST_CASE(outer_tile_serialize_round_trip_arena_tensor) { // The loaded cell's data pointer is SIMD-aligned via // arena_outer_init. 
auto addr = reinterpret_cast(d.data()); - BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u); + BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u); } }