ValeevGroup · evaleev · May 21, 2026 · May 21, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -159,6 +159,26 @@ add_feature_info(SIGNED_1INDEX_TYPE TA_SIGNED_1INDEX_TYPE "Use of signed 1-index
 # define this as needed
 set(TA_MAX_SOO_RANK_METADATA 8 CACHE STRING "Specifies the max rank for which small object optimization will be used (hence, heap use avoided) for metadata objects")
 
+# Alignment, in bytes, of element storage inside an ArenaTensor cell. Must be a
+# power of two and at least sizeof(void*). Default 32 covers AVX2 YMM; raise to
+# 64 for AVX-512, drop to 16 for NEON-only / Apple Silicon, raise to 128 for an
+# Apple-Silicon L1 cache-line floor. See src/TiledArray/tensor/arena_tensor.h.
+set(TA_ARENATENSOR_SIMD_ALIGN 32 CACHE STRING "Alignment (B) of in-arena element storage for ArenaTensor; power of two, default 32 covers AVX2 (set 64 for AVX-512, 16 for NEON-only, 128 for Apple-Silicon cache-line floor)")
+# Validate at configure time: config.h.in emits this value via #cmakedefine,
+# which leaves the macro undefined for false-like values (0, OFF, empty), and
+# any other bad value would only surface as a compile error in arena_tensor.h.
+if(NOT "${TA_ARENATENSOR_SIMD_ALIGN}" MATCHES "^[1-9][0-9]*$")
+  message(FATAL_ERROR "TA_ARENATENSOR_SIMD_ALIGN must be a positive decimal integer (got \"${TA_ARENATENSOR_SIMD_ALIGN}\")")
+endif()
+math(EXPR ta_arenatensor_simd_align_low_bits "${TA_ARENATENSOR_SIMD_ALIGN} & (${TA_ARENATENSOR_SIMD_ALIGN} - 1)")
+if(NOT ta_arenatensor_simd_align_low_bits EQUAL 0)
+  message(FATAL_ERROR "TA_ARENATENSOR_SIMD_ALIGN must be a power of two (got ${TA_ARENATENSOR_SIMD_ALIGN})")
+endif()
+unset(ta_arenatensor_simd_align_low_bits)
+if(CMAKE_SIZEOF_VOID_P AND TA_ARENATENSOR_SIMD_ALIGN LESS CMAKE_SIZEOF_VOID_P)
+  message(FATAL_ERROR "TA_ARENATENSOR_SIMD_ALIGN must be at least sizeof(void*) = ${CMAKE_SIZEOF_VOID_P} (got ${TA_ARENATENSOR_SIMD_ALIGN})")
+endif()
+
 option(TA_TRACE_TASKS "Enable debug tracing of MADNESS tasks in (some components of) TiledArray" OFF)
 add_feature_info(TASK_TRACE_DEBUG TA_TRACE_TASKS "Debug tracing of MADNESS tasks in (some components of) TiledArray")
 set(TILEDARRAY_ENABLE_TASK_DEBUG_TRACE ${TA_TRACE_TASKS})

diff --git a/INSTALL.md b/INSTALL.md
@@ -431,6 +431,7 @@ support may be added.
 * `TA_WERROR` -- Set to `ON` to treat compiler warnings as errors when compiling TiledArray's own translation units (the `tiledarray` library and in-tree tests/examples). Also implies `MADNESS_WERROR=ON` for the MADworld translation units built as part of TA's FetchContent tree. Does **not** propagate to consumers of the installed `tiledarray` target (i.e. `find_package(tiledarray)` users do not inherit `-Werror`). Honored on GNU/Clang/AppleClang/IntelLLVM. [Default=OFF].
 * `TA_SIGNED_1INDEX_TYPE` -- Set to `OFF` to use unsigned 1-index coordinate type (default for TiledArray 1.0.0-alpha.2 and older). The default is `ON`, which enables the use of negative indices in coordinates.
 * `TA_MAX_SOO_RANK_METADATA` -- Specifies the maximum rank for which to use Small Object Optimization (hence, avoid the use of the heap) for metadata. The default is `8`.
+* `TA_ARENATENSOR_SIMD_ALIGN` -- Alignment (in bytes) of element storage inside an `ArenaTensor` cell. Must be a power of two and at least `sizeof(void*)`. The default is `32` (covers AVX2 YMM). Set to `64` for AVX-512 ZMM (also matches the x86_64 cache line), `16` for NEON-only targets such as Apple Silicon (NEON has no wider register and Apple Silicon does not implement SVE), or `128` for a two-cache-line / Apple-Silicon L1-line floor. Each `ArenaTensor` cell pads from `sizeof(Cell)` up to this alignment before its element storage, so lowering the value cuts per-cell padding at the cost of narrower vectorized loads/stores.
 * `TA_TENSOR_MEM_PROFILE` -- Set to `ON` to profile host memory allocations used by TA::Tensor. This causes the use of Umpire for host memory allocation. This also enables additional tracing facilities provided by Umpire; these can be controlled via [environment variable `UMPIRE_LOG_LEVEL`](https://umpire.readthedocs.io/en/develop/sphinx/features/logging_and_replay.html), but note that the default is to log Umpire info into a file rather than stdout.
 * `TA_TENSOR_MEM_TRACE` -- Set to `ON` to *trace* host memory allocations used by TA::Tensor. This turns on support for tracking memory used by `Tensor` objects; such tracking must be enabled programmatically. This can greatly increase memory consumption by the application and is only intended for expert developers troubleshooting memory use by TiledArray.
 * `TA_UT_CTEST_TIMEOUT` -- The value (in seconds) of the timeout to use for running the TA unit tests via CTest when building the `check`/`check-tiledarray` targets. The default timeout is 1500s.

diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in
@@ -172,6 +172,9 @@
 /* Specifies the rank for up to which to use Small Object Optimization for metadata (e.g. Range, Range::index, etc.) */
 #cmakedefine TA_MAX_SOO_RANK_METADATA @TA_MAX_SOO_RANK_METADATA@
 
+/* Alignment (in bytes; must be a power of two) of element storage inside an ArenaTensor cell; set via the TA_ARENATENSOR_SIMD_ALIGN CMake cache variable; see src/TiledArray/tensor/arena_tensor.h */
+#cmakedefine TA_ARENATENSOR_SIMD_ALIGN @TA_ARENATENSOR_SIMD_ALIGN@
+
 /* Enables tracing MADNESS tasks in TiledArray */
 #cmakedefine TILEDARRAY_ENABLE_TASK_DEBUG_TRACE 1
 

diff --git a/src/TiledArray/tensor/arena_tensor.h b/src/TiledArray/tensor/arena_tensor.h
@@ -26,18 +26,34 @@
 
 namespace TiledArray {
 
-/// Alignment of in-arena element storage, in bytes. Sized to cover the
-/// widest common SIMD register (AVX-512 ZMM = 64 B) and a single x86_64
-/// cache line. Override at configure time by defining
-/// TILEDARRAY_INNER_SIMD_ALIGN to a larger power-of-two (e.g. 128 for
-/// two-cache-line floor / Apple-Silicon L1 line size).
-#ifndef TILEDARRAY_INNER_SIMD_ALIGN
-#define TILEDARRAY_INNER_SIMD_ALIGN 64
-#endif
-
-inline constexpr std::size_t kInnerSimdAlign = TILEDARRAY_INNER_SIMD_ALIGN;
-static_assert((kInnerSimdAlign & (kInnerSimdAlign - 1)) == 0,
-              "kInnerSimdAlign must be a power of two");
+// TA_ARENATENSOR_SIMD_ALIGN comes from TiledArray/config.h; its #cmakedefine
+// leaves the macro undefined when the CMake value is false-like (0, empty).
+#ifndef TA_ARENATENSOR_SIMD_ALIGN
+#error "TA_ARENATENSOR_SIMD_ALIGN must be defined (nonzero power of two)"
+#endif
+
+/// Alignment of in-arena element storage, in bytes. Supplied via CMake
+/// (cache variable `TA_ARENATENSOR_SIMD_ALIGN`, propagated through
+/// `TiledArray/config.h`). The default (32 B) matches the SSE/AVX/AVX2
+/// family — AVX2's 256-bit YMM registers being the most common x86_64
+/// SIMD target today. Override at configure time with
+/// `-DTA_ARENATENSOR_SIMD_ALIGN=<N>` for another power of two:
+///   - 64  for AVX-512 ZMM (also matches an x86_64 cache line);
+///   - 16  for NEON-only targets — NEON has no wider register (Apple
+///         Silicon does not implement SVE), so 16 is sufficient there;
+///   - 128 for a two-cache-line / Apple-Silicon L1-line floor (useful
+///         only if cells need that as a false-sharing boundary).
+/// The value must be a nonzero power of two no smaller than `sizeof(void*)`;
+/// both constraints are enforced below at compile time.
+/// Each ArenaTensor cell pads from `sizeof(Cell)` up to this alignment
+/// before its element storage, so lowering the value cuts per-cell
+/// padding at the cost of narrower vectorized loads/stores.
+inline constexpr std::size_t kArenaTensorSimdAlign = TA_ARENATENSOR_SIMD_ALIGN;
+static_assert(kArenaTensorSimdAlign != 0 &&
+                  (kArenaTensorSimdAlign & (kArenaTensorSimdAlign - 1)) == 0,
+              "TA_ARENATENSOR_SIMD_ALIGN must be a nonzero power of two");
+static_assert(kArenaTensorSimdAlign >= sizeof(void*),
+              "TA_ARENATENSOR_SIMD_ALIGN must be at least sizeof(void*)");
 
 template <typename T, typename Range_ = ::btas::zb::RangeNd<>>
 class ArenaTensor;
@@ -79,7 +95,8 @@ class ArenaTensor {
   /// arena slots must honour this so SIMD loads/stores on `data()` are
   /// aligned without an extra runtime check.
   static constexpr size_type data_alignment() noexcept {
-    return alignof(T) > kInnerSimdAlign ? alignof(T) : kInnerSimdAlign;
+    return alignof(T) > kArenaTensorSimdAlign ? alignof(T)
+                                              : kArenaTensorSimdAlign;
   }
 
   /// Offset (in bytes) of the first element past the cell header.

diff --git a/tests/arena_tensor.cpp b/tests/arena_tensor.cpp
@@ -69,16 +69,16 @@ BOOST_AUTO_TEST_CASE(sizeof_invariant_across_range_parameterizations) {
 }
 
 BOOST_AUTO_TEST_CASE(element_data_is_simd_aligned) {
-  // data_alignment() should be at least kInnerSimdAlign; cell_alignment()
+  // data_alignment() should be at least kArenaTensorSimdAlign; cell_alignment()
   // should propagate that so the element pointer is SIMD-aligned.
-  BOOST_CHECK(Inner::data_alignment() >= TA::kInnerSimdAlign);
-  BOOST_CHECK_EQUAL(Inner::data_alignment() % TA::kInnerSimdAlign, 0u);
+  BOOST_CHECK(Inner::data_alignment() >= TA::kArenaTensorSimdAlign);
+  BOOST_CHECK_EQUAL(Inner::data_alignment() % TA::kArenaTensorSimdAlign, 0u);
   BOOST_CHECK(Inner::cell_alignment() >= Inner::data_alignment());
   CellBuf buf(8);
   Inner x =
       TA::detail::make_arena_tensor_in<double>(buf.aligned_ptr, TA::Range{8});
   auto addr = reinterpret_cast<std::uintptr_t>(x.data());
-  BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
+  BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u);
 }
 
 BOOST_AUTO_TEST_CASE(default_constructed_is_null) {

diff --git a/tests/arena_tensor_kernels.cpp b/tests/arena_tensor_kernels.cpp
@@ -31,7 +31,7 @@ BOOST_AUTO_TEST_CASE(builds_outer_with_uniform_inners) {
     BOOST_CHECK(bool(inner));
     BOOST_CHECK_EQUAL(inner.size(), 8u);
     auto addr = reinterpret_cast<std::uintptr_t>(inner.data());
-    BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
+    BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u);
   }
 }
 
@@ -106,7 +106,7 @@ BOOST_AUTO_TEST_CASE(jagged_inner_shapes_round_trip) {
       BOOST_REQUIRE(bool(inner));
       BOOST_CHECK_EQUAL(inner.size(), static_cast<std::size_t>(sizes[ord]));
       auto addr = reinterpret_cast<std::uintptr_t>(inner.data());
-      BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
+      BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u);
     }
   }
 }
@@ -320,7 +320,7 @@ BOOST_AUTO_TEST_CASE(contraction_arena_plan_reserve_and_construct_inner) {
     BOOST_REQUIRE(bool(inner));
     BOOST_CHECK_EQUAL(inner.size(), 24u);
     auto addr = reinterpret_cast<std::uintptr_t>(inner.data());
-    BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
+    BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u);
   }
 }
 
@@ -441,7 +441,7 @@ BOOST_AUTO_TEST_CASE(outer_tile_serialize_round_trip_arena_tensor) {
     // The loaded cell's data pointer is SIMD-aligned via
     // arena_outer_init.
     auto addr = reinterpret_cast<std::uintptr_t>(d.data());
-    BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
+    BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u);
   }
 }