Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,12 @@ add_feature_info(SIGNED_1INDEX_TYPE TA_SIGNED_1INDEX_TYPE "Use of signed 1-index
# define this as needed
set(TA_MAX_SOO_RANK_METADATA 8 CACHE STRING "Specifies the max rank for which small object optimization will be used (hence, heap use avoided) for metadata objects")

# Alignment, in bytes, of element storage inside an ArenaTensor cell. Must be a
# power of two and at least sizeof(void*). Default 32 covers AVX2 YMM; raise to
# 64 for AVX-512, drop to 16 for NEON-only / Apple Silicon, raise to 128 for an
# Apple-Silicon L1 cache-line floor. See src/TiledArray/tensor/arena_tensor.h.
set(TA_ARENATENSOR_SIMD_ALIGN 32 CACHE STRING "Alignment (B) of in-arena element storage for ArenaTensor; power of two, default 32 covers AVX2 (set 64 for AVX-512, 16 for NEON-only, 128 for Apple-Silicon cache-line floor)")

option(TA_TRACE_TASKS "Enable debug tracing of MADNESS tasks in (some components of) TiledArray" OFF)
add_feature_info(TASK_TRACE_DEBUG TA_TRACE_TASKS "Debug tracing of MADNESS tasks in (some components of) TiledArray")
set(TILEDARRAY_ENABLE_TASK_DEBUG_TRACE ${TA_TRACE_TASKS})
Expand Down
1 change: 1 addition & 0 deletions INSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,7 @@ support may be added.
* `TA_WERROR` -- Set to `ON` to treat compiler warnings as errors when compiling TiledArray's own translation units (the `tiledarray` library and in-tree tests/examples). Also implies `MADNESS_WERROR=ON` for the MADworld translation units built as part of TA's FetchContent tree. Does **not** propagate to consumers of the installed `tiledarray` target (i.e. `find_package(tiledarray)` users do not inherit `-Werror`). Honored on GNU/Clang/AppleClang/IntelLLVM. [Default=OFF].
* `TA_SIGNED_1INDEX_TYPE` -- Set to `OFF` to use unsigned 1-index coordinate type (default for TiledArray 1.0.0-alpha.2 and older). The default is `ON`, which enables the use of negative indices in coordinates.
* `TA_MAX_SOO_RANK_METADATA` -- Specifies the maximum rank for which to use Small Object Optimization (hence, avoid the use of the heap) for metadata. The default is `8`.
* `TA_ARENATENSOR_SIMD_ALIGN` -- Alignment (in bytes) of element storage inside an `ArenaTensor` cell. Must be a power of two. The default is `32` (covers AVX2 YMM). Set to `64` for AVX-512 ZMM (also matches the x86_64 cache line), `16` for NEON-only / Apple Silicon (NEON has no wider register and Apple Silicon does not implement SVE), or `128` for a two-cache-line / Apple-Silicon L1-line floor. Each `ArenaTensor` cell pads from `sizeof(Cell)` up to this alignment before its element storage, so lowering the value cuts per-cell padding at the cost of narrower vectorized loads/stores.
* `TA_TENSOR_MEM_PROFILE` -- Set to `ON` to profile host memory allocations used by TA::Tensor. This causes the use of Umpire for host memory allocation. This also enables additional tracing facilities provided by Umpire; these can be controlled via [environment variable `UMPIRE_LOG_LEVEL`](https://umpire.readthedocs.io/en/develop/sphinx/features/logging_and_replay.html), but note that the default is to log Umpire info into a file rather than stdout.
* `TA_TENSOR_MEM_TRACE` -- Set to `ON` to *trace* host memory allocations used by TA::Tensor. This turns on support for tracking memory used by `Tensor` objects; such tracking must be enabled programmatically. This can greatly increase memory consumption by the application and is only intended for expert developers troubleshooting memory use by TiledArray.
* `TA_UT_CTEST_TIMEOUT` -- The value (in seconds) of the timeout to use for running the TA unit tests via CTest when building the `check`/`check-tiledarray` targets. The default timeout is 1500s.
Expand Down
3 changes: 3 additions & 0 deletions src/TiledArray/config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,9 @@
/* Specifies the rank for up to which to use Small Object Optimization for metadata (e.g. Range, Range::index, etc.) */
#cmakedefine TA_MAX_SOO_RANK_METADATA @TA_MAX_SOO_RANK_METADATA@

/* Alignment (in bytes) of element storage inside an ArenaTensor cell; see src/TiledArray/tensor/arena_tensor.h */
#cmakedefine TA_ARENATENSOR_SIMD_ALIGN @TA_ARENATENSOR_SIMD_ALIGN@

/* Enables tracing MADNESS tasks in TiledArray */
#cmakedefine TILEDARRAY_ENABLE_TASK_DEBUG_TRACE 1

Expand Down
32 changes: 19 additions & 13 deletions src/TiledArray/tensor/arena_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,23 @@

namespace TiledArray {

/// Alignment of in-arena element storage, in bytes. Sized to cover the
/// widest common SIMD register (AVX-512 ZMM = 64 B) and a single x86_64
/// cache line. Override at configure time by defining
/// TILEDARRAY_INNER_SIMD_ALIGN to a larger power-of-two (e.g. 128 for
/// two-cache-line floor / Apple-Silicon L1 line size).
#ifndef TILEDARRAY_INNER_SIMD_ALIGN
#define TILEDARRAY_INNER_SIMD_ALIGN 64
#endif

inline constexpr std::size_t kInnerSimdAlign = TILEDARRAY_INNER_SIMD_ALIGN;
static_assert((kInnerSimdAlign & (kInnerSimdAlign - 1)) == 0,
"kInnerSimdAlign must be a power of two");
/// Alignment of in-arena element storage, in bytes. Supplied via CMake
/// (cache variable `TA_ARENATENSOR_SIMD_ALIGN`, propagated through
/// `TiledArray/config.h`). The default (32 B) covers the SSE/AVX/AVX2
/// family — AVX2's 256-bit YMM registers being the most common x86_64
/// SIMD target today. Override at configure time with
/// `-DTA_ARENATENSOR_SIMD_ALIGN=<N>` for another power of two:
/// - 64 for AVX-512 ZMM (also matches an x86_64 cache line);
/// - 16 for NEON-only targets — NEON has no wider register (Apple
/// Silicon does not implement SVE), so 16 is sufficient there;
/// - 128 for a two-cache-line / Apple-Silicon L1-line floor (useful
/// only if cells need that as a false-sharing boundary).
/// Each ArenaTensor cell pads from `sizeof(Cell)` up to this alignment
/// before its element storage, so lowering the value cuts per-cell
/// padding at the cost of narrower vectorized loads/stores.
inline constexpr std::size_t kArenaTensorSimdAlign = TA_ARENATENSOR_SIMD_ALIGN;
// The bit trick `(x & (x - 1)) == 0` also holds for x == 0, which is not a
// power of two and is not a usable alignment, so reject zero explicitly.
static_assert(kArenaTensorSimdAlign != 0 &&
                  (kArenaTensorSimdAlign & (kArenaTensorSimdAlign - 1)) == 0,
              "TA_ARENATENSOR_SIMD_ALIGN must be a power of two");

template <typename T, typename Range_ = ::btas::zb::RangeNd<>>
class ArenaTensor;
Expand Down Expand Up @@ -79,7 +84,8 @@ class ArenaTensor {
/// arena slots must honour this so SIMD loads/stores on `data()` are
/// aligned without an extra runtime check.
static constexpr size_type data_alignment() noexcept {
return alignof(T) > kInnerSimdAlign ? alignof(T) : kInnerSimdAlign;
return alignof(T) > kArenaTensorSimdAlign ? alignof(T)
: kArenaTensorSimdAlign;
}

/// Offset (in bytes) of the first element past the cell header.
Expand Down
8 changes: 4 additions & 4 deletions tests/arena_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,16 @@ BOOST_AUTO_TEST_CASE(sizeof_invariant_across_range_parameterizations) {
}

BOOST_AUTO_TEST_CASE(element_data_is_simd_aligned) {
// data_alignment() should be at least kInnerSimdAlign; cell_alignment()
// data_alignment() should be at least kArenaTensorSimdAlign; cell_alignment()
// should propagate that so the element pointer is SIMD-aligned.
BOOST_CHECK(Inner::data_alignment() >= TA::kInnerSimdAlign);
BOOST_CHECK_EQUAL(Inner::data_alignment() % TA::kInnerSimdAlign, 0u);
BOOST_CHECK(Inner::data_alignment() >= TA::kArenaTensorSimdAlign);
BOOST_CHECK_EQUAL(Inner::data_alignment() % TA::kArenaTensorSimdAlign, 0u);
BOOST_CHECK(Inner::cell_alignment() >= Inner::data_alignment());
CellBuf buf(8);
Inner x =
TA::detail::make_arena_tensor_in<double>(buf.aligned_ptr, TA::Range{8});
auto addr = reinterpret_cast<std::uintptr_t>(x.data());
BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u);
}

BOOST_AUTO_TEST_CASE(default_constructed_is_null) {
Expand Down
8 changes: 4 additions & 4 deletions tests/arena_tensor_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ BOOST_AUTO_TEST_CASE(builds_outer_with_uniform_inners) {
BOOST_CHECK(bool(inner));
BOOST_CHECK_EQUAL(inner.size(), 8u);
auto addr = reinterpret_cast<std::uintptr_t>(inner.data());
BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u);
}
}

Expand Down Expand Up @@ -106,7 +106,7 @@ BOOST_AUTO_TEST_CASE(jagged_inner_shapes_round_trip) {
BOOST_REQUIRE(bool(inner));
BOOST_CHECK_EQUAL(inner.size(), static_cast<std::size_t>(sizes[ord]));
auto addr = reinterpret_cast<std::uintptr_t>(inner.data());
BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u);
}
}
}
Expand Down Expand Up @@ -320,7 +320,7 @@ BOOST_AUTO_TEST_CASE(contraction_arena_plan_reserve_and_construct_inner) {
BOOST_REQUIRE(bool(inner));
BOOST_CHECK_EQUAL(inner.size(), 24u);
auto addr = reinterpret_cast<std::uintptr_t>(inner.data());
BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u);
}
}

Expand Down Expand Up @@ -441,7 +441,7 @@ BOOST_AUTO_TEST_CASE(outer_tile_serialize_round_trip_arena_tensor) {
// The loaded cell's data pointer is SIMD-aligned via
// arena_outer_init.
auto addr = reinterpret_cast<std::uintptr_t>(d.data());
BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
BOOST_CHECK_EQUAL(addr % TA::kArenaTensorSimdAlign, 0u);
}
}

Expand Down
Loading