From 7a674b98d6060b17b1a7f1f53583fa7dbb422f82 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 19 Mar 2026 12:55:03 -0400 Subject: [PATCH] fix: harden reassociation barriers for fast-math nearbyint --- .../arch/common/xsimd_common_details.hpp | 22 ++++++++++++ .../xsimd/arch/common/xsimd_common_math.hpp | 10 +----- include/xsimd/arch/xsimd_avx.hpp | 5 +++ include/xsimd/arch/xsimd_avx2.hpp | 12 +++---- include/xsimd/arch/xsimd_avx512f.hpp | 5 +++ include/xsimd/arch/xsimd_common_fwd.hpp | 35 +++++++++++++++++++ include/xsimd/arch/xsimd_neon.hpp | 15 ++++++++ include/xsimd/arch/xsimd_rvv.hpp | 15 ++++++++ include/xsimd/arch/xsimd_sse2.hpp | 15 ++++++++ include/xsimd/arch/xsimd_sse4_1.hpp | 12 +++---- include/xsimd/arch/xsimd_sve.hpp | 5 +++ include/xsimd/arch/xsimd_vsx.hpp | 18 ++++++++++ 12 files changed, 144 insertions(+), 25 deletions(-) diff --git a/include/xsimd/arch/common/xsimd_common_details.hpp b/include/xsimd/arch/common/xsimd_common_details.hpp index efe01806b..ddf76dfae 100644 --- a/include/xsimd/arch/common/xsimd_common_details.hpp +++ b/include/xsimd/arch/common/xsimd_common_details.hpp @@ -111,6 +111,28 @@ namespace xsimd namespace detail { + template + XSIMD_INLINE memory_barrier_tag barrier_tag(A const&) noexcept + { + return {}; + } + + template + XSIMD_INLINE void reassociation_barrier(T& x, memory_barrier_tag) noexcept + { +#if defined(__GNUC__) + __asm__ volatile("" : : "r"(&x) : "memory"); +#else + (void)x; +#endif + } + + template + XSIMD_INLINE void reassociation_barrier(T& x, A const& arch) noexcept + { + detail::reassociation_barrier(x, detail::barrier_tag(arch)); + } + template XSIMD_INLINE batch apply(F&& func, batch const& self, batch const& other) noexcept { diff --git a/include/xsimd/arch/common/xsimd_common_math.hpp b/include/xsimd/arch/common/xsimd_common_math.hpp index f84883405..6fc06c1ea 100644 --- a/include/xsimd/arch/common/xsimd_common_math.hpp +++ b/include/xsimd/arch/common/xsimd_common_math.hpp @@ -1900,17 +1900,9 @@ namespace xsimd batch_type s = bitofsign(self); batch_type v = self ^ s; batch_type t2n = constants::twotonmb(); - // Under fast-math, reordering is possible and the compiler optimizes d - // to v. That's not what we want, so prevent compiler optimization here. - // FIXME: it may be better to emit a memory barrier here (?). -#ifdef __FAST_MATH__ batch_type d0 = v + t2n; - asm volatile("" ::"r"(&d0) : "memory"); + detail::reassociation_barrier(d0.data, A {}); batch_type d = d0 - t2n; -#else - batch_type d0 = v + t2n; - batch_type d = d0 - t2n; -#endif return s ^ select(v < t2n, d, v); } } diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 4af728e07..29284ff03 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -37,6 +37,11 @@ namespace xsimd namespace detail { + XSIMD_INLINE x86_barrier_tag barrier_tag(avx const&) noexcept + { + return {}; + } + XSIMD_INLINE __m128i lower_half(__m256i self) noexcept { return _mm256_castsi256_si128(self); diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 1eecabf7f..46cc181eb 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -15,6 +15,7 @@ #include #include +#include "./xsimd_common_fwd.hpp" #include "../types/xsimd_avx2_register.hpp" #include "../types/xsimd_batch_constant.hpp" #include "./utils/shifts.hpp" @@ -554,11 +555,8 @@ namespace xsimd __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 // With -ffast-math, the compiler may reassociate (xH-C)+xL into // xH+(xL-C). Since xL< class batch_bool; + + namespace kernel + { + namespace detail + { + struct memory_barrier_tag + { + }; + + struct x86_barrier_tag + { + }; + + struct arm_barrier_tag + { + }; + + struct vsx_barrier_tag + { + }; + + struct rvv_barrier_tag + { + }; + } + } template struct batch_constant; template @@ -101,6 +127,15 @@ namespace xsimd // Forward declarations for pack-level helpers namespace detail { + template + XSIMD_INLINE memory_barrier_tag barrier_tag(A const&) noexcept; + + template + XSIMD_INLINE void reassociation_barrier(T& x, memory_barrier_tag) noexcept; + + template + XSIMD_INLINE void reassociation_barrier(T& x, A const&) noexcept; + template XSIMD_INLINE constexpr bool is_identity() noexcept; template diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index 4af19a650..cbeb433ee 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -159,6 +159,21 @@ namespace xsimd namespace detail { + XSIMD_INLINE arm_barrier_tag barrier_tag(neon const&) noexcept + { + return {}; + } + + template + XSIMD_INLINE void reassociation_barrier(T& x, arm_barrier_tag) noexcept + { +#if defined(__GNUC__) + __asm__ volatile("" : "+w"(x)); +#else + detail::reassociation_barrier(x, memory_barrier_tag {}); +#endif + } + template