Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions include/xsimd/arch/common/xsimd_common_details.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,28 @@ namespace xsimd

namespace detail
{
// Default architecture -> barrier-tag mapping: any architecture without a
// dedicated barrier_tag overload falls back to the generic memory barrier.
template <class A>
XSIMD_INLINE memory_barrier_tag barrier_tag(A const&) noexcept
{
    memory_barrier_tag tag {};
    return tag;
}

// Generic fallback barrier: prevents the compiler from reassociating
// floating-point arithmetic around `x` (relevant under -ffast-math), by
// making `x` appear to escape through memory.
template <class T>
XSIMD_INLINE void reassociation_barrier(T& x, memory_barrier_tag) noexcept
{
#if defined(__GNUC__)
// Empty asm: taking &x as an "r" input makes x addressable, and the
// "memory" clobber tells the optimizer any memory may have changed, so
// the value must be materialized before subsequent uses. Emits no
// instructions, though it may force x out of a register.
__asm__ volatile("" : : "r"(&x) : "memory");
#else
// NOTE(review): no barrier available on non-GNU compilers — this is a
// no-op; verify fast-math safety there separately.
(void)x;
#endif
}

// Convenience entry point: map the architecture to its barrier tag, then
// dispatch to the matching tag-specific barrier implementation.
template <class T, class A>
XSIMD_INLINE void reassociation_barrier(T& x, A const& arch) noexcept
{
    auto const tag = detail::barrier_tag(arch);
    detail::reassociation_barrier(x, tag);
}

template <class F, class A, class T, class... Batches>
XSIMD_INLINE batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
{
Expand Down
10 changes: 1 addition & 9 deletions include/xsimd/arch/common/xsimd_common_math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1900,17 +1900,9 @@ namespace xsimd
batch_type s = bitofsign(self);
batch_type v = self ^ s;
batch_type t2n = constants::twotonmb<batch_type>();
// Under fast-math, reordering is possible and the compiler optimizes d
// to v. That's not what we want, so prevent compiler optimization here.
// FIXME: it may be better to emit a memory barrier here (?).
#ifdef __FAST_MATH__
batch_type d0 = v + t2n;
asm volatile("" ::"r"(&d0) : "memory");
detail::reassociation_barrier(d0.data, A {});
batch_type d = d0 - t2n;
#else
batch_type d0 = v + t2n;
batch_type d = d0 - t2n;
#endif
return s ^ select(v < t2n, d, v);
}
}
Expand Down
5 changes: 5 additions & 0 deletions include/xsimd/arch/xsimd_avx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ namespace xsimd

namespace detail
{
// AVX uses the shared x86 register-constraint barrier.
XSIMD_INLINE x86_barrier_tag barrier_tag(avx const&) noexcept
{
    return x86_barrier_tag {};
}

XSIMD_INLINE __m128i lower_half(__m256i self) noexcept
{
return _mm256_castsi256_si128(self);
Expand Down
12 changes: 4 additions & 8 deletions include/xsimd/arch/xsimd_avx2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <complex>
#include <type_traits>

#include "./xsimd_common_fwd.hpp"
#include "../types/xsimd_avx2_register.hpp"
#include "../types/xsimd_batch_constant.hpp"
#include "./utils/shifts.hpp"
Expand Down Expand Up @@ -554,11 +555,8 @@ namespace xsimd
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
// With -ffast-math, the compiler may reassociate (xH-C)+xL into
// xH+(xL-C). Since xL<<C this causes catastrophic cancellation.
// The asm barrier forces f into a register before the add, blocking
// the reorder. It emits zero instructions.
#if defined(__GNUC__)
__asm__ volatile("" : "+x"(f));
#endif
// Barrier the intermediate before the final add.
detail::reassociation_barrier(f, avx2 {});
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
}

Expand All @@ -575,9 +573,7 @@ namespace xsimd
__m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
// See above: prevent -ffast-math from reassociating (xH-C)+xL.
#if defined(__GNUC__)
__asm__ volatile("" : "+x"(f));
#endif
detail::reassociation_barrier(f, avx2 {});
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
}
}
Expand Down
5 changes: 5 additions & 0 deletions include/xsimd/arch/xsimd_avx512f.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ namespace xsimd

namespace detail
{
// AVX-512 uses the shared x86 register-constraint barrier.
XSIMD_INLINE x86_barrier_tag barrier_tag(avx512f const&) noexcept
{
    return x86_barrier_tag {};
}

XSIMD_INLINE __m256 lower_half(__m512 self) noexcept
{
return _mm512_castps512_ps256(self);
Expand Down
35 changes: 35 additions & 0 deletions include/xsimd/arch/xsimd_common_fwd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,32 @@ namespace xsimd
class batch;
template <class T, class A>
class batch_bool;

namespace kernel
{
namespace detail
{
// Dispatch tags selecting a reassociation_barrier implementation per
// architecture family. barrier_tag(arch) maps an architecture to one of
// these; an overload of reassociation_barrier taking the tag supplies the
// matching inline-asm register constraint.

// Generic fallback: barrier through memory (works on any target).
struct memory_barrier_tag
{
};

// x86 SSE/AVX register barrier.
struct x86_barrier_tag
{
};

// ARM NEON/SVE register barrier.
struct arm_barrier_tag
{
};

// POWER VSX register barrier.
struct vsx_barrier_tag
{
};

// RISC-V vector-extension register barrier.
struct rvv_barrier_tag
{
};
}
}
template <class T, class A, T... Vs>
struct batch_constant;
template <class T, class A, bool... Vs>
Expand Down Expand Up @@ -101,6 +127,15 @@ namespace xsimd
// Forward declarations for pack-level helpers
namespace detail
{
// Maps an architecture to its reassociation-barrier dispatch tag; defaults
// to memory_barrier_tag unless an arch-specific overload is declared.
template <class A>
XSIMD_INLINE memory_barrier_tag barrier_tag(A const&) noexcept;

// Generic barrier: keeps the compiler from reassociating floating-point
// arithmetic around `x` (relevant under -ffast-math).
template <class T>
XSIMD_INLINE void reassociation_barrier(T& x, memory_barrier_tag) noexcept;

// Convenience overload: dispatches on barrier_tag(arch).
template <class T, class A>
XSIMD_INLINE void reassociation_barrier(T& x, A const&) noexcept;

template <typename T, T... Vs>
XSIMD_INLINE constexpr bool is_identity() noexcept;
template <typename T, class A, T... Vs>
Expand Down
15 changes: 15 additions & 0 deletions include/xsimd/arch/xsimd_neon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,21 @@ namespace xsimd

namespace detail
{
// NEON uses the ARM SIMD-register barrier.
XSIMD_INLINE arm_barrier_tag barrier_tag(neon const&) noexcept
{
    return arm_barrier_tag {};
}

// ARM barrier: the "+w" constraint pins `x` to a SIMD/FP register, blocking
// value reassociation without spilling to memory (emits no instructions).
template <class T>
XSIMD_INLINE void reassociation_barrier(T& x, arm_barrier_tag) noexcept
{
#if defined(__GNUC__)
__asm__ volatile("" : "+w"(x));
#else
// Non-GNU compilers: fall back to the generic memory barrier.
detail::reassociation_barrier(x, memory_barrier_tag {});
#endif
}

template <template <class> class return_type, class... T>
struct neon_dispatcher_base
{
Expand Down
15 changes: 15 additions & 0 deletions include/xsimd/arch/xsimd_rvv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,21 @@ namespace xsimd
{
namespace detail
{
// RVV uses the RISC-V vector-register barrier.
XSIMD_INLINE rvv_barrier_tag barrier_tag(rvv const&) noexcept
{
    return rvv_barrier_tag {};
}

// RVV barrier: "+vr" pins `x` to a RISC-V vector register, preventing
// -ffast-math reassociation at zero instruction cost.
template <class T>
XSIMD_INLINE void reassociation_barrier(T& x, rvv_barrier_tag) noexcept
{
#if defined(__GNUC__)
__asm__ volatile("" : "+vr"(x));
#else
// Non-GNU compilers: fall back to the generic memory barrier.
detail::reassociation_barrier(x, memory_barrier_tag {});
#endif
}

template <class T>
using rvv_fix_char_t = types::detail::rvv_fix_char_t<T>;
template <class T, size_t Width = XSIMD_RVV_BITS>
Expand Down
15 changes: 15 additions & 0 deletions include/xsimd/arch/xsimd_sse2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,21 @@ namespace xsimd

namespace detail
{
// SSE2 uses the x86 register-constraint barrier.
XSIMD_INLINE x86_barrier_tag barrier_tag(sse2 const&) noexcept
{
    x86_barrier_tag tag {};
    return tag;
}

// x86 barrier: "+x" pins `x` to an SSE register, blocking reassociation
// without any spill. Also guarded by XSIMD_TARGET_X86 because the "x"
// constraint only exists when compiling for x86 targets.
template <class T>
XSIMD_INLINE void reassociation_barrier(T& x, x86_barrier_tag) noexcept
{
#if defined(__GNUC__) && XSIMD_TARGET_X86
__asm__ volatile("" : "+x"(x));
#else
// Non-GNU or non-x86 builds: fall back to the generic memory barrier.
detail::reassociation_barrier(x, memory_barrier_tag {});
#endif
}

constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z)
{
return (z << 6) | (y << 4) | (x << 2) | w;
Expand Down
12 changes: 4 additions & 8 deletions include/xsimd/arch/xsimd_sse4_1.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include <type_traits>

#include "./xsimd_common_fwd.hpp"
#include "../types/xsimd_sse4_1_register.hpp"
#include "./common/xsimd_common_cast.hpp"

Expand Down Expand Up @@ -64,11 +65,8 @@ namespace xsimd
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
// With -ffast-math, the compiler may reassociate (xH-C)+xL into
// xH+(xL-C). Since xL<<C this causes catastrophic cancellation.
// The asm barrier forces f into a register before the add, blocking
// the reorder. It emits zero instructions.
#if defined(__GNUC__)
__asm__ volatile("" : "+x"(f));
#endif
// Barrier the intermediate before the final add.
detail::reassociation_barrier(f, sse4_1 {});
return _mm_add_pd(f, _mm_castsi128_pd(xL));
}

Expand All @@ -81,9 +79,7 @@ namespace xsimd
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
// See above: prevent -ffast-math from reassociating (xH-C)+xL.
#if defined(__GNUC__)
__asm__ volatile("" : "+x"(f));
#endif
detail::reassociation_barrier(f, sse4_1 {});
return _mm_add_pd(f, _mm_castsi128_pd(xL));
}
}
Expand Down
5 changes: 5 additions & 0 deletions include/xsimd/arch/xsimd_sve.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ namespace xsimd
{
namespace detail
{
// SVE shares the ARM register barrier.
// NOTE(review): the arm_barrier_tag overload of reassociation_barrier lives
// in xsimd_neon.hpp — presumably included before this header; verify.
XSIMD_INLINE arm_barrier_tag barrier_tag(sve const&) noexcept
{
    return arm_barrier_tag {};
}

using xsimd::index;
using xsimd::types::detail::sve_vector_type;

Expand Down
18 changes: 18 additions & 0 deletions include/xsimd/arch/xsimd_vsx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,24 @@ namespace xsimd

namespace kernel
{
namespace detail
{
// VSX participates in the tag-dispatched barrier protocol declared in
// xsimd_common_fwd.hpp.
XSIMD_INLINE vsx_barrier_tag barrier_tag(vsx const&) noexcept
{
return {};
}

// VSX barrier: "+wa" pins `x` to a VSX register, preventing -ffast-math
// reassociation while emitting no instructions.
template <class T>
XSIMD_INLINE void reassociation_barrier(T& x, vsx_barrier_tag) noexcept
{
#if defined(__GNUC__)
__asm__ volatile("" : "+wa"(x));
#else
// Non-GNU compilers: fall back to the generic memory barrier.
detail::reassociation_barrier(x, memory_barrier_tag {});
#endif
}
}

template <class A, class T>
XSIMD_INLINE batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<common>) noexcept;
template <class A, class T>
Expand Down
Loading