Skip to content

Commit 39c0790

Browse files
committed
Add AArch64 NEON non-temporal load/store (ldnp/stnp)
Implement store_stream and load_stream for neon64 using inline asm with LDNP/STNP instructions, providing non-temporal cache hints on AArch64. Covers float, double, and integral types. Guarded behind __GNUC__ so MSVC ARM64 falls back to aligned load/store. Also remove xsimd::fence (std::atomic wrapper) and its test, which were unrelated additions from a prior commit.
1 parent c8ab083 commit 39c0790

3 files changed

Lines changed: 84 additions & 16 deletions

File tree

include/xsimd/arch/xsimd_neon64.hpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include <complex>
1616
#include <cstddef>
17+
#include <cstring>
1718
#include <tuple>
1819
#include <utility>
1920

@@ -178,6 +179,89 @@ namespace xsimd
178179
return store_aligned<A>(dst, src, A {});
179180
}
180181

182+
/****************
183+
* store_stream *
184+
****************/
185+
186+
#if defined(__GNUC__)
187+
template <class A>
188+
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& val, requires_arch<neon64>) noexcept
189+
{
190+
float32x2_t lo = vget_low_f32(val);
191+
float32x2_t hi = vget_high_f32(val);
192+
__asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]"
193+
:
194+
: [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
195+
: "memory");
196+
}
197+
198+
template <class A>
199+
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& val, requires_arch<neon64>) noexcept
200+
{
201+
float64x1_t lo = vget_low_f64(val);
202+
float64x1_t hi = vget_high_f64(val);
203+
__asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]"
204+
:
205+
: [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
206+
: "memory");
207+
}
208+
209+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
210+
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& val, requires_arch<neon64>) noexcept
211+
{
212+
uint64x2_t u64;
213+
std::memcpy(&u64, &val, sizeof(u64));
214+
uint64x1_t lo = vget_low_u64(u64);
215+
uint64x1_t hi = vget_high_u64(u64);
216+
__asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]"
217+
:
218+
: [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
219+
: "memory");
220+
}
221+
#endif
222+
223+
/***************
224+
* load_stream *
225+
***************/
226+
227+
#if defined(__GNUC__)
228+
template <class A>
229+
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<neon64>) noexcept
230+
{
231+
float32x2_t lo, hi;
232+
__asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]"
233+
: [lo] "=w"(lo), [hi] "=w"(hi)
234+
: [mem] "r"(mem)
235+
: "memory");
236+
return vcombine_f32(lo, hi);
237+
}
238+
239+
template <class A>
240+
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<neon64>) noexcept
241+
{
242+
float64x1_t lo, hi;
243+
__asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]"
244+
: [lo] "=w"(lo), [hi] "=w"(hi)
245+
: [mem] "r"(mem)
246+
: "memory");
247+
return vcombine_f64(lo, hi);
248+
}
249+
250+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
251+
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<neon64>) noexcept
252+
{
253+
uint64x1_t lo, hi;
254+
__asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]"
255+
: [lo] "=w"(lo), [hi] "=w"(hi)
256+
: [mem] "r"(mem)
257+
: "memory");
258+
uint64x2_t u64 = vcombine_u64(lo, hi);
259+
batch<T, A> result;
260+
std::memcpy(&result, &u64, sizeof(u64));
261+
return result;
262+
}
263+
#endif
264+
181265
/*********************
182266
* store<batch_bool> *
183267
*********************/

include/xsimd/types/xsimd_api.hpp

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
#ifndef XSIMD_API_HPP
1313
#define XSIMD_API_HPP
1414

15-
#include <atomic>
1615
#include <complex>
1716
#include <cstddef>
1817
#include <limits>
@@ -2568,16 +2567,6 @@ namespace xsimd
25682567
store_as<T, A>(mem, val, stream_mode {});
25692568
}
25702569

2571-
/**
2572-
* @ingroup batch_data_transfer
2573-
*
2574-
* Issues a sequentially consistent memory fence.
2575-
*/
2576-
XSIMD_INLINE void fence() noexcept
2577-
{
2578-
std::atomic_thread_fence(std::memory_order_seq_cst);
2579-
}
2580-
25812570
/**
25822571
* @ingroup batch_data_transfer
25832572
*

test/test_load_store.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -606,9 +606,4 @@ TEST_CASE_TEMPLATE("[load store]", B, BATCH_TYPES)
606606
SUBCASE("masked") { Test.test_masked(); }
607607
}
608608

609-
TEST_CASE("[fence] sequential consistency")
610-
{
611-
xsimd::fence();
612-
CHECK(true);
613-
}
614609
#endif

0 commit comments

Comments (0)