Skip to content

Commit 39c0790

Browse files
committed
Add AArch64 NEON non-temporal load/store (ldnp/stnp)
Implement store_stream and load_stream for neon64 using inline asm with LDNP/STNP instructions, providing non-temporal cache hints on AArch64. Covers float, double, and integral types. Guarded behind __GNUC__ so MSVC ARM64 falls back to aligned load/store. Also remove xsimd::fence (std::atomic wrapper) and its test, which were unrelated additions from a prior commit.
1 parent c8ab083 commit 39c0790

3 files changed

Lines changed: 84 additions & 16 deletions

File tree

include/xsimd/arch/xsimd_neon64.hpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include <complex>
1616
#include <cstddef>
17+
#include <cstring>
1718
#include <tuple>
1819
#include <utility>
1920

@@ -178,6 +179,89 @@ namespace xsimd
178179
return store_aligned<A>(dst, src, A {});
179180
}
180181

182+
/****************
183+
* store_stream *
184+
****************/
185+
186+
#if defined(__GNUC__)
187+
template <class A>
188+
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& val, requires_arch<neon64>) noexcept
189+
{
190+
float32x2_t lo = vget_low_f32(val);
191+
float32x2_t hi = vget_high_f32(val);
192+
__asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]"
193+
:
194+
: [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
195+
: "memory");
196+
}
197+
198+
template <class A>
199+
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& val, requires_arch<neon64>) noexcept
200+
{
201+
float64x1_t lo = vget_low_f64(val);
202+
float64x1_t hi = vget_high_f64(val);
203+
__asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]"
204+
:
205+
: [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
206+
: "memory");
207+
}
208+
209+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
210+
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& val, requires_arch<neon64>) noexcept
211+
{
212+
uint64x2_t u64;
213+
std::memcpy(&u64, &val, sizeof(u64));
214+
uint64x1_t lo = vget_low_u64(u64);
215+
uint64x1_t hi = vget_high_u64(u64);
216+
__asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]"
217+
:
218+
: [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
219+
: "memory");
220+
}
221+
#endif
222+
223+
/***************
224+
* load_stream *
225+
***************/
226+
227+
#if defined(__GNUC__)
228+
template <class A>
229+
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<neon64>) noexcept
230+
{
231+
float32x2_t lo, hi;
232+
__asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]"
233+
: [lo] "=w"(lo), [hi] "=w"(hi)
234+
: [mem] "r"(mem)
235+
: "memory");
236+
return vcombine_f32(lo, hi);
237+
}
238+
239+
template <class A>
240+
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<neon64>) noexcept
241+
{
242+
float64x1_t lo, hi;
243+
__asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]"
244+
: [lo] "=w"(lo), [hi] "=w"(hi)
245+
: [mem] "r"(mem)
246+
: "memory");
247+
return vcombine_f64(lo, hi);
248+
}
249+
250+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
251+
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<neon64>) noexcept
252+
{
253+
uint64x1_t lo, hi;
254+
__asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]"
255+
: [lo] "=w"(lo), [hi] "=w"(hi)
256+
: [mem] "r"(mem)
257+
: "memory");
258+
uint64x2_t u64 = vcombine_u64(lo, hi);
259+
batch<T, A> result;
260+
std::memcpy(&result, &u64, sizeof(u64));
261+
return result;
262+
}
263+
#endif
264+
181265
/*********************
182266
* store<batch_bool> *
183267
*********************/

include/xsimd/types/xsimd_api.hpp

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
#ifndef XSIMD_API_HPP
1313
#define XSIMD_API_HPP
1414

15-
#include <atomic>
1615
#include <complex>
1716
#include <cstddef>
1817
#include <limits>
@@ -2568,16 +2567,6 @@ namespace xsimd
25682567
store_as<T, A>(mem, val, stream_mode {});
25692568
}
25702569

2571-
/**
2572-
* @ingroup batch_data_transfer
2573-
*
2574-
* Issues a sequentially consistent memory fence.
2575-
*/
2576-
XSIMD_INLINE void fence() noexcept
2577-
{
2578-
std::atomic_thread_fence(std::memory_order_seq_cst);
2579-
}
2580-
25812570
/**
25822571
* @ingroup batch_data_transfer
25832572
*

test/test_load_store.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -606,9 +606,4 @@ TEST_CASE_TEMPLATE("[load store]", B, BATCH_TYPES)
606606
SUBCASE("masked") { Test.test_masked(); }
607607
}
608608

609-
TEST_CASE("[fence] sequential consistency")
610-
{
611-
xsimd::fence();
612-
CHECK(true);
613-
}
614609
#endif

0 commit comments

Comments (0)