From 82dd5266b9bb2e5ccca60ad3672a127db79329f9 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Tue, 19 May 2026 15:40:55 -0700
Subject: [PATCH 1/4] `cuda::std::simd` Optimize small integer operations

---
 libcudacxx/include/cuda/std/__fwd/simd.h      |   1 +
 .../include/cuda/std/__internal/features.h    |   7 +-
 .../include/cuda/std/__simd/basic_vec.h       |   1 +
 .../specializations/fixed_size_integral_vec.h | 223 ++++++++++++++++++
 .../specializations/fixed_size_storage.h      |  10 +-
 .../__simd/specializations/simd_intrinsics.h  |  93 ++++++++
 .../specializations/simd_intrinsics_array.h   | 133 +++++++++++
 .../include/cuda/std/__simd/type_traits.h     |  13 +-
 .../simd/simd.traits/alignment.pass.cpp       |  18 +-
 libcudacxx/test/simd_codegen/CMakeLists.txt   |  87 ++++---
 .../{ => floating_point}/decrement_f32x2.cu   |  16 +-
 .../simd_codegen/floating_point/fma_bf16.cu   |  38 +++
 .../simd_codegen/floating_point/fma_f16.cu    |  38 +++
 .../{ => floating_point}/increment_f32x2.cu   |  16 +-
 .../{ => floating_point}/less_bf16.cu         |  26 +-
 .../{ => floating_point}/less_f16.cu          |  26 +-
 .../floating_point/minus_f32x2.cu             |  28 +++
 .../floating_point/multiplies_bf16.cu         |  38 +++
 .../{ => floating_point}/multiplies_f16.cu    |  25 +-
 .../simd_codegen/floating_point/plus_bf16.cu  |  38 +++
 .../{ => floating_point}/plus_f16.cu          |  19 +-
 .../simd_codegen/floating_point/plus_f32x2.cu |  28 +++
 .../{ => floating_point}/unary_minus_f32x2.cu |  17 +-
 libcudacxx/test/simd_codegen/fma_bf16.cu      |  54 -----
 libcudacxx/test/simd_codegen/fma_f16.cu       |  53 -----
 .../simd_codegen/integer/arithmetic_u16x2.cu  |  66 ++++++
 .../simd_codegen/integer/arithmetic_u8x4.cu   |  61 +++++
 .../integer/bitwise_u16x2_u8x4.cu             |  95 ++++++++
 libcudacxx/test/simd_codegen/minus_f32x2.cu   |  39 ---
 .../test/simd_codegen/multiplies_bf16.cu      |  52 ----
 libcudacxx/test/simd_codegen/plus_bf16.cu     |  52 ----
 libcudacxx/test/simd_codegen/plus_f32x2.cu    |  39 ---
 32 files changed, 1010 insertions(+), 440 deletions(-)
 create mode 100644 libcudacxx/include/cuda/std/__simd/specializations/fixed_size_integral_vec.h
 create mode 100644 libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h
 create mode 100644 libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics_array.h
 rename libcudacxx/test/simd_codegen/{ => floating_point}/decrement_f32x2.cu (59%)
 create mode 100644 libcudacxx/test/simd_codegen/floating_point/fma_bf16.cu
 create mode 100644 libcudacxx/test/simd_codegen/floating_point/fma_f16.cu
 rename libcudacxx/test/simd_codegen/{ => floating_point}/increment_f32x2.cu (59%)
 rename libcudacxx/test/simd_codegen/{ => floating_point}/less_bf16.cu (51%)
 rename libcudacxx/test/simd_codegen/{ => floating_point}/less_f16.cu (51%)
 create mode 100644 libcudacxx/test/simd_codegen/floating_point/minus_f32x2.cu
 create mode 100644 libcudacxx/test/simd_codegen/floating_point/multiplies_bf16.cu
 rename libcudacxx/test/simd_codegen/{ => floating_point}/multiplies_f16.cu (50%)
 create mode 100644 libcudacxx/test/simd_codegen/floating_point/plus_bf16.cu
 rename libcudacxx/test/simd_codegen/{ => floating_point}/plus_f16.cu (53%)
 create mode 100644 libcudacxx/test/simd_codegen/floating_point/plus_f32x2.cu
 rename libcudacxx/test/simd_codegen/{ => floating_point}/unary_minus_f32x2.cu (56%)
 delete mode 100644 libcudacxx/test/simd_codegen/fma_bf16.cu
 delete mode 100644 libcudacxx/test/simd_codegen/fma_f16.cu
 create mode 100644 libcudacxx/test/simd_codegen/integer/arithmetic_u16x2.cu
 create mode 100644 libcudacxx/test/simd_codegen/integer/arithmetic_u8x4.cu
 create mode 100644 libcudacxx/test/simd_codegen/integer/bitwise_u16x2_u8x4.cu
 delete mode 100644 libcudacxx/test/simd_codegen/minus_f32x2.cu
 delete mode 100644 libcudacxx/test/simd_codegen/multiplies_bf16.cu
 delete mode 100644 libcudacxx/test/simd_codegen/plus_bf16.cu
 delete mode 100644 libcudacxx/test/simd_codegen/plus_f32x2.cu

diff --git a/libcudacxx/include/cuda/std/__fwd/simd.h b/libcudacxx/include/cuda/std/__fwd/simd.h
index 1ce9102d9c0..02f26d4d335 100644
--- a/libcudacxx/include/cuda/std/__fwd/simd.h
+++ b/libcudacxx/include/cuda/std/__fwd/simd.h
@@ -50,6 +50,7 @@ enum class __simd_operations_kind
 {
   __default,
   __fixed_size_float,
+  __fixed_size_integral,
 };
 
 template <typename _Tp, typename _Abi>
diff --git a/libcudacxx/include/cuda/std/__internal/features.h b/libcudacxx/include/cuda/std/__internal/features.h
index 79ad0ec42b1..3b8d43c521f 100644
--- a/libcudacxx/include/cuda/std/__internal/features.h
+++ b/libcudacxx/include/cuda/std/__internal/features.h
@@ -103,8 +103,13 @@
 
 #define _CCCL_HAS_SIMD_F32X2_INTRINSICS() (_CCCL_CUDACC_AT_LEAST(12, 8) && _CCCL_HAS_CTK() && !_CCCL_COMPILER(CLANG))
 #define _CCCL_HAS_SIMD_F32X2_PTX()        (__cccl_ptx_isa >= 860ULL)
+#define _CCCL_HAS_SIMD_F32X2() \
+  (_CCCL_HAS_SIMD_F32X2_INTRINSICS() || _CCCL_HAS_SIMD_F32X2_PTX()) && !_CCCL_TILE_COMPILATION()
 
-#define _CCCL_HAS_SIMD_F32X2() (_CCCL_HAS_SIMD_F32X2_INTRINSICS() || _CCCL_HAS_SIMD_F32X2_PTX())
+#define _CCCL_HAS_SIMD_8BIT_INTRINSICS() 0 // TODO(fbusato): CTK 13.2 produces non-optimal code for 8-bit SIMD instrs.
+#define _CCCL_HAS_SIMD_8BIT_PTX()        (__cccl_ptx_isa >= 920ULL)
+#define _CCCL_HAS_SIMD_8BIT() \
+  ((_CCCL_HAS_SIMD_8BIT_PTX() || _CCCL_HAS_SIMD_8BIT_INTRINSICS()) && !_CCCL_TILE_COMPILATION())
 
 // Third party libraries
 
diff --git a/libcudacxx/include/cuda/std/__simd/basic_vec.h b/libcudacxx/include/cuda/std/__simd/basic_vec.h
index f16dfc9fdc2..37824dbc48c 100644
--- a/libcudacxx/include/cuda/std/__simd/basic_vec.h
+++ b/libcudacxx/include/cuda/std/__simd/basic_vec.h
@@ -32,6 +32,7 @@
 #include <cuda/std/__simd/flag.h>
 #include <cuda/std/__simd/iterator.h>
 #include <cuda/std/__simd/specializations/fixed_size_float_vec.h>
+#include <cuda/std/__simd/specializations/fixed_size_integral_vec.h>
 #include <cuda/std/__simd/specializations/fixed_size_vec.h>
 #include <cuda/std/__simd/utility.h>
 #include <cuda/std/__type_traits/enable_if.h>
diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_integral_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_integral_vec.h
new file mode 100644
index 00000000000..abdb1c9dd05
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_integral_vec.h
@@ -0,0 +1,223 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_INTEGRAL_VEC_H
+#define _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_INTEGRAL_VEC_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+// automatic vectorization for small integers is not supported (until CUDA 13.2)
+// TODO(fbusato): remove this path once the feature is supported
+// TODO(fbusato): extend to other GPU archs in the future
+
+#include <cuda/__cmath/ceil_div.h>
+#include <cuda/std/__simd/specializations/fixed_size_vec.h>
+#include <cuda/std/__simd/specializations/simd_intrinsics_array.h>
+#include <cuda/std/__type_traits/is_integral.h>
+#include <cuda/std/array>
+#include <cuda/std/cstdint>
+
+#include <nv/target>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+template <typename _Tp, __simd_size_type _Np>
+inline constexpr bool __is_fixed_size_small_integral_v =
+  is_integral_v<_Tp> && sizeof(_Tp) < sizeof(uint32_t) && _Np >= 2;
+
+inline constexpr auto __simd_operations_small_integral = __simd_operations_kind::__fixed_size_integral;
+
+template <typename _Tp, __simd_size_type _Np>
+inline constexpr __simd_operations_kind __simd_operations_kind_v<_Tp, __fixed_size<_Np>> =
+  __is_fixed_size_small_integral_v<_Tp, _Np> ? __simd_operations_small_integral : __simd_operations_kind::__default;
+
+#define _CCCL_SIMD_FIXED_SIZE_INTEGRAL_BINARY_BITWISE(_NAME, _OP)                           \
+  [[nodiscard]] _CCCL_API static constexpr __simd_storage_t _NAME(                          \
+    const __simd_storage_t& __lhs, const __simd_storage_t& __rhs) noexcept                  \
+  {                                                                                         \
+    _CCCL_IF_NOT_CONSTEVAL_DEFAULT                                                          \
+    {                                                                                       \
+      __unsigned_storage_t __result_u{};                                                    \
+      const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs);                 \
+      const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs);                 \
+      _CCCL_PRAGMA_UNROLL_FULL()                                                            \
+      for (__simd_size_type __i = 0; __i < __usize; ++__i)                                  \
+      {                                                                                     \
+        __result_u[__i] = __lhs_u[__i] _OP __rhs_u[__i];                                    \
+      }                                                                                     \
+      return ::cuda::std::simd::__copy_from_unsigned_storage<__simd_storage_t>(__result_u); \
+    }                                                                                       \
+    return __base::_NAME(__lhs, __rhs);                                                     \
+  }
+
+// Simd operations for fixed_size ABI with small integral element types.
+template <typename _Tp, __simd_size_type _Np>
+struct __simd_operations<_Tp, __fixed_size<_Np>, __simd_operations_small_integral> : __fixed_size_operations<_Tp, _Np>
+{
+  using __base           = __fixed_size_operations<_Tp, _Np>;
+  using __simd_storage_t = __simd_storage<_Tp, __fixed_size<_Np>>;
+
+  // all computation is done on uint32_t, so the alignment must be at least the alignment of uint32_t
+  static_assert(alignof(__simd_storage_t) >= alignof(uint32_t));
+
+  static constexpr __simd_size_type __ratio = sizeof(uint32_t) / sizeof(_Tp);
+  static constexpr __simd_size_type __usize = ::cuda::ceil_div(_Np, __ratio);
+  using __unsigned_storage_t                = array<uint32_t, __usize>;
+
+  [[nodiscard]] _CCCL_API static constexpr __simd_storage_t __bitwise_not(const __simd_storage_t& __s) noexcept
+  {
+    _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+    {
+      auto __udata = ::cuda::std::simd::__to_unsigned_storage(__s);
+      _CCCL_PRAGMA_UNROLL_FULL()
+      for (__simd_size_type __i = 0; __i < __usize; ++__i)
+      {
+        __udata[__i] = ~__udata[__i];
+      }
+      return ::cuda::std::simd::__copy_from_unsigned_storage<__simd_storage_t>(__udata);
+    }
+    return __fixed_size_operations<_Tp, _Np>::__bitwise_not(__s);
+  }
+
+  _CCCL_SIMD_FIXED_SIZE_INTEGRAL_BINARY_BITWISE(__bitwise_and, &)
+  _CCCL_SIMD_FIXED_SIZE_INTEGRAL_BINARY_BITWISE(__bitwise_or, |)
+  _CCCL_SIMD_FIXED_SIZE_INTEGRAL_BINARY_BITWISE(__bitwise_xor, ^)
+
+#if _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION()
+  // Unary arithmetic operations
+
+  // x++ = x + 1
+  _CCCL_API static constexpr void __increment(__simd_storage_t& __s) noexcept
+  {
+    _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+    {
+      [[maybe_unused]] constexpr __simd_storage_t __one = __base::__broadcast(1);
+      if constexpr (sizeof(_Tp) == 2)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90, (__s = __plus(__s, __one); return;))
+      }
+#  if _CCCL_HAS_SIMD_8BIT()
+      else if constexpr (sizeof(_Tp) == 1)
+      {
+        NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (__s = __plus(__s, __one); return;))
+      }
+#  endif // _CCCL_HAS_SIMD_8BIT()
+    }
+    __base::__increment(__s);
+  }
+
+  // x-- = x - 1
+  _CCCL_API static constexpr void __decrement(__simd_storage_t& __s) noexcept
+  {
+    _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+    {
+      [[maybe_unused]] constexpr __simd_storage_t __minus_one = __base::__broadcast(static_cast<_Tp>(-1));
+      if constexpr (sizeof(_Tp) == 2)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90, (__s = __plus(__s, __minus_one); return;))
+      }
+#  if _CCCL_HAS_SIMD_8BIT()
+      else if constexpr (sizeof(_Tp) == 1)
+      {
+        NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (__s = __plus(__s, __minus_one); return;))
+      }
+#  endif // _CCCL_HAS_SIMD_8BIT()
+    }
+    __base::__decrement(__s);
+  }
+
+  // -x = ~x + 1
+  [[nodiscard]]
+  _CCCL_API static constexpr __simd_storage_t __unary_minus(const __simd_storage_t& __s) noexcept
+  {
+    _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+    {
+      [[maybe_unused]] constexpr __simd_storage_t __one = __base::__broadcast(1);
+      if constexpr (sizeof(_Tp) == 2)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90, (return __plus(__bitwise_not(__s), __one);))
+      }
+#  if _CCCL_HAS_SIMD_8BIT()
+      else if constexpr (sizeof(_Tp) == 1)
+      {
+        NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return __plus(__bitwise_not(__s), __one);))
+      }
+#  endif // _CCCL_HAS_SIMD_8BIT()
+    }
+    return __base::__unary_minus(__s);
+  }
+
+  // Binary arithmetic operations
+
+  [[nodiscard]]
+  _CCCL_API static constexpr __simd_storage_t
+  __plus(const __simd_storage_t& __lhs, const __simd_storage_t& __rhs) noexcept
+  {
+    _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+    {
+      [[maybe_unused]] const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs);
+      [[maybe_unused]] const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs);
+      if constexpr (sizeof(_Tp) == 2)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90,
+                     (return ::cuda::std::simd::__copy_from_unsigned_storage<__simd_storage_t>(
+                               ::cuda::std::simd::__vadd_16bit_x2<_Tp>(__lhs_u, __rhs_u));))
+      }
+#  if _CCCL_HAS_SIMD_8BIT()
+      else if constexpr (sizeof(_Tp) == 1)
+      {
+        NV_IF_TARGET(NV_HAS_FEATURE_SM_120f,
+                     (return ::cuda::std::simd::__copy_from_unsigned_storage<__simd_storage_t>(
+                               ::cuda::std::simd::__vadd_8bit_x4<_Tp>(__lhs_u, __rhs_u));))
+      }
+#  endif // _CCCL_HAS_SIMD_8BIT()
+    }
+    return __fixed_size_operations<_Tp, _Np>::__plus(__lhs, __rhs);
+  }
+
+  [[nodiscard]]
+  _CCCL_API static constexpr __simd_storage_t
+  __minus(const __simd_storage_t& __lhs, const __simd_storage_t& __rhs) noexcept
+  {
+    _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+    {
+      if constexpr (sizeof(_Tp) == 2)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90, (return __plus(__lhs, __unary_minus(__rhs));))
+      }
+#  if _CCCL_HAS_SIMD_8BIT()
+      else if constexpr (sizeof(_Tp) == 1)
+      {
+        NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return __plus(__lhs, __unary_minus(__rhs));))
+      }
+#  endif // _CCCL_HAS_SIMD_8BIT()
+    }
+    return __base::__minus(__lhs, __rhs);
+  }
+#endif // _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION()
+};
+
+#undef _CCCL_SIMD_FIXED_SIZE_INTEGRAL_BINARY_BITWISE
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_INTEGRAL_VEC_H
diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h
index 01723e33ddf..fcbc63d933d 100644
--- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h
+++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h
@@ -22,6 +22,7 @@
 #endif // no system header
 
 #include <cuda/__utility/in_range.h>
+#include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__fwd/simd.h>
 
 #include <cuda/std/__cccl/prologue.h>
@@ -36,9 +37,16 @@ struct __fixed_size
   static constexpr __simd_size_type __simd_size = _Np;
 };
 
+// SIMD storage never directly interacts with memory. Users must use load/store API for that purpose.
+// However, SIMD storage could spill from register to cache/memory. This could break the alignment of the data for
+// vectorized instructions. For this reason, we align the SIMD storage to at least 8 bytes (max SIMD instruction size).
+// 8 bytes is a negligible constraint in case of spilling.
+template <typename _Tp, __simd_size_type _Np, size_t _TotalAlignment = alignof(_Tp) * _Np>
+inline constexpr size_t __simd_storage_alignment_v = ::cuda::std::max(alignof(_Tp), size_t{8});
+
 // Element-per-slot simd storage for fixed_size ABI
 template <typename _Tp, __simd_size_type _Np>
-struct __simd_storage<_Tp, __fixed_size<_Np>>
+struct alignas(__simd_storage_alignment_v<_Tp, _Np>) __simd_storage<_Tp, __fixed_size<_Np>>
 {
   using value_type = _Tp;
 
diff --git a/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h b/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h
new file mode 100644
index 00000000000..6e7cd7caffe
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h
@@ -0,0 +1,93 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_SIMD_INTRINSICS_H
+#define _CUDA_STD___SIMD_SPECIALIZATIONS_SIMD_INTRINSICS_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_CUDA_COMPILATION()
+
+#  include <cuda/std/cstdint>
+
+#  include <nv/target>
+
+#  include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+[[nodiscard]] _CCCL_DEVICE_API inline uint32_t
+__vadd_u16x2([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint32_t __rhs) noexcept
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               (return ::__vadd2(__lhs, __rhs);), //
+               (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_u16x2: Unsupported architecture"); return uint32_t{};));
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline uint32_t
+__vadd_s16x2([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint32_t __rhs) noexcept
+{
+  // prevent MSVC warning
+  NV_IF_TARGET(NV_PROVIDES_SM_90,
+               ({
+                 uint32_t __result{};
+                 asm("add.s16x2 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs));
+                 return __result;
+               }),
+               (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_s16x2: Unsupported architecture"); return uint32_t{};));
+}
+
+#  if _CCCL_HAS_SIMD_8BIT()
+
+[[nodiscard]] _CCCL_DEVICE_API inline uint32_t
+__vadd_u8x4([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint32_t __rhs) noexcept
+{
+#    if _CCCL_HAS_SIMD_8BIT_INTRINSICS()
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_120f,
+               (return ::__vadd4(__lhs, __rhs);), //
+               (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_u8x4: Unsupported architecture"); return uint32_t{};));
+#    else // ^^^ _CCCL_HAS_SIMD_8BIT_INTRINSICS() ^^^ / vvv !_CCCL_HAS_SIMD_8BIT_INTRINSICS() vvv
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_120f,
+               ({
+                 uint32_t __result{};
+                 asm("add.u8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs));
+                 return __result;
+               }),
+               (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_u8x4: Unsupported architecture"); return uint32_t{};));
+#    endif // _CCCL_HAS_SIMD_8BIT()
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline uint32_t
+__vadd_s8x4([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint32_t __rhs) noexcept
+{
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_120f,
+               ({
+                 uint32_t __result{};
+                 asm("add.s8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs));
+                 return __result;
+               }),
+               (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_s8x4: Unsupported architecture"); return uint32_t{};));
+}
+
+#  endif // _CCCL_HAS_SIMD_8BIT()
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#  include <cuda/std/__cccl/epilogue.h>
+#endif // _CCCL_CUDA_COMPILATION()
+#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_SIMD_INTRINSICS_H
diff --git a/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics_array.h b/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics_array.h
new file mode 100644
index 00000000000..aa55459da48
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics_array.h
@@ -0,0 +1,133 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_SIMD_INTRINSICS_ARRAY_H
+#define _CUDA_STD___SIMD_SPECIALIZATIONS_SIMD_INTRINSICS_ARRAY_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__cmath/ceil_div.h>
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/__cstring/memcpy.h>
+#include <cuda/std/__memory/assume_aligned.h>
+#include <cuda/std/__simd/specializations/fixed_size_storage.h>
+#include <cuda/std/__simd/specializations/simd_intrinsics.h>
+#include <cuda/std/__type_traits/is_unsigned.h>
+#include <cuda/std/array>
+#include <cuda/std/cstdint>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+//----------------------------------------------------------------------------------------------------------------------
+// conversion utilities
+
+template <typename _SimdStorage>
+inline constexpr size_t __simd_storage_size_u32 = 0;
+
+template <typename _Tp, __simd_size_type _Np>
+inline constexpr size_t __simd_storage_size_u32<__simd_storage<_Tp, __fixed_size<_Np>>> =
+  ::cuda::ceil_div(_Np, sizeof(uint32_t) / sizeof(_Tp));
+
+template <size_t _Np>
+using __array_u32_t = array<uint32_t, _Np>;
+
+template <typename _SimdStorage>
+using __simd_storage_u32_t = __array_u32_t<__simd_storage_size_u32<_SimdStorage>>;
+
+template <typename _SimdStorage>
+inline constexpr size_t __simd_storage_copy_size_u32 = 0;
+
+template <typename _Tp, __simd_size_type _Np>
+inline constexpr size_t __simd_storage_copy_size_u32<__simd_storage<_Tp, __fixed_size<_Np>>> = _Np * sizeof(_Tp);
+
+template <typename _SimdStorage, typename _SimdStorageU32 = __simd_storage_u32_t<_SimdStorage>>
+[[nodiscard]] _CCCL_API constexpr _SimdStorageU32 __to_unsigned_storage(const _SimdStorage& __s) noexcept
+{
+  _SimdStorageU32 __tmp{};
+  const auto __input_data = ::cuda::std::assume_aligned<alignof(uint32_t)>(__s.__data);
+  ::cuda::std::memcpy(__tmp.data(), __input_data, __simd_storage_copy_size_u32<_SimdStorage>);
+  return __tmp;
+}
+
+template <typename _SimdStorage, typename _SimdStorageU32 = __simd_storage_u32_t<_SimdStorage>>
+[[nodiscard]] _CCCL_API constexpr _SimdStorage __copy_from_unsigned_storage(const _SimdStorageU32& __tmp) noexcept
+{
+  _SimdStorage __result{};
+  const auto __result_ptr = ::cuda::std::assume_aligned<alignof(uint32_t)>(__result.__data);
+  ::cuda::std::memcpy(__result_ptr, __tmp.data(), __simd_storage_copy_size_u32<_SimdStorage>);
+  return __result;
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+// device-only functions
+
+#if _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION()
+
+template <typename _Tp, size_t _Np>
+[[nodiscard]] _CCCL_DEVICE_API constexpr __array_u32_t<_Np>
+__vadd_16bit_x2(const __array_u32_t<_Np>& __lhs_u, const __array_u32_t<_Np>& __rhs_u) noexcept
+{
+  __array_u32_t<_Np> __result_u;
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (size_t __i = 0; __i < _Np; ++__i)
+  {
+    if constexpr (is_unsigned_v<_Tp>)
+    {
+      __result_u[__i] = ::cuda::std::simd::__vadd_u16x2(__lhs_u[__i], __rhs_u[__i]);
+    }
+    else
+    {
+      __result_u[__i] = ::cuda::std::simd::__vadd_s16x2(__lhs_u[__i], __rhs_u[__i]);
+    }
+  }
+  return __result_u;
+}
+
+#  if _CCCL_HAS_SIMD_8BIT()
+
+template <typename _Tp, size_t _Np>
+[[nodiscard]] _CCCL_DEVICE_API constexpr __array_u32_t<_Np>
+__vadd_8bit_x4(const __array_u32_t<_Np>& __lhs_u, const __array_u32_t<_Np>& __rhs_u) noexcept
+{
+  __array_u32_t<_Np> __result_u;
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (size_t __i = 0; __i < _Np; ++__i)
+  {
+    if constexpr (is_unsigned_v<_Tp>)
+    {
+      __result_u[__i] = ::cuda::std::simd::__vadd_u8x4(__lhs_u[__i], __rhs_u[__i]);
+    }
+    else
+    {
+      __result_u[__i] = ::cuda::std::simd::__vadd_s8x4(__lhs_u[__i], __rhs_u[__i]);
+    }
+  }
+  return __result_u;
+}
+
+#  endif // _CCCL_HAS_SIMD_8BIT()
+
+#endif // _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION()
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_SIMD_INTRINSICS_ARRAY_H
diff --git a/libcudacxx/include/cuda/std/__simd/type_traits.h b/libcudacxx/include/cuda/std/__simd/type_traits.h
index 8f350b8b938..3929ac9bb70 100644
--- a/libcudacxx/include/cuda/std/__simd/type_traits.h
+++ b/libcudacxx/include/cuda/std/__simd/type_traits.h
@@ -32,16 +32,19 @@
 
 _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
 
+inline constexpr size_t __optimal_cuda_alignment = _CCCL_CTK_AT_LEAST(12, 9) ? 32 : 16;
+
+// The best alignment for a pointer to a SIMD type is the maximum of the type's alignment and the optimal CUDA
+// alignment.
+template <typename _Tp>
+inline constexpr size_t __simd_pointer_alignment_v = ::cuda::std::max(alignof(_Tp), __optimal_cuda_alignment);
+
 // [simd.traits], alignment
 template <typename _Tp, typename _Up = typename _Tp::value_type>
 struct alignment;
 
 template <typename _Tp, typename _Abi, typename _Up>
-struct alignment<basic_vec<_Tp, _Abi>, _Up>
-    : integral_constant<size_t,
-                        ::cuda::__is_valid_alignment(__simd_size_v<_Tp, _Abi> * alignof(_Up))
-                          ? __simd_size_v<_Tp, _Abi> * alignof(_Up)
-                          : alignof(_Up)>
+struct alignment<basic_vec<_Tp, _Abi>, _Up> : integral_constant<size_t, __simd_pointer_alignment_v<_Up>>
 {
   static_assert(__is_vectorizable_v<_Up>, "U must be a vectorizable type");
 };
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/alignment.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/alignment.pass.cpp
index e237fc28fd3..dc8cd76ddaf 100644
--- a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/alignment.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/alignment.pass.cpp
@@ -26,27 +26,29 @@
 
 namespace simd = cuda::std::simd;
 
-template <typename T, int N, size_t ExpectedAlign = alignof(T) * N>
+inline constexpr size_t optimal_cuda_alignment = _CCCL_CTK_AT_LEAST(12, 9) ? 32 : 16;
+
+template <typename T, int N>
 TEST_FUNC void test_default_u()
 {
   using V = simd::basic_vec<T, simd::fixed_size<N>>;
-  static_assert(simd::alignment<V>::value == ExpectedAlign);
-  static_assert(simd::alignment_v<V> == ExpectedAlign);
+  static_assert(simd::alignment<V>::value == optimal_cuda_alignment);
+  static_assert(simd::alignment_v<V> == optimal_cuda_alignment);
 }
 
-template <typename T, int N, typename U, size_t ExpectedAlign = alignof(U) * N>
+template <typename T, int N, typename U>
 TEST_FUNC void test_explicit_u()
 {
   using V = simd::basic_vec<T, simd::fixed_size<N>>;
-  static_assert(simd::alignment<V, U>::value == ExpectedAlign);
-  static_assert(simd::alignment_v<V, U> == ExpectedAlign);
+  static_assert(simd::alignment<V, U>::value == optimal_cuda_alignment);
+  static_assert(simd::alignment_v<V, U> == optimal_cuda_alignment);
 }
 
 template <typename T>
 TEST_FUNC void test_type()
 {
   test_default_u<T, 1>();
-  test_default_u<T, 3, alignof(T)>();
+  test_default_u<T, 3>();
   test_default_u<T, 2>();
   test_default_u<T, 4>();
   test_default_u<T, 8>();
@@ -73,7 +75,7 @@ TEST_FUNC void test()
 
   // explicit U different from value_type
   test_explicit_u<int, 1, float>();
-  test_explicit_u<int, 3, float, alignof(float)>();
+  test_explicit_u<int, 3, float>();
   test_explicit_u<int, 4, char>();
   test_explicit_u<float, 2, double>();
   test_explicit_u<double, 4, int>();
diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt
index 7a82933ba97..4c073d6069f 100644
--- a/libcudacxx/test/simd_codegen/CMakeLists.txt
+++ b/libcudacxx/test/simd_codegen/CMakeLists.txt
@@ -21,12 +21,17 @@ else()
   return()
 endif()
 
+if ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  message("-- clang-cuda does not support the -dc simd codegen tests")
+  return()
+endif()
+
 find_program(cuobjdump "cuobjdump" REQUIRED)
 find_program(bash "bash" REQUIRED)
 
 set(libcudacxx_simd_codegen_tests)
 if (NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
-  file(GLOB libcudacxx_simd_codegen_tests "*.cu")
+  file(GLOB libcudacxx_simd_codegen_tests "floating_point/*.cu" "integer/*.cu")
 endif()
 
 set(simd_codegen_cuda_archs 80 90)
@@ -34,47 +39,75 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
   list(APPEND simd_codegen_cuda_archs 100 120)
 endif()
 
-function(simd_codegen_add_tests test_path)
-  cmake_path(GET test_path FILENAME test_file)
-  cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name)
+set(simd_codegen_arch_specific_cuda_archs ${simd_codegen_cuda_archs})
+if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9)
+  list(APPEND simd_codegen_arch_specific_cuda_archs 120f)
+endif()
+
+function(simd_codegen_get_check_prefixes out_var test_contents arch)
+  set(check_prefixes "SMXX")
+  set(arch_prefix "SM${arch}")
 
-  file(READ "${test_path}" test_contents)
   string(
     REGEX MATCH
-    "SM[0-9][0-9]*"
-    has_arch_specific_prefix
+    ";[ \t]*${arch_prefix}(:|-)"
+    has_arch_prefix
     "${test_contents}"
   )
-
-  # search for SMXX prefixes in the test contents
-  set(test_archs)
-  if (has_arch_specific_prefix)
-    foreach (arch IN LISTS simd_codegen_cuda_archs)
-      string(FIND "${test_contents}" "SM${arch}" arch_specific_prefix)
-      if (NOT arch_specific_prefix EQUAL -1)
-        list(APPEND test_archs "${arch}")
-      endif()
-    endforeach()
-  else()
-    set(test_archs ${simd_codegen_cuda_archs})
+  if (has_arch_prefix)
+    string(APPEND check_prefixes ",${arch_prefix}")
   endif()
 
-  # Run tests for each architecture specified in the test file
-  foreach (arch IN LISTS test_archs)
-    set(target_name "simd_codegen_sm${arch}_${test_name}")
-    set(check_prefixes "SMXX")
-    if (has_arch_specific_prefix)
-      string(APPEND check_prefixes ",SM${arch}")
+  if (arch MATCHES "^1[0-9][0-9][af]?$")
+    string(REGEX MATCH ";[ \t]*SM1XX(:|-)" has_sm1xx_prefix "${test_contents}")
+    if (has_sm1xx_prefix)
+      string(APPEND check_prefixes ",SM1XX")
     endif()
+  endif()
 
-    add_library(${target_name} STATIC "${test_path}")
+  set(${out_var} "${check_prefixes}" PARENT_SCOPE)
+endfunction()
 
+function(simd_codegen_set_cuda_arch target_name arch)
+  if (arch MATCHES "[af]$")
+    set_target_properties(${target_name} PROPERTIES CUDA_ARCHITECTURES OFF)
+    target_compile_options(
+      ${target_name}
+      PRIVATE "--generate-code=arch=compute_${arch},code=sm_${arch}"
+    )
+  else()
     set_target_properties(
       ${target_name}
       PROPERTIES CUDA_ARCHITECTURES "${arch}"
     )
+  endif()
+endfunction()
+
+function(simd_codegen_add_tests test_path)
+  cmake_path(GET test_path FILENAME test_file)
+  cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name)
+
+  file(READ "${test_path}" test_contents)
+
+  set(test_archs)
+  foreach (arch IN LISTS simd_codegen_arch_specific_cuda_archs)
+    simd_codegen_get_check_prefixes(check_prefixes "${test_contents}" "${arch}")
+    if (NOT "${check_prefixes}" STREQUAL "SMXX")
+      list(APPEND test_archs "${arch}")
+    endif()
+  endforeach()
+  if (NOT test_archs)
+    set(test_archs ${simd_codegen_cuda_archs})
+  endif()
 
-    target_compile_options(${target_name} PRIVATE "-Wno-comment")
+  # Run tests for each architecture specified in the test file
+  foreach (arch IN LISTS test_archs)
+    set(target_name "simd_codegen_sm${arch}_${test_name}")
+    simd_codegen_get_check_prefixes(check_prefixes "${test_contents}" "${arch}")
+
+    add_library(${target_name} STATIC "${test_path}")
+    simd_codegen_set_cuda_arch(${target_name} "${arch}")
+    target_compile_options(${target_name} PRIVATE "-Wno-comment" "-dc")
 
     target_include_directories(
       ${target_name}
diff --git a/libcudacxx/test/simd_codegen/decrement_f32x2.cu b/libcudacxx/test/simd_codegen/floating_point/decrement_f32x2.cu
similarity index 59%
rename from libcudacxx/test/simd_codegen/decrement_f32x2.cu
rename to libcudacxx/test/simd_codegen/floating_point/decrement_f32x2.cu
index 41ec986c9c8..3ddc9e92282 100644
--- a/libcudacxx/test/simd_codegen/decrement_f32x2.cu
+++ b/libcudacxx/test/simd_codegen/floating_point/decrement_f32x2.cu
@@ -9,28 +9,20 @@
 //===----------------------------------------------------------------------===//
 
 #include <cuda/std/__simd_> // IWYU pragma: keep
-#include <cuda/std/array>
 
 namespace simd = cuda::std::simd;
 
-using Vec_f32_4 = simd::basic_vec<float, simd::fixed_size<4>>;
+using Vec_f32_x4 = simd::basic_vec<float, simd::fixed_size<4>>;
 
-extern "C" __global__ void test_operator_decrement_f32_4(const float* in, float* out)
+__device__ Vec_f32_x4 test_operator_decrement_f32_x4(Vec_f32_x4 vec)
 {
-  const cuda::std::array<float, 4> values{in[0], in[1], in[2], in[3]};
-
-  Vec_f32_4 vec(values);
   --vec;
-
-  out[0] = vec[0];
-  out[1] = vec[1];
-  out[2] = vec[2];
-  out[3] = vec[3];
+  return vec;
 }
 
 /*
 
-; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_decrement_f32_4
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_decrement_f32_x4.*}}
 ; SM100: {{.*FADD2.*}}
 ; SM100: {{.*FADD2.*}}
 
diff --git a/libcudacxx/test/simd_codegen/floating_point/fma_bf16.cu b/libcudacxx/test/simd_codegen/floating_point/fma_bf16.cu
new file mode 100644
index 00000000000..31aed77de22
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/floating_point/fma_bf16.cu
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+
+#if _CCCL_HAS_NVBF16()
+
+#  include <cuda_bf16.h>
+
+namespace simd = cuda::std::simd;
+
+using Vec_bf16_x4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>;
+
+__device__ Vec_bf16_x4 test_fma_bf16_x4(Vec_bf16_x4 lhs, Vec_bf16_x4 rhs, Vec_bf16_x4 add)
+{
+  return lhs * rhs + add;
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_fma_bf16_x4.*}}
+; SM80: {{.*HFMA2.*BF16.*}}
+; SM80: {{.*HFMA2.*BF16.*}}
+; SM90: {{.*HFMA2.*BF16.*}}
+; SM90: {{.*HFMA2.*BF16.*}}
+; SM1XX: {{.*HFMA2.*BF16.*}}
+; SM1XX: {{.*HFMA2.*BF16.*}}
+
+*/
+
+#endif // _CCCL_HAS_NVBF16()
diff --git a/libcudacxx/test/simd_codegen/floating_point/fma_f16.cu b/libcudacxx/test/simd_codegen/floating_point/fma_f16.cu
new file mode 100644
index 00000000000..f1e3b235e73
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/floating_point/fma_f16.cu
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+
+#if _CCCL_HAS_NVFP16()
+
+#  include <cuda_fp16.h>
+
+namespace simd = cuda::std::simd;
+
+using Vec_f16_x4 = simd::basic_vec<__half, simd::fixed_size<4>>;
+
+__device__ Vec_f16_x4 test_fma_f16_x4(Vec_f16_x4 lhs, Vec_f16_x4 rhs, Vec_f16_x4 add)
+{
+  return lhs * rhs + add;
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_fma_f16_x4.*}}
+; SM80: {{.*HFMA2.*}}
+; SM80: {{.*HFMA2.*}}
+; SM90: {{.*HFMA2.*}}
+; SM90: {{.*HFMA2.*}}
+; SM1XX: {{.*HFMA2.*}}
+; SM1XX: {{.*HFMA2.*}}
+
+*/
+
+#endif // _CCCL_HAS_NVFP16()
diff --git a/libcudacxx/test/simd_codegen/increment_f32x2.cu b/libcudacxx/test/simd_codegen/floating_point/increment_f32x2.cu
similarity index 59%
rename from libcudacxx/test/simd_codegen/increment_f32x2.cu
rename to libcudacxx/test/simd_codegen/floating_point/increment_f32x2.cu
index 9c696f11e40..d61770821fb 100644
--- a/libcudacxx/test/simd_codegen/increment_f32x2.cu
+++ b/libcudacxx/test/simd_codegen/floating_point/increment_f32x2.cu
@@ -9,28 +9,20 @@
 //===----------------------------------------------------------------------===//
 
 #include <cuda/std/__simd_> // IWYU pragma: keep
-#include <cuda/std/array>
 
 namespace simd = cuda::std::simd;
 
-using Vec_f32_4 = simd::basic_vec<float, simd::fixed_size<4>>;
+using Vec_f32_x4 = simd::basic_vec<float, simd::fixed_size<4>>;
 
-extern "C" __global__ void test_operator_increment_f32_4(const float* in, float* out)
+__device__ Vec_f32_x4 test_operator_increment_f32_x4(Vec_f32_x4 vec)
 {
-  const cuda::std::array<float, 4> values{in[0], in[1], in[2], in[3]};
-
-  Vec_f32_4 vec(values);
   ++vec;
-
-  out[0] = vec[0];
-  out[1] = vec[1];
-  out[2] = vec[2];
-  out[3] = vec[3];
+  return vec;
 }
 
 /*
 
-; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_increment_f32_4
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_increment_f32_x4.*}}
 ; SM100: {{.*FADD2.*}}
 ; SM100: {{.*FADD2.*}}
 
diff --git a/libcudacxx/test/simd_codegen/less_bf16.cu b/libcudacxx/test/simd_codegen/floating_point/less_bf16.cu
similarity index 51%
rename from libcudacxx/test/simd_codegen/less_bf16.cu
rename to libcudacxx/test/simd_codegen/floating_point/less_bf16.cu
index fff63e06d1c..7fec5668f19 100644
--- a/libcudacxx/test/simd_codegen/less_bf16.cu
+++ b/libcudacxx/test/simd_codegen/floating_point/less_bf16.cu
@@ -9,7 +9,6 @@
 //===----------------------------------------------------------------------===//
 
 #include <cuda/std/__simd_> // IWYU pragma: keep
-#include <cuda/std/array>
 
 #if _CCCL_HAS_NVBF16()
 
@@ -17,36 +16,25 @@
 
 namespace simd = cuda::std::simd;
 
-using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>;
+using Vec_bf16_x4  = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>;
+using Mask_bf16_x4 = Vec_bf16_x4::mask_type;
 
-extern "C" __global__ void test_less_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, bool* out)
+__device__ Mask_bf16_x4 test_less_bf16_x4(Vec_bf16_x4 lhs, Vec_bf16_x4 rhs)
 {
-  const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
-  const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
-
-  const Vec_bf16_4 lhs_vec(lhs_values);
-  const Vec_bf16_4 rhs_vec(rhs_values);
-  const auto result = lhs_vec < rhs_vec;
-
-  out[0] = result[0];
-  out[1] = result[1];
-  out[2] = result[2];
-  out[3] = result[3];
+  return lhs < rhs;
 }
 
 /*
 
-; SMXX-LABEL: {{[[:space:]]*}}Function : test_less_bf16_4
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_less_bf16_x4.*}}
 ; SM80: {{.*FSETP\.LT.*}}
 ; SM80: {{.*FSETP\.LT.*}}
 ; SM80: {{.*FSETP\.LT.*}}
 ; SM80: {{.*FSETP\.LT.*}}
 ; SM90: {{.*HSETP2.*BF16.*}}
 ; SM90: {{.*HSETP2.*BF16.*}}
-; SM100: {{.*HSETP2.*BF16.*}}
-; SM100: {{.*HSETP2.*BF16.*}}
-; SM120: {{.*HSETP2.*BF16.*}}
-; SM120: {{.*HSETP2.*BF16.*}}
+; SM1XX: {{.*HSETP2.*BF16.*}}
+; SM1XX: {{.*HSETP2.*BF16.*}}
 
 */
 
diff --git a/libcudacxx/test/simd_codegen/less_f16.cu b/libcudacxx/test/simd_codegen/floating_point/less_f16.cu
similarity index 51%
rename from libcudacxx/test/simd_codegen/less_f16.cu
rename to libcudacxx/test/simd_codegen/floating_point/less_f16.cu
index beef111b99e..62530ad19ce 100644
--- a/libcudacxx/test/simd_codegen/less_f16.cu
+++ b/libcudacxx/test/simd_codegen/floating_point/less_f16.cu
@@ -9,7 +9,6 @@
 //===----------------------------------------------------------------------===//
 
 #include <cuda/std/__simd_> // IWYU pragma: keep
-#include <cuda/std/array>
 
 #if _CCCL_HAS_NVFP16()
 
@@ -17,34 +16,23 @@
 
 namespace simd = cuda::std::simd;
 
-using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>;
+using Vec_f16_x4  = simd::basic_vec<__half, simd::fixed_size<4>>;
+using Mask_f16_x4 = Vec_f16_x4::mask_type;
 
-extern "C" __global__ void test_less_f16_4(const __half* lhs, const __half* rhs, bool* out)
+__device__ Mask_f16_x4 test_less_f16_x4(Vec_f16_x4 lhs, Vec_f16_x4 rhs)
 {
-  const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
-  const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
-
-  const Vec_f16_4 lhs_vec(lhs_values);
-  const Vec_f16_4 rhs_vec(rhs_values);
-  const auto result = lhs_vec < rhs_vec;
-
-  out[0] = result[0];
-  out[1] = result[1];
-  out[2] = result[2];
-  out[3] = result[3];
+  return lhs < rhs;
 }
 
 /*
 
-; SMXX-LABEL: {{[[:space:]]*}}Function : test_less_f16_4
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_less_f16_x4.*}}
 ; SM80: {{.*HSETP2.*}}
 ; SM80: {{.*HSETP2.*}}
 ; SM90: {{.*HSETP2.*}}
 ; SM90: {{.*HSETP2.*}}
-; SM100: {{.*HSETP2.*}}
-; SM100: {{.*HSETP2.*}}
-; SM120: {{.*HSETP2.*}}
-; SM120: {{.*HSETP2.*}}
+; SM1XX: {{.*HSETP2.*}}
+; SM1XX: {{.*HSETP2.*}}
 
 */
 
diff --git a/libcudacxx/test/simd_codegen/floating_point/minus_f32x2.cu b/libcudacxx/test/simd_codegen/floating_point/minus_f32x2.cu
new file mode 100644
index 00000000000..d96f073899e
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/floating_point/minus_f32x2.cu
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+
+namespace simd = cuda::std::simd;
+
+using Vec_f32_x4 = simd::basic_vec<float, simd::fixed_size<4>>;
+
+__device__ Vec_f32_x4 test_operator_minus_f32_x4(Vec_f32_x4 lhs, Vec_f32_x4 rhs)
+{
+  return lhs - rhs;
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_minus_f32_x4.*}}
+; SM100: {{.*FADD2.*}}
+; SM100: {{.*FADD2.*}}
+
+*/
diff --git a/libcudacxx/test/simd_codegen/floating_point/multiplies_bf16.cu b/libcudacxx/test/simd_codegen/floating_point/multiplies_bf16.cu
new file mode 100644
index 00000000000..3e103287e14
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/floating_point/multiplies_bf16.cu
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+
+#if _CCCL_HAS_NVBF16()
+
+#  include <cuda_bf16.h>
+
+namespace simd = cuda::std::simd;
+
+using Vec_bf16_x4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>;
+
+__device__ Vec_bf16_x4 test_operator_multiplies_bf16_x4(Vec_bf16_x4 lhs, Vec_bf16_x4 rhs)
+{
+  return lhs * rhs;
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_multiplies_bf16_x4.*}}
+; SM80: {{.*HFMA2.*BF16.*}}
+; SM80: {{.*HFMA2.*BF16.*}}
+; SM90: {{.*HFMA2.*BF16.*}}
+; SM90: {{.*HMUL2.*BF16.*}}
+; SM1XX: {{.*HFMA2.*BF16.*}}
+; SM1XX: {{.*HMUL2.*BF16.*}}
+
+*/
+
+#endif // _CCCL_HAS_NVBF16()
diff --git a/libcudacxx/test/simd_codegen/multiplies_f16.cu b/libcudacxx/test/simd_codegen/floating_point/multiplies_f16.cu
similarity index 50%
rename from libcudacxx/test/simd_codegen/multiplies_f16.cu
rename to libcudacxx/test/simd_codegen/floating_point/multiplies_f16.cu
index bf721009051..bcd982ffe0f 100644
--- a/libcudacxx/test/simd_codegen/multiplies_f16.cu
+++ b/libcudacxx/test/simd_codegen/floating_point/multiplies_f16.cu
@@ -9,7 +9,6 @@
 //===----------------------------------------------------------------------===//
 
 #include <cuda/std/__simd_> // IWYU pragma: keep
-#include <cuda/std/array>
 
 #if _CCCL_HAS_NVFP16()
 
@@ -17,34 +16,22 @@
 
 namespace simd = cuda::std::simd;
 
-using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>;
+using Vec_f16_x4 = simd::basic_vec<__half, simd::fixed_size<4>>;
 
-extern "C" __global__ void test_operator_multiplies_f16_4(const __half* lhs, const __half* rhs, __half* out)
+__device__ Vec_f16_x4 test_operator_multiplies_f16_x4(Vec_f16_x4 lhs, Vec_f16_x4 rhs)
 {
-  const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
-  const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
-
-  const Vec_f16_4 lhs_vec(lhs_values);
-  const Vec_f16_4 rhs_vec(rhs_values);
-  const Vec_f16_4 result = lhs_vec * rhs_vec;
-
-  out[0] = result[0];
-  out[1] = result[1];
-  out[2] = result[2];
-  out[3] = result[3];
+  return lhs * rhs;
 }
 
 /*
 
-; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_multiplies_f16_4
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_multiplies_f16_x4.*}}
 ; SM80: {{.*(HMUL2|HFMA2).*}}
 ; SM80: {{.*(HMUL2|HFMA2).*}}
 ; SM90: {{.*(HMUL2|HFMA2).*}}
 ; SM90: {{.*(HMUL2|HFMA2).*}}
-; SM100: {{.*(HMUL2|HFMA2).*}}
-; SM100: {{.*(HMUL2|HFMA2).*}}
-; SM120: {{.*(HMUL2|HFMA2).*}}
-; SM120: {{.*(HMUL2|HFMA2).*}}
+; SM1XX: {{.*(HMUL2|HFMA2).*}}
+; SM1XX: {{.*(HMUL2|HFMA2).*}}
 
 */
 
diff --git a/libcudacxx/test/simd_codegen/floating_point/plus_bf16.cu b/libcudacxx/test/simd_codegen/floating_point/plus_bf16.cu
new file mode 100644
index 00000000000..5321f23955d
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/floating_point/plus_bf16.cu
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+
+#if _CCCL_HAS_NVBF16()
+
+#  include <cuda_bf16.h>
+
+namespace simd = cuda::std::simd;
+
+using Vec_bf16_x4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>;
+
+__device__ Vec_bf16_x4 test_operator_plus_bf16_x4(Vec_bf16_x4 lhs, Vec_bf16_x4 rhs)
+{
+  return lhs + rhs;
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_plus_bf16_x4.*}}
+; SM80: {{.*HFMA2.*BF16.*}}
+; SM80: {{.*HFMA2.*BF16.*}}
+; SM90: {{.*HFMA2.*BF16.*}}
+; SM90: {{.*HADD2.*BF16.*}}
+; SM1XX: {{.*HFMA2.*BF16.*}}
+; SM1XX: {{.*HADD2.*BF16.*}}
+
+*/
+
+#endif // _CCCL_HAS_NVBF16()
diff --git a/libcudacxx/test/simd_codegen/plus_f16.cu b/libcudacxx/test/simd_codegen/floating_point/plus_f16.cu
similarity index 53%
rename from libcudacxx/test/simd_codegen/plus_f16.cu
rename to libcudacxx/test/simd_codegen/floating_point/plus_f16.cu
index 4a044a19d1c..a7a4e9120da 100644
--- a/libcudacxx/test/simd_codegen/plus_f16.cu
+++ b/libcudacxx/test/simd_codegen/floating_point/plus_f16.cu
@@ -9,7 +9,6 @@
 //===----------------------------------------------------------------------===//
 
 #include <cuda/std/__simd_> // IWYU pragma: keep
-#include <cuda/std/array>
 
 #if _CCCL_HAS_NVFP16()
 
@@ -17,26 +16,16 @@
 
 namespace simd = cuda::std::simd;
 
-using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>;
+using Vec_f16_x4 = simd::basic_vec<__half, simd::fixed_size<4>>;
 
-extern "C" __global__ void test_operator_plus_f16_4(const __half* lhs, const __half* rhs, __half* out)
+__device__ Vec_f16_x4 test_operator_plus_f16_x4(Vec_f16_x4 lhs, Vec_f16_x4 rhs)
 {
-  const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
-  const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
-
-  const Vec_f16_4 lhs_vec(lhs_values);
-  const Vec_f16_4 rhs_vec(rhs_values);
-  const Vec_f16_4 result = lhs_vec + rhs_vec;
-
-  out[0] = result[0];
-  out[1] = result[1];
-  out[2] = result[2];
-  out[3] = result[3];
+  return lhs + rhs;
 }
 
 /*
 
-; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_f16_4
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_plus_f16_x4.*}}
 ; SMXX: {{.*(HADD2|HFMA2).*}}
 ; SMXX: {{.*(HADD2|HFMA2).*}}
 
diff --git a/libcudacxx/test/simd_codegen/floating_point/plus_f32x2.cu b/libcudacxx/test/simd_codegen/floating_point/plus_f32x2.cu
new file mode 100644
index 00000000000..1979e6bf001
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/floating_point/plus_f32x2.cu
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+
+namespace simd = cuda::std::simd;
+
+using Vec_f32_x4 = simd::basic_vec<float, simd::fixed_size<4>>;
+
+__device__ Vec_f32_x4 test_operator_plus_f32_x4(Vec_f32_x4 lhs, Vec_f32_x4 rhs)
+{
+  return lhs + rhs;
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_plus_f32_x4.*}}
+; SM100: {{.*FADD2.*}}
+; SM100: {{.*FADD2.*}}
+
+*/
diff --git a/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu b/libcudacxx/test/simd_codegen/floating_point/unary_minus_f32x2.cu
similarity index 56%
rename from libcudacxx/test/simd_codegen/unary_minus_f32x2.cu
rename to libcudacxx/test/simd_codegen/floating_point/unary_minus_f32x2.cu
index 02d9c14f18b..a8878b18f71 100644
--- a/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu
+++ b/libcudacxx/test/simd_codegen/floating_point/unary_minus_f32x2.cu
@@ -9,28 +9,19 @@
 //===----------------------------------------------------------------------===//
 
 #include <cuda/std/__simd_> // IWYU pragma: keep
-#include <cuda/std/array>
 
 namespace simd = cuda::std::simd;
 
-using Vec_f32_4 = simd::basic_vec<float, simd::fixed_size<4>>;
+using Vec_f32_x4 = simd::basic_vec<float, simd::fixed_size<4>>;
 
-extern "C" __global__ void test_operator_unary_minus_f32_4(const float* in, float* out)
+__device__ Vec_f32_x4 test_operator_unary_minus_f32_x4(Vec_f32_x4 in)
 {
-  const cuda::std::array<float, 4> values{in[0], in[1], in[2], in[3]};
-
-  const Vec_f32_4 vec(values);
-  const Vec_f32_4 result = -vec;
-
-  out[0] = result[0];
-  out[1] = result[1];
-  out[2] = result[2];
-  out[3] = result[3];
+  return -in;
 }
 
 /*
 
-; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_unary_minus_f32_4
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_unary_minus_f32_x4.*}}
 ; SM100: {{.*FADD2.*}}
 ; SM100: {{.*FADD2.*}}
 
diff --git a/libcudacxx/test/simd_codegen/fma_bf16.cu b/libcudacxx/test/simd_codegen/fma_bf16.cu
deleted file mode 100644
index f608f209e09..00000000000
--- a/libcudacxx/test/simd_codegen/fma_bf16.cu
+++ /dev/null
@@ -1,54 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of libcu++ in the CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#include <cuda/std/__simd_> // IWYU pragma: keep
-#include <cuda/std/array>
-
-#if _CCCL_HAS_NVBF16()
-
-#  include <cuda_bf16.h>
-
-namespace simd = cuda::std::simd;
-
-using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>;
-
-extern "C" __global__ void
-test_fma_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, const __nv_bfloat16* add, __nv_bfloat16* out)
-{
-  const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
-  const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
-  const cuda::std::array<__nv_bfloat16, 4> add_values{add[0], add[1], add[2], add[3]};
-
-  const Vec_bf16_4 lhs_vec(lhs_values);
-  const Vec_bf16_4 rhs_vec(rhs_values);
-  const Vec_bf16_4 add_vec(add_values);
-  const Vec_bf16_4 result = lhs_vec * rhs_vec + add_vec;
-
-  out[0] = result[0];
-  out[1] = result[1];
-  out[2] = result[2];
-  out[3] = result[3];
-}
-
-/*
-
-; SMXX-LABEL: {{[[:space:]]*}}Function : test_fma_bf16_4
-; SM80: {{.*HFMA2.*BF16.*}}
-; SM80: {{.*HFMA2.*BF16.*}}
-; SM90: {{.*HFMA2.*BF16.*}}
-; SM90: {{.*HFMA2.*BF16.*}}
-; SM100: {{.*HFMA2.*BF16.*}}
-; SM100: {{.*HFMA2.*BF16.*}}
-; SM120: {{.*HFMA2.*BF16.*}}
-; SM120: {{.*HFMA2.*BF16.*}}
-
-*/
-
-#endif // _CCCL_HAS_NVBF16()
diff --git a/libcudacxx/test/simd_codegen/fma_f16.cu b/libcudacxx/test/simd_codegen/fma_f16.cu
deleted file mode 100644
index a0dad310ac9..00000000000
--- a/libcudacxx/test/simd_codegen/fma_f16.cu
+++ /dev/null
@@ -1,53 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of libcu++ in the CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#include <cuda/std/__simd_> // IWYU pragma: keep
-#include <cuda/std/array>
-
-#if _CCCL_HAS_NVFP16()
-
-#  include <cuda_fp16.h>
-
-namespace simd = cuda::std::simd;
-
-using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>;
-
-extern "C" __global__ void test_fma_f16_4(const __half* lhs, const __half* rhs, const __half* add, __half* out)
-{
-  const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
-  const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
-  const cuda::std::array<__half, 4> add_values{add[0], add[1], add[2], add[3]};
-
-  const Vec_f16_4 lhs_vec(lhs_values);
-  const Vec_f16_4 rhs_vec(rhs_values);
-  const Vec_f16_4 add_vec(add_values);
-  const Vec_f16_4 result = lhs_vec * rhs_vec + add_vec;
-
-  out[0] = result[0];
-  out[1] = result[1];
-  out[2] = result[2];
-  out[3] = result[3];
-}
-
-/*
-
-; SMXX-LABEL: {{[[:space:]]*}}Function : test_fma_f16_4
-; SM80: {{.*HFMA2.*}}
-; SM80: {{.*HFMA2.*}}
-; SM90: {{.*HFMA2.*}}
-; SM90: {{.*HFMA2.*}}
-; SM100: {{.*HFMA2.*}}
-; SM100: {{.*HFMA2.*}}
-; SM120: {{.*HFMA2.*}}
-; SM120: {{.*HFMA2.*}}
-
-*/
-
-#endif // _CCCL_HAS_NVFP16()
diff --git a/libcudacxx/test/simd_codegen/integer/arithmetic_u16x2.cu b/libcudacxx/test/simd_codegen/integer/arithmetic_u16x2.cu
new file mode 100644
index 00000000000..9d989c7377e
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/integer/arithmetic_u16x2.cu
@@ -0,0 +1,66 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+
+namespace simd = cuda::std::simd;
+
+using Vec_u16_x2 = simd::basic_vec<cuda::std::uint16_t, simd::fixed_size<2>>;
+
+__device__ Vec_u16_x2 test_operator_plus_u16_x2(Vec_u16_x2 lhs, Vec_u16_x2 rhs)
+{
+  return lhs + rhs;
+}
+
+__device__ Vec_u16_x2 test_operator_minus_u16_x2(Vec_u16_x2 lhs, Vec_u16_x2 rhs)
+{
+  return lhs - rhs;
+}
+
+__device__ Vec_u16_x2 test_operator_post_decrement_u16_x2(Vec_u16_x2 in)
+{
+  (void) in--;
+  return in;
+}
+
+__device__ Vec_u16_x2 test_operator_post_increment_u16_x2(Vec_u16_x2 in)
+{
+  (void) in++;
+  return in;
+}
+
+__device__ Vec_u16_x2 test_operator_unary_minus_u16_x2(Vec_u16_x2 in)
+{
+  return -in;
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_unary_minus_u16_x2.*}}
+; SM90: {{.*VIADD.*}}
+; SM1XX: {{.*VIADD.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_post_increment_u16_x2.*}}
+; SM90: {{.*VIADD.*}}
+; SM1XX: {{.*VIADD.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_post_decrement_u16_x2.*}}
+; SM90: {{.*VIADD.*}}
+; SM1XX: {{.*VIADD.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_minus_u16_x2.*}}
+; SM90: {{.*VIADD.*}}
+; SM1XX: {{.*VIADD.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_plus_u16_x2.*}}
+; SM90: {{.*VIADD.*}}
+; SM1XX: {{.*VIADD.*}}
+
+*/
diff --git a/libcudacxx/test/simd_codegen/integer/arithmetic_u8x4.cu b/libcudacxx/test/simd_codegen/integer/arithmetic_u8x4.cu
new file mode 100644
index 00000000000..c836af3cec8
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/integer/arithmetic_u8x4.cu
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+
+namespace simd = cuda::std::simd;
+
+using Vec_u8_x4 = simd::basic_vec<cuda::std::uint8_t, simd::fixed_size<4>>;
+
+__device__ Vec_u8_x4 test_operator_plus_u8_x4(Vec_u8_x4 lhs, Vec_u8_x4 rhs)
+{
+  return lhs + rhs;
+}
+
+__device__ Vec_u8_x4 test_operator_minus_u8_x4(Vec_u8_x4 lhs, Vec_u8_x4 rhs)
+{
+  return lhs - rhs;
+}
+
+__device__ Vec_u8_x4 test_operator_post_decrement_u8_x4(Vec_u8_x4 in)
+{
+  (void) in--;
+  return in;
+}
+
+__device__ Vec_u8_x4 test_operator_post_increment_u8_x4(Vec_u8_x4 in)
+{
+  (void) in++;
+  return in;
+}
+
+__device__ Vec_u8_x4 test_operator_unary_minus_u8_x4(Vec_u8_x4 in)
+{
+  return -in;
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_unary_minus_u8_x4.*}}
+; SM120f: {{.*VIADD.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_post_increment_u8_x4.*}}
+; SM120f: {{.*VIADD.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_post_decrement_u8_x4.*}}
+; SM120f: {{.*VIADD.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_minus_u8_x4.*}}
+; SM120f: {{.*VIADD.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_plus_u8_x4.*}}
+; SM120f: {{.*VIADD.*}}
+
+*/
diff --git a/libcudacxx/test/simd_codegen/integer/bitwise_u16x2_u8x4.cu b/libcudacxx/test/simd_codegen/integer/bitwise_u16x2_u8x4.cu
new file mode 100644
index 00000000000..eaffb5ded08
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/integer/bitwise_u16x2_u8x4.cu
@@ -0,0 +1,95 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+
+namespace simd = cuda::std::simd;
+
+using cuda::std::uint16_t;
+using cuda::std::uint8_t;
+
+using Vec_u16_x2 = simd::basic_vec<uint16_t, simd::fixed_size<2>>;
+using Vec_u8_x4  = simd::basic_vec<uint8_t, simd::fixed_size<4>>;
+
+__device__ void test_bitwise_and_u16_x2(Vec_u16_x2& out, Vec_u16_x2& lhs, Vec_u16_x2& rhs)
+{
+  out = lhs & rhs;
+}
+
+__device__ void test_bitwise_or_u16_x2(Vec_u16_x2& out, Vec_u16_x2& lhs, Vec_u16_x2& rhs)
+{
+  out = lhs | rhs;
+}
+
+__device__ void test_bitwise_xor_u16_x2(Vec_u16_x2& out, Vec_u16_x2& lhs, Vec_u16_x2& rhs)
+{
+  out = lhs ^ rhs;
+}
+
+__device__ void test_bitwise_not_u16_x2(Vec_u16_x2& out, Vec_u16_x2& in)
+{
+  out = ~in;
+}
+
+__device__ void test_bitwise_and_u8_x4(Vec_u8_x4& out, Vec_u8_x4& lhs, Vec_u8_x4& rhs)
+{
+  out = lhs & rhs;
+}
+
+__device__ void test_bitwise_or_u8_x4(Vec_u8_x4& out, Vec_u8_x4& lhs, Vec_u8_x4& rhs)
+{
+  out = lhs | rhs;
+}
+
+__device__ void test_bitwise_xor_u8_x4(Vec_u8_x4& out, Vec_u8_x4& lhs, Vec_u8_x4& rhs)
+{
+  out = lhs ^ rhs;
+}
+
+__device__ void test_bitwise_not_u8_x4(Vec_u8_x4& out, Vec_u8_x4& in)
+{
+  out = ~in;
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_not_u8_x4.*}}
+; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}}
+; SMXX: {{.*LOP3.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_xor_u8_x4.*}}
+; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}}
+; SMXX: {{.*LOP3.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_or_u8_x4.*}}
+; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}}
+; SMXX: {{.*LOP3.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_and_u8_x4.*}}
+; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}}
+; SMXX: {{.*LOP3.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_not_u16_x2.*}}
+; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}}
+; SMXX: {{.*LOP3.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_xor_u16_x2.*}}
+; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}}
+; SMXX: {{.*LOP3.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_or_u16_x2.*}}
+; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}}
+; SMXX: {{.*LOP3.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_and_u16_x2.*}}
+; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}}
+; SMXX: {{.*LOP3.*}}
+
+*/
diff --git a/libcudacxx/test/simd_codegen/minus_f32x2.cu b/libcudacxx/test/simd_codegen/minus_f32x2.cu
deleted file mode 100644
index 2c00e62b9ee..00000000000
--- a/libcudacxx/test/simd_codegen/minus_f32x2.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of libcu++ in the CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#include <cuda/std/__simd_> // IWYU pragma: keep
-#include <cuda/std/array>
-
-namespace simd = cuda::std::simd;
-
-using Vec_f32_4 = simd::basic_vec<float, simd::fixed_size<4>>;
-
-extern "C" __global__ void test_operator_minus_f32_4(const float* lhs, const float* rhs, float* out)
-{
-  const cuda::std::array<float, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
-  const cuda::std::array<float, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
-
-  const Vec_f32_4 lhs_vec(lhs_values);
-  const Vec_f32_4 rhs_vec(rhs_values);
-  const Vec_f32_4 result = lhs_vec - rhs_vec;
-
-  out[0] = result[0];
-  out[1] = result[1];
-  out[2] = result[2];
-  out[3] = result[3];
-}
-
-/*
-
-; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_minus_f32_4
-; SM100: {{.*FADD2.*}}
-; SM100: {{.*FADD2.*}}
-
-*/
diff --git a/libcudacxx/test/simd_codegen/multiplies_bf16.cu b/libcudacxx/test/simd_codegen/multiplies_bf16.cu
deleted file mode 100644
index 3036c4fd526..00000000000
--- a/libcudacxx/test/simd_codegen/multiplies_bf16.cu
+++ /dev/null
@@ -1,52 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of libcu++ in the CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#include <cuda/std/__simd_> // IWYU pragma: keep
-#include <cuda/std/array>
-
-#if _CCCL_HAS_NVBF16()
-
-#  include <cuda_bf16.h>
-
-namespace simd = cuda::std::simd;
-
-using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>;
-
-extern "C" __global__ void
-test_operator_multiplies_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, __nv_bfloat16* out)
-{
-  const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
-  const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
-
-  const Vec_bf16_4 lhs_vec(lhs_values);
-  const Vec_bf16_4 rhs_vec(rhs_values);
-  const Vec_bf16_4 result = lhs_vec * rhs_vec;
-
-  out[0] = result[0];
-  out[1] = result[1];
-  out[2] = result[2];
-  out[3] = result[3];
-}
-
-/*
-
-; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_multiplies_bf16_4
-; SM80: {{.*HFMA2.*BF16.*}}
-; SM80: {{.*HFMA2.*BF16.*}}
-; SM90: {{.*HFMA2.*BF16.*}}
-; SM90: {{.*HMUL2.*BF16.*}}
-; SM100: {{.*HFMA2.*BF16.*}}
-; SM100: {{.*HMUL2.*BF16.*}}
-; SM120: {{.*HFMA2.*BF16.*}}
-; SM120: {{.*HMUL2.*BF16.*}}
-
-*/
-
-#endif // _CCCL_HAS_NVBF16()
diff --git a/libcudacxx/test/simd_codegen/plus_bf16.cu b/libcudacxx/test/simd_codegen/plus_bf16.cu
deleted file mode 100644
index 3d192fdad83..00000000000
--- a/libcudacxx/test/simd_codegen/plus_bf16.cu
+++ /dev/null
@@ -1,52 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of libcu++ in the CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#include <cuda/std/__simd_> // IWYU pragma: keep
-#include <cuda/std/array>
-
-#if _CCCL_HAS_NVBF16()
-
-#  include <cuda_bf16.h>
-
-namespace simd = cuda::std::simd;
-
-using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>;
-
-extern "C" __global__ void
-test_operator_plus_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, __nv_bfloat16* out)
-{
-  const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
-  const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
-
-  const Vec_bf16_4 lhs_vec(lhs_values);
-  const Vec_bf16_4 rhs_vec(rhs_values);
-  const Vec_bf16_4 result = lhs_vec + rhs_vec;
-
-  out[0] = result[0];
-  out[1] = result[1];
-  out[2] = result[2];
-  out[3] = result[3];
-}
-
-/*
-
-; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_bf16_4
-; SM80: {{.*HFMA2.*BF16.*}}
-; SM80: {{.*HFMA2.*BF16.*}}
-; SM90: {{.*HFMA2.*BF16.*}}
-; SM90: {{.*HADD2.*BF16.*}}
-; SM100: {{.*HFMA2.*BF16.*}}
-; SM100: {{.*HADD2.*BF16.*}}
-; SM120: {{.*HFMA2.*BF16.*}}
-; SM120: {{.*HADD2.*BF16.*}}
-
-*/
-
-#endif // _CCCL_HAS_NVBF16()
diff --git a/libcudacxx/test/simd_codegen/plus_f32x2.cu b/libcudacxx/test/simd_codegen/plus_f32x2.cu
deleted file mode 100644
index b0caf15341b..00000000000
--- a/libcudacxx/test/simd_codegen/plus_f32x2.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of libcu++ in the CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#include <cuda/std/__simd_> // IWYU pragma: keep
-#include <cuda/std/array>
-
-namespace simd = cuda::std::simd;
-
-using Vec_f32_4 = simd::basic_vec<float, simd::fixed_size<4>>;
-
-extern "C" __global__ void test_operator_plus_f32_4(const float* lhs, const float* rhs, float* out)
-{
-  const cuda::std::array<float, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
-  const cuda::std::array<float, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
-
-  const Vec_f32_4 lhs_vec(lhs_values);
-  const Vec_f32_4 rhs_vec(rhs_values);
-  const Vec_f32_4 result = lhs_vec + rhs_vec;
-
-  out[0] = result[0];
-  out[1] = result[1];
-  out[2] = result[2];
-  out[3] = result[3];
-}
-
-/*
-
-; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_f32_4
-; SM100: {{.*FADD2.*}}
-; SM100: {{.*FADD2.*}}
-
-*/

From cbb03d925245fc24a6eb988a8413e57c6182b503 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Tue, 19 May 2026 15:43:59 -0700
Subject: [PATCH 2/4] `cuda::simd` Add `saturation_add`

---
 docs/libcudacxx/extended_api.rst              |   1 +
 docs/libcudacxx/extended_api/simd.rst         |  24 ++++
 .../extended_api/simd/saturating_add.rst      |  78 +++++++++++
 .../include/cuda/__simd/saturating_add.h      |  89 +++++++++++++
 .../include/cuda/__simd/simd_intrinsics.h     | 104 +++++++++++++++
 .../cuda/__simd/simd_intrinsics_array.h       |  83 ++++++++++++
 libcudacxx/include/cuda/simd                  |  27 ++++
 .../include/cuda/std/__internal/features.h    |  12 ++
 .../include/cuda/std/__internal/namespaces.h  |   2 +
 .../include/cuda/std/__simd/basic_vec.h       |   2 -
 .../simd/simd.non_std/saturation_add.pass.cpp | 124 ++++++++++++++++++
 libcudacxx/test/simd_codegen/CMakeLists.txt   |  10 ++
 .../saturation_add/saturating_add.cu          |  54 ++++++++
 13 files changed, 608 insertions(+), 2 deletions(-)
 create mode 100644 docs/libcudacxx/extended_api/simd.rst
 create mode 100644 docs/libcudacxx/extended_api/simd/saturating_add.rst
 create mode 100644 libcudacxx/include/cuda/__simd/saturating_add.h
 create mode 100644 libcudacxx/include/cuda/__simd/simd_intrinsics.h
 create mode 100644 libcudacxx/include/cuda/__simd/simd_intrinsics_array.h
 create mode 100644 libcudacxx/include/cuda/simd
 create mode 100644 libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/saturation_add.pass.cpp
 create mode 100644 libcudacxx/test/simd_codegen/saturation_add/saturating_add.cu

diff --git a/docs/libcudacxx/extended_api.rst b/docs/libcudacxx/extended_api.rst
index a2416b09f94..dbebf3e60e3 100644
--- a/docs/libcudacxx/extended_api.rst
+++ b/docs/libcudacxx/extended_api.rst
@@ -20,6 +20,7 @@ Extended API
    extended_api/type_traits
    extended_api/vector_tuple_protocol
    extended_api/numeric
+   extended_api/simd
    extended_api/random
    extended_api/memory
    extended_api/memory_resource
diff --git a/docs/libcudacxx/extended_api/simd.rst b/docs/libcudacxx/extended_api/simd.rst
new file mode 100644
index 00000000000..080a3b48331
--- /dev/null
+++ b/docs/libcudacxx/extended_api/simd.rst
@@ -0,0 +1,24 @@
+.. _libcudacxx-extended-api-simd:
+
+SIMD
+====
+
+.. toctree::
+   :hidden:
+   :maxdepth: 1
+
+   simd/saturating_add
+
+.. list-table::
+   :widths: 25 45 30 30
+   :header-rows: 1
+
+   * - **Header**
+     - **Content**
+     - **CCCL Availability**
+     - **CUDA Toolkit Availability**
+
+   * - :ref:`cuda::simd::saturating_add <libcudacxx-extended-api-simd-saturating-add>`
+     - Performs element-wise saturating addition of two ``basic_vec`` objects
+     - CCCL 3.5.0
+     - CUDA 13.5
diff --git a/docs/libcudacxx/extended_api/simd/saturating_add.rst b/docs/libcudacxx/extended_api/simd/saturating_add.rst
new file mode 100644
index 00000000000..4f34f582256
--- /dev/null
+++ b/docs/libcudacxx/extended_api/simd/saturating_add.rst
@@ -0,0 +1,78 @@
+.. _libcudacxx-extended-api-simd-saturating-add:
+
+``cuda::simd::saturating_add``
+==============================
+
+Defined in the ``<cuda/simd>`` header.
+
+.. code:: cuda
+
+   namespace cuda::simd {
+
+   template <class T, class Abi>
+   [[nodiscard]] __host__ __device__ constexpr
+   cuda::std::simd::basic_vec<T, Abi> saturating_add(
+     const cuda::std::simd::basic_vec<T, Abi>& lhs,
+     const cuda::std::simd::basic_vec<T, Abi>& rhs) noexcept;
+
+   } // namespace cuda::simd
+
+The function ``cuda::simd::saturating_add`` performs element-wise saturating addition of two ``cuda::std::simd::basic_vec`` objects.
+
+For each element ``i`` in the input vectors, the result is equivalent to:
+
+.. code:: cuda
+
+   cuda::std::saturating_add(lhs[i], rhs[i])
+
+**Parameters**
+
+- ``lhs``: The left-hand side input vector.
+- ``rhs``: The right-hand side input vector.
+
+**Return value**
+
+Returns a ``cuda::std::simd::basic_vec<T, Abi>`` where each element contains the saturated sum of the corresponding elements in ``lhs`` and ``rhs``.
+
+**Constraints**
+
+- ``T`` must be an `integer type <https://eel.is/c++draft/basic.fundamental#1>`__.
+
+**Performance considerations**
+
+- Packed 8-bit and 16-bit integer vectors perform saturating addition using ``VIADD.S8x4``, ``VIADD.U8x4``, ``VIADD.S16x2``, ``VIADD.U16x2`` on ``SM120f``.
+
+Example
+-------
+
+.. code:: cuda
+
+    #include <cuda/simd>
+    #include <cuda/std/array>
+    #include <cuda/std/cassert>
+    #include <cuda/std/cstdint>
+
+    namespace simd = cuda::std::simd;
+
+    __global__ void kernel()
+    {
+        using vec_t = simd::basic_vec<uint8_t, simd::fixed_size<4>>;
+
+        cuda::std::array<uint8_t, 4> lhs_values{250, 10, 20, 30};
+        cuda::std::array<uint8_t, 4> rhs_values{10, 20, 30, 40};
+        vec_t lhs(lhs_values);
+        vec_t rhs(rhs_values);
+
+        vec_t result = cuda::simd::saturating_add(lhs, rhs);
+
+        assert(result[0] == 255);
+        assert(result[1] == 30);
+        assert(result[2] == 50);
+        assert(result[3] == 70);
+    }
+
+    int main()
+    {
+        kernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+    }
diff --git a/libcudacxx/include/cuda/__simd/saturating_add.h b/libcudacxx/include/cuda/__simd/saturating_add.h
new file mode 100644
index 00000000000..c5824aeffef
--- /dev/null
+++ b/libcudacxx/include/cuda/__simd/saturating_add.h
@@ -0,0 +1,89 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___SIMD_SATURATING_ADD_H
+#define _CUDA___SIMD_SATURATING_ADD_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__internal/features.h>
+#include <cuda/std/__numeric/saturating_add.h>
+#include <cuda/std/__simd/basic_vec.h>
+#include <cuda/std/__type_traits/is_integer.h>
+#if _CCCL_HAS_SIMD_SAT()
+#  include <cuda/__simd/simd_intrinsics_array.h>
+#  include <cuda/std/__simd/specializations/fixed_size_storage.h>
+#  include <cuda/std/__simd/specializations/simd_intrinsics_array.h>
+#endif // _CCCL_HAS_SIMD_SAT()
+
+#include <nv/target>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_SIMD
+
+_CCCL_TEMPLATE(typename _Tp, typename _Abi)
+_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>)
+[[nodiscard]] _CCCL_API constexpr ::cuda::std::simd::basic_vec<_Tp, _Abi> saturating_add(
+  const ::cuda::std::simd::basic_vec<_Tp, _Abi>& __lhs, const ::cuda::std::simd::basic_vec<_Tp, _Abi>& __rhs) noexcept
+{
+  using __basic_vec_t    = ::cuda::std::simd::basic_vec<_Tp, _Abi>;
+  using __simd_storage_t = typename __basic_vec_t::_Storage;
+  constexpr auto __size  = __basic_vec_t::__size;
+
+#if _CCCL_HAS_SIMD_SAT()
+  _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+  {
+    if constexpr (sizeof(_Tp) == 1 || sizeof(_Tp) == 2)
+    {
+      NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({
+                     using __unsigned_storage_t = ::cuda::std::simd::__simd_storage_u32_t<__simd_storage_t>;
+                     const auto __lhs_u         = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_);
+                     const auto __rhs_u         = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_);
+                     __unsigned_storage_t __result_u{};
+                     if constexpr (sizeof(_Tp) == 2)
+                     {
+                       __result_u = ::cuda::simd::__vadd_sat_16bit_x2<_Tp>(__lhs_u, __rhs_u);
+                     }
+                     else
+                     {
+                       __result_u = ::cuda::simd::__vadd_sat_8bit_x4<_Tp>(__lhs_u, __rhs_u);
+                     }
+                     const auto __result_s =
+                       ::cuda::std::simd::__copy_from_unsigned_storage<__simd_storage_t>(__result_u);
+                     return __basic_vec_t{__result_s, __basic_vec_t::__storage_tag};
+                   }));
+    }
+  }
+#endif // _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION() && _CCCL_HAS_SIMD_SAT()
+
+  __simd_storage_t __result{};
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (::cuda::std::simd::__simd_size_type __i = 0; __i < __size; ++__i)
+  {
+    __result.__data[__i] = ::cuda::std::saturating_add(__lhs.__s_.__data[__i], __rhs.__s_.__data[__i]);
+  }
+  return __basic_vec_t{__result, __basic_vec_t::__storage_tag};
+}
+
+_CCCL_END_NAMESPACE_CUDA_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA___SIMD_SATURATING_ADD_H
diff --git a/libcudacxx/include/cuda/__simd/simd_intrinsics.h b/libcudacxx/include/cuda/__simd/simd_intrinsics.h
new file mode 100644
index 00000000000..cc241740520
--- /dev/null
+++ b/libcudacxx/include/cuda/__simd/simd_intrinsics.h
@@ -0,0 +1,104 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___SIMD_SIMD_INTRINSICS_H
+#define _CUDA___SIMD_SIMD_INTRINSICS_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_HAS_SIMD_SAT()
+
+#  include <cuda/std/__internal/features.h>
+#  include <cuda/std/cstdint>
+
+#  include <nv/target>
+
+#  include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_SIMD
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_u16x2(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept
+{
+#  if _CCCL_HAS_SIMD_SAT_INTRINSICS()
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddus2(__lhs, __rhs);))
+#  elif _CCCL_HAS_SIMD_SAT_PTX()
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({
+                 ::cuda::std::uint32_t __result{};
+                 asm("add.sat.u16x2 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs));
+                 return __result;
+               }))
+#  endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_u16x2: Unsupported architecture");
+  return ::cuda::std::uint32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_s16x2(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept
+{
+#  if _CCCL_HAS_SIMD_SAT_INTRINSICS()
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddss2(__lhs, __rhs);))
+#  elif _CCCL_HAS_SIMD_SAT_PTX()
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({
+                 ::cuda::std::uint32_t __result{};
+                 asm("add.sat.s16x2 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs));
+                 return __result;
+               }))
+#  endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_s16x2: Unsupported architecture");
+  return ::cuda::std::uint32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_u8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept
+{
+#  if _CCCL_HAS_SIMD_SAT_INTRINSICS()
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddus4(__lhs, __rhs);))
+#  elif _CCCL_HAS_SIMD_SAT_PTX()
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({
+                 ::cuda::std::uint32_t __result{};
+                 asm("add.sat.u8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs));
+                 return __result;
+               }))
+#  endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_u8x4: Unsupported architecture");
+  return ::cuda::std::uint32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_s8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept
+{
+#  if _CCCL_HAS_SIMD_SAT_INTRINSICS()
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddss4(__lhs, __rhs);))
+#  elif _CCCL_HAS_SIMD_SAT_PTX()
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({
+                 ::cuda::std::uint32_t __result{};
+                 asm("add.sat.s8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs));
+                 return __result;
+               }))
+#  endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_s8x4: Unsupported architecture");
+  return ::cuda::std::uint32_t{};
+}
+
+_CCCL_END_NAMESPACE_CUDA_SIMD
+
+#  include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CCCL_HAS_SIMD_SAT()
+#endif // _CUDA___SIMD_SIMD_INTRINSICS_H
diff --git a/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h b/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h
new file mode 100644
index 00000000000..71fd0613702
--- /dev/null
+++ b/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h
@@ -0,0 +1,83 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___SIMD_SIMD_INTRINSICS_ARRAY_H
+#define _CUDA___SIMD_SIMD_INTRINSICS_ARRAY_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_HAS_SIMD_SAT()
+
+#  include <cuda/__simd/simd_intrinsics.h>
+#  include <cuda/std/__cstddef/types.h>
+#  include <cuda/std/__simd/specializations/simd_intrinsics_array.h>
+#  include <cuda/std/__type_traits/is_unsigned.h>
+
+#  include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_SIMD
+
+//----------------------------------------------------------------------------------------------------------------------
+// device-only functions
+
+template <typename _Tp, ::cuda::std::size_t _Np>
+[[nodiscard]] _CCCL_DEVICE_API constexpr ::cuda::std::simd::__array_u32_t<_Np> __vadd_sat_16bit_x2(
+  const ::cuda::std::simd::__array_u32_t<_Np>& __lhs_u, const ::cuda::std::simd::__array_u32_t<_Np>& __rhs_u) noexcept
+{
+  ::cuda::std::simd::__array_u32_t<_Np> __result_u;
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (::cuda::std::size_t __i = 0; __i < _Np; ++__i)
+  {
+    if constexpr (::cuda::std::is_unsigned_v<_Tp>)
+    {
+      __result_u[__i] = ::cuda::simd::__vadd_sat_u16x2(__lhs_u[__i], __rhs_u[__i]);
+    }
+    else
+    {
+      __result_u[__i] = ::cuda::simd::__vadd_sat_s16x2(__lhs_u[__i], __rhs_u[__i]);
+    }
+  }
+  return __result_u;
+}
+
+template <typename _Tp, ::cuda::std::size_t _Np>
+[[nodiscard]] _CCCL_DEVICE_API constexpr ::cuda::std::simd::__array_u32_t<_Np> __vadd_sat_8bit_x4(
+  const ::cuda::std::simd::__array_u32_t<_Np>& __lhs_u, const ::cuda::std::simd::__array_u32_t<_Np>& __rhs_u) noexcept
+{
+  ::cuda::std::simd::__array_u32_t<_Np> __result_u;
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (::cuda::std::size_t __i = 0; __i < _Np; ++__i)
+  {
+    if constexpr (::cuda::std::is_unsigned_v<_Tp>)
+    {
+      __result_u[__i] = ::cuda::simd::__vadd_sat_u8x4(__lhs_u[__i], __rhs_u[__i]);
+    }
+    else
+    {
+      __result_u[__i] = ::cuda::simd::__vadd_sat_s8x4(__lhs_u[__i], __rhs_u[__i]);
+    }
+  }
+  return __result_u;
+}
+
+_CCCL_END_NAMESPACE_CUDA_SIMD
+
+#  include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CCCL_HAS_SIMD_SAT()
+#endif // _CUDA___SIMD_SIMD_INTRINSICS_ARRAY_H
diff --git a/libcudacxx/include/cuda/simd b/libcudacxx/include/cuda/simd
new file mode 100644
index 00000000000..327ba22f527
--- /dev/null
+++ b/libcudacxx/include/cuda/simd
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_SIMD
+#define _CUDA_SIMD
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__simd/saturating_add.h>
+#include <cuda/std/__simd_>
+
+#endif // _CUDA_SIMD
diff --git a/libcudacxx/include/cuda/std/__internal/features.h b/libcudacxx/include/cuda/std/__internal/features.h
index 3b8d43c521f..a9b01dfc274 100644
--- a/libcudacxx/include/cuda/std/__internal/features.h
+++ b/libcudacxx/include/cuda/std/__internal/features.h
@@ -111,6 +111,18 @@
 #define _CCCL_HAS_SIMD_8BIT() \
   ((_CCCL_HAS_SIMD_8BIT_PTX() || _CCCL_HAS_SIMD_8BIT_INTRINSICS()) && !_CCCL_TILE_COMPILATION())
 
+// TODO(fbusato): CTK 13.2 produces non-optimal code for SIMD SAT intrinsics
+#define _CCCL_HAS_SIMD_SAT_INTRINSICS() 0
+#define _CCCL_HAS_SIMD_SAT_PTX()        (__cccl_ptx_isa >= 920ULL)
+#define _CCCL_HAS_SIMD_SAT() \
+  (_CCCL_HAS_SIMD_SAT_PTX() || _CCCL_HAS_SIMD_SAT_INTRINSICS()) && _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION()
+
+// TODO(fbusato): CTK 13.2 produces non-optimal code for SIMD SAT intrinsics
+#define _CCCL_HAS_SIMD_SAT_INTRINSICS() 0
+#define _CCCL_HAS_SIMD_SAT_PTX()        (__cccl_ptx_isa >= 920ULL)
+#define _CCCL_HAS_SIMD_SAT() \
+  (_CCCL_HAS_SIMD_SAT_PTX() || _CCCL_HAS_SIMD_SAT_INTRINSICS()) && _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION()
+
 // Third party libraries
 
 #if (__has_include(<dlpack/dlpack.h>) || __has_include(<dlpack.h>)) && \
diff --git a/libcudacxx/include/cuda/std/__internal/namespaces.h b/libcudacxx/include/cuda/std/__internal/namespaces.h
index 2cca27fc6be..1e3b8a74004 100644
--- a/libcudacxx/include/cuda/std/__internal/namespaces.h
+++ b/libcudacxx/include/cuda/std/__internal/namespaces.h
@@ -80,6 +80,8 @@
 // Namespaces related to <simd>
 #define _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD _CCCL_BEGIN_NAMESPACE(cuda::std::simd)
 #define _CCCL_END_NAMESPACE_CUDA_STD_SIMD   _CCCL_END_NAMESPACE(cuda::std::simd)
+#define _CCCL_BEGIN_NAMESPACE_CUDA_SIMD     _CCCL_BEGIN_NAMESPACE(cuda::simd)
+#define _CCCL_END_NAMESPACE_CUDA_SIMD       _CCCL_END_NAMESPACE(cuda::simd)
 
 // Namespaces related to <ranges>
 #define _CCCL_BEGIN_NAMESPACE_CUDA_STD_RANGES _CCCL_BEGIN_NAMESPACE(cuda::std::ranges)
diff --git a/libcudacxx/include/cuda/std/__simd/basic_vec.h b/libcudacxx/include/cuda/std/__simd/basic_vec.h
index 37824dbc48c..7d0796f779d 100644
--- a/libcudacxx/include/cuda/std/__simd/basic_vec.h
+++ b/libcudacxx/include/cuda/std/__simd/basic_vec.h
@@ -70,7 +70,6 @@ class basic_vec<_Tp, _Abi, enable_if_t<__is_vectorizable_v<_Tp> && __is_enabled_
   using value_type = _Tp;
   using mask_type  = basic_mask<sizeof(value_type), _Abi>;
 
-private:
   template <size_t, typename, typename>
   friend class basic_mask;
 
@@ -100,7 +99,6 @@ class basic_vec<_Tp, _Abi, enable_if_t<__is_vectorizable_v<_Tp> && __is_enabled_
     __s_.__set(__i, __v);
   }
 
-public:
   using abi_type = _Abi;
 
   using iterator       = __simd_iterator<basic_vec>;
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/saturation_add.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/saturation_add.pass.cpp
new file mode 100644
index 00000000000..16f77eb8d3b
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/saturation_add.pass.cpp
@@ -0,0 +1,124 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/__simd/saturating_add.h>
+
+// template<class T, class Abi>
+//   constexpr basic_vec<T, Abi> cuda::simd::saturating_add(
+//     const basic_vec<T, Abi>& lhs, const basic_vec<T, Abi>& rhs) noexcept;
+
+#include <cuda/simd>
+#include <cuda/std/array>
+#include <cuda/std/cassert>
+#include <cuda/std/cstdint>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+#include <cuda/std/utility>
+
+#include "test_macros.h"
+
+namespace simd = cuda::std::simd;
+
+template <typename T, int N>
+using fixed_size_vec = simd::basic_vec<T, simd::fixed_size<N>>;
+
+template <typename Vec, typename = void>
+inline constexpr bool has_saturating_add = false;
+
+template <typename Vec>
+inline constexpr bool has_saturating_add<
+  Vec,
+  cuda::std::void_t<decltype(cuda::simd::saturating_add(cuda::std::declval<Vec>(), cuda::std::declval<Vec>()))>> = true;
+
+template <typename T, int N>
+TEST_FUNC constexpr void test_values(cuda::std::array<T, N> lhs_values, cuda::std::array<T, N> rhs_values)
+{
+  using Vec = simd::basic_vec<T, simd::fixed_size<N>>;
+  Vec lhs(lhs_values);
+  Vec rhs(rhs_values);
+
+  static_assert(cuda::std::is_same_v<decltype(cuda::simd::saturating_add(lhs, rhs)), Vec>);
+  static_assert(noexcept(cuda::simd::saturating_add(lhs, rhs)));
+
+  Vec result = cuda::simd::saturating_add(lhs, rhs);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(result[i] == cuda::std::saturating_add(lhs_values[i], rhs_values[i]));
+  }
+}
+
+template <typename T, int N>
+TEST_FUNC constexpr void test_size()
+{
+  constexpr auto min_val = cuda::std::numeric_limits<T>::min();
+  constexpr auto max_val = cuda::std::numeric_limits<T>::max();
+
+  if constexpr (cuda::std::is_signed_v<T>)
+  {
+    cuda::std::array<T, N> lhs_values{max_val, min_val, T{10}};
+    cuda::std::array<T, N> rhs_values{T{1}, T{-1}, T{-20}};
+    if constexpr (N > 3)
+    {
+      lhs_values[3] = T{-20};
+      rhs_values[3] = T{10};
+    }
+    test_values<T, N>(lhs_values, rhs_values);
+  }
+  else
+  {
+    cuda::std::array<T, N> lhs_values{max_val, min_val, T{10}};
+    cuda::std::array<T, N> rhs_values{T{1}, T{1}, T{20}};
+    if constexpr (N > 3)
+    {
+      lhs_values[3] = T{20};
+      rhs_values[3] = T{30};
+    }
+    test_values<T, N>(lhs_values, rhs_values);
+  }
+}
+
+template <typename T>
+TEST_FUNC constexpr void test()
+{
+  test_size<T, 4>();
+  test_size<T, 3>();
+}
+
+TEST_FUNC constexpr bool test_all()
+{
+  static_assert(!has_saturating_add<fixed_size_vec<float, 4>>);
+
+  test<signed char>();
+  test<signed short>();
+  test<signed int>();
+  test<signed long>();
+  test<signed long long>();
+#if _CCCL_HAS_INT128()
+  test<__int128_t>();
+#endif // _CCCL_HAS_INT128()
+
+  test<unsigned char>();
+  test<unsigned short>();
+  test<unsigned int>();
+  test<unsigned long>();
+  test<unsigned long long>();
+#if _CCCL_HAS_INT128()
+  test<__uint128_t>();
+#endif // _CCCL_HAS_INT128()
+
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test_all());
+  static_assert(test_all());
+  return 0;
+}
diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt
index 4c073d6069f..79b98ac4693 100644
--- a/libcudacxx/test/simd_codegen/CMakeLists.txt
+++ b/libcudacxx/test/simd_codegen/CMakeLists.txt
@@ -32,6 +32,16 @@ find_program(bash "bash" REQUIRED)
 set(libcudacxx_simd_codegen_tests)
 if (NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
   file(GLOB libcudacxx_simd_codegen_tests "floating_point/*.cu" "integer/*.cu")
+  if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.2)
+    file(
+      GLOB libcudacxx_simd_codegen_saturation_add_tests
+      "saturation_add/*.cu"
+    )
+    list(
+      APPEND libcudacxx_simd_codegen_tests
+      ${libcudacxx_simd_codegen_saturation_add_tests}
+    )
+  endif()
 endif()
 
 set(simd_codegen_cuda_archs 80 90)
diff --git a/libcudacxx/test/simd_codegen/saturation_add/saturating_add.cu b/libcudacxx/test/simd_codegen/saturation_add/saturating_add.cu
new file mode 100644
index 00000000000..193079e6fac
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/saturation_add/saturating_add.cu
@@ -0,0 +1,54 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/simd> // IWYU pragma: keep
+
+namespace simd = cuda::std::simd;
+
+using Vec_s16_x2 = simd::basic_vec<cuda::std::int16_t, simd::fixed_size<2>>;
+using Vec_s8_x4  = simd::basic_vec<cuda::std::int8_t, simd::fixed_size<4>>;
+using Vec_u16_x2 = simd::basic_vec<cuda::std::uint16_t, simd::fixed_size<2>>;
+using Vec_u8_x4  = simd::basic_vec<cuda::std::uint8_t, simd::fixed_size<4>>;
+
+__device__ Vec_u16_x2 test_saturating_add_u16_x2(Vec_u16_x2 lhs, Vec_u16_x2 rhs)
+{
+  return cuda::simd::saturating_add(lhs, rhs);
+}
+
+__device__ Vec_s16_x2 test_saturating_add_s16_x2(Vec_s16_x2 lhs, Vec_s16_x2 rhs)
+{
+  return cuda::simd::saturating_add(lhs, rhs);
+}
+
+__device__ Vec_u8_x4 test_saturating_add_u8_x4(Vec_u8_x4 lhs, Vec_u8_x4 rhs)
+{
+  return cuda::simd::saturating_add(lhs, rhs);
+}
+
+__device__ Vec_s8_x4 test_saturating_add_s8_x4(Vec_s8_x4 lhs, Vec_s8_x4 rhs)
+{
+  return cuda::simd::saturating_add(lhs, rhs);
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_saturating_add_s8_x4.*}}
+; SM120f: {{.*VIADD\.S8x4\.ISAT.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_saturating_add_u8_x4.*}}
+; SM120f: {{.*VIADD\.U8x4\.ISAT.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_saturating_add_s16_x2.*}}
+; SM120f: {{.*VIADD\.S16x2\.ISAT.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_saturating_add_u16_x2.*}}
+; SM120f: {{.*VIADD\.16x2\.ISAT.*}}
+
+*/

From f690b2e444464764b1e2ad73dda085ebc02094db Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Tue, 19 May 2026 15:44:00 -0700
Subject: [PATCH 3/4] `cuda::simd` Add `abs_diff`

---
 docs/libcudacxx/extended_api/simd.rst         |   6 +
 .../libcudacxx/extended_api/simd/abs_diff.rst |  84 +++++++++++
 .../include/cuda/__simd/simd_intrinsics.h     |  64 +++++++--
 .../cuda/__simd/simd_intrinsics_array.h       |  34 ++++-
 libcudacxx/include/cuda/__simd/vabsdiff.h     |  85 +++++++++++
 libcudacxx/include/cuda/simd                  |   1 +
 .../include/cuda/std/__internal/features.h    |  16 ++-
 .../__simd/specializations/simd_intrinsics.h  |  37 +++--
 .../simd/simd.non_std/vabsdiff.pass.cpp       | 133 ++++++++++++++++++
 libcudacxx/test/simd_codegen/CMakeLists.txt   |   7 +-
 .../test/simd_codegen/vabsdiff/vabsdiff.cu    |  44 ++++++
 11 files changed, 473 insertions(+), 38 deletions(-)
 create mode 100644 docs/libcudacxx/extended_api/simd/abs_diff.rst
 create mode 100644 libcudacxx/include/cuda/__simd/vabsdiff.h
 create mode 100644 libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/vabsdiff.pass.cpp
 create mode 100644 libcudacxx/test/simd_codegen/vabsdiff/vabsdiff.cu

diff --git a/docs/libcudacxx/extended_api/simd.rst b/docs/libcudacxx/extended_api/simd.rst
index 080a3b48331..524b1bb8cb1 100644
--- a/docs/libcudacxx/extended_api/simd.rst
+++ b/docs/libcudacxx/extended_api/simd.rst
@@ -8,6 +8,7 @@ SIMD
    :maxdepth: 1
 
    simd/saturating_add
+   simd/abs_diff
 
 .. list-table::
    :widths: 25 45 30 30
@@ -22,3 +23,8 @@ SIMD
      - Performs element-wise saturating addition of two ``basic_vec`` objects
      - CCCL 3.5.0
      - CUDA 13.5
+
+   * - :ref:`cuda::simd::abs_diff <libcudacxx-extended-api-simd-abs-diff>`
+     - Performs element-wise absolute difference of two integer ``basic_vec`` objects
+     - CCCL 3.5.0
+     - CUDA 13.5
diff --git a/docs/libcudacxx/extended_api/simd/abs_diff.rst b/docs/libcudacxx/extended_api/simd/abs_diff.rst
new file mode 100644
index 00000000000..2bdc4011fa3
--- /dev/null
+++ b/docs/libcudacxx/extended_api/simd/abs_diff.rst
@@ -0,0 +1,84 @@
+.. _libcudacxx-extended-api-simd-abs-diff:
+
+``cuda::simd::abs_diff``
+========================
+
+Defined in the ``<cuda/simd>`` header.
+
+.. code:: cuda
+
+   namespace cuda::simd {
+
+   template <class T, class Abi>
+   [[nodiscard]] __host__ __device__ constexpr
+   cuda::std::simd::basic_vec<cuda::std::make_unsigned_t<T>, Abi> abs_diff(
+     const cuda::std::simd::basic_vec<T, Abi>& lhs,
+     const cuda::std::simd::basic_vec<T, Abi>& rhs) noexcept;
+
+   } // namespace cuda::simd
+
+The function ``cuda::simd::abs_diff`` performs element-wise absolute difference of two integer ``cuda::std::simd::basic_vec`` objects.
+
+For each element ``i`` in the input vectors, the result is equivalent to:
+
+.. code:: cuda
+
+   abs(lhs[i] - rhs[i])
+
+The return type is always an *unsigned* ``basic_vec`` with the same ABI as the input vectors.
+
+**Parameters**
+
+- ``lhs``: The left-hand side input vector.
+- ``rhs``: The right-hand side input vector.
+
+**Return value**
+
+Returns a ``cuda::std::simd::basic_vec<cuda::std::make_unsigned_t<T>, Abi>`` where each element contains the unsigned absolute difference of the corresponding elements in ``lhs`` and ``rhs``.
+
+**Constraints**
+
+- ``T`` must be an integer type.
+
+**Performance considerations**
+
+- Packed 8-bit integer vectors perform absolute difference using:
+
+  - ``VABSDIFF4`` on ``SM80``, ``SM86``, ``SM87``, ``SM89``, ``SM90``, ``SM100``, ``SM103``, and ``SM110``.
+  - ``VIMNMX.S8x4/U8x4`` on ``SM120f``.
+
+Example
+-------
+
+.. code:: cuda
+
+    #include <cuda/simd>
+    #include <cuda/std/array>
+    #include <cuda/std/cassert>
+    #include <cuda/std/cstdint>
+
+    namespace simd = cuda::std::simd;
+
+    __global__ void kernel()
+    {
+        using vec_t        = simd::basic_vec<cuda::std::int8_t, simd::fixed_size<4>>;
+        using result_vec_t = simd::basic_vec<cuda::std::uint8_t, simd::fixed_size<4>>;
+
+        cuda::std::array<cuda::std::int8_t, 4> lhs_values{-128, 10, 20, 30};
+        cuda::std::array<cuda::std::int8_t, 4> rhs_values{127, 20, -30, 40};
+
+        vec_t lhs(lhs_values);
+        vec_t rhs(rhs_values);
+        result_vec_t result = cuda::simd::abs_diff(lhs, rhs);
+
+        assert(result[0] == 255);
+        assert(result[1] == 10);
+        assert(result[2] == 50);
+        assert(result[3] == 10);
+    }
+
+    int main()
+    {
+        kernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+    }
diff --git a/libcudacxx/include/cuda/__simd/simd_intrinsics.h b/libcudacxx/include/cuda/__simd/simd_intrinsics.h
index cc241740520..cb1902eaa62 100644
--- a/libcudacxx/include/cuda/__simd/simd_intrinsics.h
+++ b/libcudacxx/include/cuda/__simd/simd_intrinsics.h
@@ -21,7 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
-#if _CCCL_HAS_SIMD_SAT()
+#if _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF()
 
 #  include <cuda/std/__internal/features.h>
 #  include <cuda/std/cstdint>
@@ -32,18 +32,20 @@
 
 _CCCL_BEGIN_NAMESPACE_CUDA_SIMD
 
+#  if _CCCL_HAS_SIMD_SAT()
+
 [[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_u16x2(
   [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept
 {
-#  if _CCCL_HAS_SIMD_SAT_INTRINSICS()
+#    if _CCCL_HAS_SIMD_SAT_INTRINSICS()
   NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddus2(__lhs, __rhs);))
-#  elif _CCCL_HAS_SIMD_SAT_PTX()
+#    elif _CCCL_HAS_SIMD_SAT_PTX()
   NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({
                  ::cuda::std::uint32_t __result{};
                  asm("add.sat.u16x2 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs));
                  return __result;
                }))
-#  endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX()
+#    endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX()
   _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_u16x2: Unsupported architecture");
   return ::cuda::std::uint32_t{};
 }
@@ -51,15 +53,15 @@ _CCCL_BEGIN_NAMESPACE_CUDA_SIMD
 [[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_s16x2(
   [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept
 {
-#  if _CCCL_HAS_SIMD_SAT_INTRINSICS()
+#    if _CCCL_HAS_SIMD_SAT_INTRINSICS()
   NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddss2(__lhs, __rhs);))
-#  elif _CCCL_HAS_SIMD_SAT_PTX()
+#    elif _CCCL_HAS_SIMD_SAT_PTX()
   NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({
                  ::cuda::std::uint32_t __result{};
                  asm("add.sat.s16x2 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs));
                  return __result;
                }))
-#  endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX()
+#    endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX()
   _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_s16x2: Unsupported architecture");
   return ::cuda::std::uint32_t{};
 }
@@ -67,15 +69,15 @@ _CCCL_BEGIN_NAMESPACE_CUDA_SIMD
 [[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_u8x4(
   [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept
 {
-#  if _CCCL_HAS_SIMD_SAT_INTRINSICS()
+#    if _CCCL_HAS_SIMD_SAT_INTRINSICS()
   NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddus4(__lhs, __rhs);))
-#  elif _CCCL_HAS_SIMD_SAT_PTX()
+#    elif _CCCL_HAS_SIMD_SAT_PTX()
   NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({
                  ::cuda::std::uint32_t __result{};
                  asm("add.sat.u8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs));
                  return __result;
                }))
-#  endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX()
+#    endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX()
   _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_u8x4: Unsupported architecture");
   return ::cuda::std::uint32_t{};
 }
@@ -83,22 +85,56 @@ _CCCL_BEGIN_NAMESPACE_CUDA_SIMD
 [[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_s8x4(
   [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept
 {
-#  if _CCCL_HAS_SIMD_SAT_INTRINSICS()
+#    if _CCCL_HAS_SIMD_SAT_INTRINSICS()
   NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddss4(__lhs, __rhs);))
-#  elif _CCCL_HAS_SIMD_SAT_PTX()
+#    elif _CCCL_HAS_SIMD_SAT_PTX()
   NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({
                  ::cuda::std::uint32_t __result{};
                  asm("add.sat.s8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs));
                  return __result;
                }))
-#  endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX()
+#    endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX()
   _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_s8x4: Unsupported architecture");
   return ::cuda::std::uint32_t{};
 }
 
+#  endif // _CCCL_HAS_SIMD_SAT()
+
+#  if _CCCL_HAS_SIMD_VABSDIFF()
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vabsdiff_u8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __rhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __c) noexcept
+{
+  NV_IF_TARGET(NV_IS_DEVICE, ({
+                 ::cuda::std::uint32_t __result{};
+                 asm("vabsdiff4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__c));
+                 return __result;
+               }))
+  _CCCL_VERIFY(false, "cuda::__simd::__vabsdiff_u8x4: Unsupported architecture");
+  return ::cuda::std::uint32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vabsdiff_s8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __rhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __c) noexcept
+{
+  NV_IF_TARGET(NV_IS_DEVICE, ({
+                 ::cuda::std::uint32_t __result{};
+                 asm("vabsdiff4.u32.s32.s32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__c));
+                 return __result;
+               }))
+  _CCCL_VERIFY(false, "cuda::__simd::__vabsdiff_s8x4: Unsupported architecture");
+  return ::cuda::std::uint32_t{};
+}
+
+#  endif // _CCCL_HAS_SIMD_VABSDIFF()
+
 _CCCL_END_NAMESPACE_CUDA_SIMD
 
 #  include <cuda/std/__cccl/epilogue.h>
 
-#endif // _CCCL_HAS_SIMD_SAT()
+#endif // _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF()
 #endif // _CUDA___SIMD_SIMD_INTRINSICS_H
diff --git a/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h b/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h
index 71fd0613702..4a056ba6292 100644
--- a/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h
+++ b/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h
@@ -21,7 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
-#if _CCCL_HAS_SIMD_SAT()
+#if _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF()
 
 #  include <cuda/__simd/simd_intrinsics.h>
 #  include <cuda/std/__cstddef/types.h>
@@ -35,6 +35,8 @@ _CCCL_BEGIN_NAMESPACE_CUDA_SIMD
 //----------------------------------------------------------------------------------------------------------------------
 // device-only functions
 
+#  if _CCCL_HAS_SIMD_SAT()
+
 template <typename _Tp, ::cuda::std::size_t _Np>
 [[nodiscard]] _CCCL_DEVICE_API constexpr ::cuda::std::simd::__array_u32_t<_Np> __vadd_sat_16bit_x2(
   const ::cuda::std::simd::__array_u32_t<_Np>& __lhs_u, const ::cuda::std::simd::__array_u32_t<_Np>& __rhs_u) noexcept
@@ -75,9 +77,37 @@ template <typename _Tp, ::cuda::std::size_t _Np>
   return __result_u;
 }
 
+#  endif // _CCCL_HAS_SIMD_SAT()
+
+#  if _CCCL_HAS_SIMD_VABSDIFF()
+
+template <typename _Tp, ::cuda::std::size_t _Np>
+[[nodiscard]] _CCCL_DEVICE_API constexpr ::cuda::std::simd::__array_u32_t<_Np> __vabsdiff_8bit_x4(
+  const ::cuda::std::simd::__array_u32_t<_Np>& __lhs_u,
+  const ::cuda::std::simd::__array_u32_t<_Np>& __rhs_u,
+  const ::cuda::std::simd::__array_u32_t<_Np>& __c_u) noexcept
+{
+  ::cuda::std::simd::__array_u32_t<_Np> __result_u;
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (::cuda::std::size_t __i = 0; __i < _Np; ++__i)
+  {
+    if constexpr (::cuda::std::is_unsigned_v<_Tp>)
+    {
+      __result_u[__i] = ::cuda::simd::__vabsdiff_u8x4(__lhs_u[__i], __rhs_u[__i], __c_u[__i]);
+    }
+    else
+    {
+      __result_u[__i] = ::cuda::simd::__vabsdiff_s8x4(__lhs_u[__i], __rhs_u[__i], __c_u[__i]);
+    }
+  }
+  return __result_u;
+}
+
+#  endif // _CCCL_HAS_SIMD_VABSDIFF()
+
 _CCCL_END_NAMESPACE_CUDA_SIMD
 
 #  include <cuda/std/__cccl/epilogue.h>
 
-#endif // _CCCL_HAS_SIMD_SAT()
+#endif // _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF()
 #endif // _CUDA___SIMD_SIMD_INTRINSICS_ARRAY_H
diff --git a/libcudacxx/include/cuda/__simd/vabsdiff.h b/libcudacxx/include/cuda/__simd/vabsdiff.h
new file mode 100644
index 00000000000..2e02dd49615
--- /dev/null
+++ b/libcudacxx/include/cuda/__simd/vabsdiff.h
@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___SIMD_VABSDIFF_H
+#define _CUDA___SIMD_VABSDIFF_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__internal/features.h>
+#include <cuda/std/__simd/basic_vec.h>
+#include <cuda/std/__type_traits/is_integer.h>
+#include <cuda/std/__type_traits/make_unsigned.h>
+#if _CCCL_HAS_SIMD_VABSDIFF()
+#  include <cuda/__simd/simd_intrinsics_array.h>
+#  include <cuda/std/__simd/specializations/simd_intrinsics_array.h>
+#endif // _CCCL_HAS_SIMD_VABSDIFF()
+
+#include <nv/target>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_SIMD
+
+_CCCL_TEMPLATE(typename _Tp, typename _Abi)
+_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>)
+[[nodiscard]] _CCCL_API constexpr ::cuda::std::simd::basic_vec<::cuda::std::make_unsigned_t<_Tp>, _Abi> abs_diff(
+  const ::cuda::std::simd::basic_vec<_Tp, _Abi>& __lhs, const ::cuda::std::simd::basic_vec<_Tp, _Abi>& __rhs) noexcept
+{
+  using __result_value_t   = ::cuda::std::make_unsigned_t<_Tp>;
+  using __basic_vec_t      = ::cuda::std::simd::basic_vec<_Tp, _Abi>;
+  using __result_vec_t     = ::cuda::std::simd::basic_vec<__result_value_t, _Abi>;
+  using __result_storage_t = typename __result_vec_t::_Storage;
+  constexpr auto __size    = __basic_vec_t::__size;
+
+#if _CCCL_HAS_SIMD_VABSDIFF()
+  _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+  {
+    if constexpr (sizeof(_Tp) == 1)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, ({
+                     constexpr ::cuda::std::simd::__simd_storage_u32_t<__result_storage_t> __c_u{};
+                     const auto __lhs_u    = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_);
+                     const auto __rhs_u    = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_);
+                     const auto __result_u = ::cuda::simd::__vabsdiff_8bit_x4<_Tp>(__lhs_u, __rhs_u, __c_u);
+                     const auto __result_s =
+                       ::cuda::std::simd::__copy_from_unsigned_storage<__result_storage_t>(__result_u);
+                     return __result_vec_t{__result_s, __result_vec_t::__storage_tag};
+                   }));
+    }
+  }
+#endif // _CCCL_HAS_SIMD_VABSDIFF()
+
+  __result_storage_t __result{};
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (::cuda::std::simd::__simd_size_type __i = 0; __i < __size; ++__i)
+  {
+    const auto __lhs_i   = static_cast<__result_value_t>(__lhs.__s_.__data[__i]);
+    const auto __rhs_i   = static_cast<__result_value_t>(__rhs.__s_.__data[__i]);
+    const auto __is_less = (__lhs.__s_.__data[__i] < __rhs.__s_.__data[__i]);
+    __result.__data[__i] = __is_less ? (__rhs_i - __lhs_i) : (__lhs_i - __rhs_i);
+  }
+  return __result_vec_t{__result, __result_vec_t::__storage_tag};
+}
+
+_CCCL_END_NAMESPACE_CUDA_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA___SIMD_VABSDIFF_H
diff --git a/libcudacxx/include/cuda/simd b/libcudacxx/include/cuda/simd
index 327ba22f527..c6c4889bc30 100644
--- a/libcudacxx/include/cuda/simd
+++ b/libcudacxx/include/cuda/simd
@@ -22,6 +22,7 @@
 #endif // no system header
 
 #include <cuda/__simd/saturating_add.h>
+#include <cuda/__simd/vabsdiff.h>
 #include <cuda/std/__simd_>
 
 #endif // _CUDA_SIMD
diff --git a/libcudacxx/include/cuda/std/__internal/features.h b/libcudacxx/include/cuda/std/__internal/features.h
index a9b01dfc274..be22b4b9fb9 100644
--- a/libcudacxx/include/cuda/std/__internal/features.h
+++ b/libcudacxx/include/cuda/std/__internal/features.h
@@ -114,8 +114,20 @@
 // TODO(fbusato): CTK 13.2 produces non-optimal code for SIMD SAT intrinsics
 #define _CCCL_HAS_SIMD_SAT_INTRINSICS() 0
 #define _CCCL_HAS_SIMD_SAT_PTX()        (__cccl_ptx_isa >= 920ULL)
-#define _CCCL_HAS_SIMD_SAT() \
-  (_CCCL_HAS_SIMD_SAT_PTX() || _CCCL_HAS_SIMD_SAT_INTRINSICS()) && _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION()
+#define _CCCL_HAS_SIMD_SAT()                                                                 \
+  ((_CCCL_HAS_SIMD_SAT_PTX() || _CCCL_HAS_SIMD_SAT_INTRINSICS()) && _CCCL_CUDA_COMPILATION() \
+   && !_CCCL_TILE_COMPILATION())
+
+#define _CCCL_HAS_SIMD_VABSDIFF() _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION()
+
+// TODO(fbusato): CTK 13.2 produces non-optimal code for SIMD SAT intrinsics
+#define _CCCL_HAS_SIMD_SAT_INTRINSICS() 0
+#define _CCCL_HAS_SIMD_SAT_PTX()        (__cccl_ptx_isa >= 920ULL)
+#define _CCCL_HAS_SIMD_SAT()                                                                 \
+  ((_CCCL_HAS_SIMD_SAT_PTX() || _CCCL_HAS_SIMD_SAT_INTRINSICS()) && _CCCL_CUDA_COMPILATION() \
+   && !_CCCL_TILE_COMPILATION())
+
+#define _CCCL_HAS_SIMD_VABSDIFF() _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION()
 
 // TODO(fbusato): CTK 13.2 produces non-optimal code for SIMD SAT intrinsics
 #define _CCCL_HAS_SIMD_SAT_INTRINSICS() 0
diff --git a/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h b/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h
index 6e7cd7caffe..3cae299eb76 100644
--- a/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h
+++ b/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h
@@ -42,14 +42,13 @@ __vadd_u16x2([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint3
 [[nodiscard]] _CCCL_DEVICE_API inline uint32_t
 __vadd_s16x2([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint32_t __rhs) noexcept
 {
-  // prevent MSVC warning
+  uint32_t __result{};
   NV_IF_TARGET(NV_PROVIDES_SM_90,
-               ({
-                 uint32_t __result{};
-                 asm("add.s16x2 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs));
-                 return __result;
-               }),
-               (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_s16x2: Unsupported architecture"); return uint32_t{};));
+               ({ asm("add.s16x2 %0, %1, %2;"
+                      : "=r"(__result)
+                      : "r"(__lhs), "r"(__rhs)); }),
+               (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_s16x2: Unsupported architecture");));
+  return __result;
 }
 
 #  if _CCCL_HAS_SIMD_8BIT()
@@ -62,26 +61,26 @@ __vadd_u8x4([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint32
                (return ::__vadd4(__lhs, __rhs);), //
                (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_u8x4: Unsupported architecture"); return uint32_t{};));
 #    else // ^^^ _CCCL_HAS_SIMD_8BIT_INTRINSICS() ^^^ / vvv !_CCCL_HAS_SIMD_8BIT_INTRINSICS() vvv
+  uint32_t __result{};
   NV_IF_TARGET(NV_HAS_FEATURE_SM_120f,
-               ({
-                 uint32_t __result{};
-                 asm("add.u8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs));
-                 return __result;
-               }),
-               (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_u8x4: Unsupported architecture"); return uint32_t{};));
+               ({ asm("add.u8x4 %0, %1, %2;"
+                      : "=r"(__result)
+                      : "r"(__lhs), "r"(__rhs)); }),
+               (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_u8x4: Unsupported architecture");));
+  return __result;
 #    endif // _CCCL_HAS_SIMD_8BIT()
 }
 
 [[nodiscard]] _CCCL_DEVICE_API inline uint32_t
 __vadd_s8x4([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint32_t __rhs) noexcept
 {
+  uint32_t __result{};
   NV_IF_TARGET(NV_HAS_FEATURE_SM_120f,
-               ({
-                 uint32_t __result{};
-                 asm("add.s8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs));
-                 return __result;
-               }),
-               (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_s8x4: Unsupported architecture"); return uint32_t{};));
+               ({ asm("add.s8x4 %0, %1, %2;"
+                      : "=r"(__result)
+                      : "r"(__lhs), "r"(__rhs)); }),
+               (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_s8x4: Unsupported architecture");));
+  return __result;
 }
 
 #  endif // _CCCL_HAS_SIMD_8BIT()
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/vabsdiff.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/vabsdiff.pass.cpp
new file mode 100644
index 00000000000..628d602e726
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/vabsdiff.pass.cpp
@@ -0,0 +1,133 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/__simd/abs_diff.h>
+
+// template<class T, class Abi>
+//   constexpr basic_vec<make_unsigned_t<T>, Abi> cuda::simd::abs_diff(
+//     const basic_vec<T, Abi>& lhs, const basic_vec<T, Abi>& rhs) noexcept;
+
+#include <cuda/simd>
+#include <cuda/std/array>
+#include <cuda/std/cassert>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+#include <cuda/std/utility>
+
+#include "test_macros.h"
+
+namespace simd = cuda::std::simd;
+
+template <typename T, int N>
+using fixed_size_vec = simd::basic_vec<T, simd::fixed_size<N>>;
+
+template <typename Vec, typename = void>
+inline constexpr bool has_abs_diff = false;
+
+template <typename Vec>
+inline constexpr bool has_abs_diff<
+  Vec,
+  cuda::std::void_t<decltype(cuda::simd::abs_diff(cuda::std::declval<Vec>(), cuda::std::declval<Vec>()))>> = true;
+
+template <typename T>
+TEST_FUNC constexpr cuda::std::make_unsigned_t<T> scalar_abs_diff(T lhs, T rhs)
+{
+  using ResultT = cuda::std::make_unsigned_t<T>;
+  ResultT lhs_u = static_cast<ResultT>(lhs);
+  ResultT rhs_u = static_cast<ResultT>(rhs);
+  return (lhs < rhs) ? (rhs_u - lhs_u) : (lhs_u - rhs_u);
+}
+
+template <typename T, int N>
+TEST_FUNC constexpr void test_values(cuda::std::array<T, N> lhs_values, cuda::std::array<T, N> rhs_values)
+{
+  using Vec       = simd::basic_vec<T, simd::fixed_size<N>>;
+  using ResultVec = simd::basic_vec<cuda::std::make_unsigned_t<T>, simd::fixed_size<N>>;
+  Vec lhs(lhs_values);
+  Vec rhs(rhs_values);
+
+  static_assert(cuda::std::is_same_v<decltype(cuda::simd::abs_diff(lhs, rhs)), ResultVec>);
+  static_assert(noexcept(cuda::simd::abs_diff(lhs, rhs)));
+
+  ResultVec result = cuda::simd::abs_diff(lhs, rhs);
+  for (int i = 0; i < N; ++i)
+  {
+    assert(result[i] == scalar_abs_diff(lhs_values[i], rhs_values[i]));
+  }
+}
+
+template <typename T, int N>
+TEST_FUNC constexpr void test_size()
+{
+  constexpr auto min_val = cuda::std::numeric_limits<T>::min();
+  constexpr auto max_val = cuda::std::numeric_limits<T>::max();
+
+  if constexpr (cuda::std::is_signed_v<T>)
+  {
+    cuda::std::array<T, N> lhs_values{min_val, max_val, T{-10}};
+    cuda::std::array<T, N> rhs_values{max_val, min_val, T{20}};
+    if constexpr (N > 3)
+    {
+      lhs_values[3] = T{-5};
+      rhs_values[3] = T{-5};
+    }
+    test_values<T, N>(lhs_values, rhs_values);
+  }
+  else
+  {
+    cuda::std::array<T, N> lhs_values{min_val, max_val, T{10}};
+    cuda::std::array<T, N> rhs_values{max_val, min_val, T{20}};
+    if constexpr (N > 3)
+    {
+      lhs_values[3] = T{5};
+      rhs_values[3] = T{5};
+    }
+    test_values<T, N>(lhs_values, rhs_values);
+  }
+}
+
+template <typename T>
+TEST_FUNC constexpr void test()
+{
+  test_size<T, 4>();
+  test_size<T, 3>();
+}
+
+TEST_FUNC constexpr bool test_all()
+{
+  static_assert(!has_abs_diff<fixed_size_vec<float, 4>>);
+
+  test<signed char>();
+  test<signed short>();
+  test<signed int>();
+  test<signed long>();
+  test<signed long long>();
+#if _CCCL_HAS_INT128()
+  test<__int128_t>();
+#endif // _CCCL_HAS_INT128()
+
+  test<unsigned char>();
+  test<unsigned short>();
+  test<unsigned int>();
+  test<unsigned long>();
+  test<unsigned long long>();
+#if _CCCL_HAS_INT128()
+  test<__uint128_t>();
+#endif // _CCCL_HAS_INT128()
+
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test_all());
+  static_assert(test_all());
+  return 0;
+}
diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt
index 79b98ac4693..ab3c686467e 100644
--- a/libcudacxx/test/simd_codegen/CMakeLists.txt
+++ b/libcudacxx/test/simd_codegen/CMakeLists.txt
@@ -31,7 +31,12 @@ find_program(bash "bash" REQUIRED)
 
 set(libcudacxx_simd_codegen_tests)
 if (NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
-  file(GLOB libcudacxx_simd_codegen_tests "floating_point/*.cu" "integer/*.cu")
+  file(
+    GLOB libcudacxx_simd_codegen_tests
+    "floating_point/*.cu"
+    "integer/*.cu"
+    "vabsdiff/*.cu"
+  )
   if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.2)
     file(
       GLOB libcudacxx_simd_codegen_saturation_add_tests
diff --git a/libcudacxx/test/simd_codegen/vabsdiff/vabsdiff.cu b/libcudacxx/test/simd_codegen/vabsdiff/vabsdiff.cu
new file mode 100644
index 00000000000..71c0d8ed48d
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/vabsdiff/vabsdiff.cu
@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/simd> // IWYU pragma: keep
+
+namespace simd = cuda::std::simd;
+
+using Vec_s8_x4 = simd::basic_vec<cuda::std::int8_t, simd::fixed_size<4>>;
+using Vec_u8_x4 = simd::basic_vec<cuda::std::uint8_t, simd::fixed_size<4>>;
+
+__device__ Vec_u8_x4 test_abs_diff_u8_x4(Vec_u8_x4 lhs, Vec_u8_x4 rhs)
+{
+  return cuda::simd::abs_diff(lhs, rhs);
+}
+
+__device__ Vec_u8_x4 test_abs_diff_s8_x4(Vec_s8_x4 lhs, Vec_s8_x4 rhs)
+{
+  return cuda::simd::abs_diff(lhs, rhs);
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_abs_diff_s8_x4.*}}
+; SM80: {{.*VABSDIFF4 .*}}
+; SM90: {{.*VABSDIFF4 .*}}
+; SM100: {{.*VABSDIFF4 .*}}
+; SM120: {{.*VIMNMX\.S8x4.*}}
+; SM120: {{.*VIMNMX\.S8x4.*}}
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_abs_diff_u8_x4.*}}
+; SM80: {{.*VABSDIFF4\.U8.*}}
+; SM90: {{.*VABSDIFF4\.U8.*}}
+; SM100: {{.*VABSDIFF4\.U8.*}}
+; SM120: {{.*VIMNMX\.U8x4.*}}
+; SM120: {{.*VIMNMX\.U8x4.*}}
+
+*/

From 4e548e6ee2ca8cc5f2653a6a50d7b7fd54978262 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Tue, 19 May 2026 15:44:17 -0700
Subject: [PATCH 4/4] `cuda::simd` Integer dot product

---
 docs/libcudacxx/extended_api/simd.rst         |   6 +
 docs/libcudacxx/extended_api/simd/idot.rst    |  79 +++++++
 libcudacxx/include/cuda/__simd/idot.h         | 108 +++++++++
 .../include/cuda/__simd/simd_intrinsics.h     | 218 +++++++++++++++++-
 .../cuda/__simd/simd_intrinsics_array.h       |  98 +++++++-
 libcudacxx/include/cuda/simd                  |   1 +
 .../include/cuda/std/__internal/features.h    |  17 +-
 .../numerics/simd/simd.non_std/idot.pass.cpp  | 196 ++++++++++++++++
 libcudacxx/test/simd_codegen/CMakeLists.txt   |   1 +
 libcudacxx/test/simd_codegen/idot/idp2.cu     |  95 ++++++++
 libcudacxx/test/simd_codegen/idot/idp4.cu     |  53 +++++
 11 files changed, 855 insertions(+), 17 deletions(-)
 create mode 100644 docs/libcudacxx/extended_api/simd/idot.rst
 create mode 100644 libcudacxx/include/cuda/__simd/idot.h
 create mode 100644 libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/idot.pass.cpp
 create mode 100644 libcudacxx/test/simd_codegen/idot/idp2.cu
 create mode 100644 libcudacxx/test/simd_codegen/idot/idp4.cu

diff --git a/docs/libcudacxx/extended_api/simd.rst b/docs/libcudacxx/extended_api/simd.rst
index 524b1bb8cb1..29ec24da4a3 100644
--- a/docs/libcudacxx/extended_api/simd.rst
+++ b/docs/libcudacxx/extended_api/simd.rst
@@ -9,6 +9,7 @@ SIMD
 
    simd/saturating_add
    simd/abs_diff
+   simd/idot
 
 .. list-table::
    :widths: 25 45 30 30
@@ -28,3 +29,8 @@ SIMD
      - Performs element-wise absolute difference of two integer ``basic_vec`` objects
      - CCCL 3.5.0
      - CUDA 13.5
+
+   * - :ref:`cuda::simd::idot <libcudacxx-extended-api-simd-idot>`
+     - Computes the integer dot product of two ``basic_vec`` objects and an accumulator
+     - CCCL 3.5.0
+     - CUDA 13.5
diff --git a/docs/libcudacxx/extended_api/simd/idot.rst b/docs/libcudacxx/extended_api/simd/idot.rst
new file mode 100644
index 00000000000..a90730f6b90
--- /dev/null
+++ b/docs/libcudacxx/extended_api/simd/idot.rst
@@ -0,0 +1,79 @@
+.. _libcudacxx-extended-api-simd-idot:
+
+``cuda::simd::idot``
+====================
+
+Defined in the ``<cuda/simd>`` header.
+
+.. code:: cuda
+
+   namespace cuda::simd {
+
+   template <class T, class U, class Abi, class AccT>
+   [[nodiscard]] __host__ __device__ constexpr
+   AccT idot(
+     const cuda::std::simd::basic_vec<T, Abi>& lhs,
+     const cuda::std::simd::basic_vec<U, Abi>& rhs,
+     AccT acc) noexcept;
+
+   } // namespace cuda::simd
+
+The function ``cuda::simd::idot`` computes the dot product of two integer ``cuda::std::simd::basic_vec`` objects and adds the result to an accumulator.
+
+For each element ``i`` in the input vectors, the result is equivalent to:
+
+.. code:: cuda
+
+   acc += static_cast<AccT>(lhs[i]) * static_cast<AccT>(rhs[i])
+
+**Parameters**
+
+- ``lhs``: The left-hand side input vector.
+- ``rhs``: The right-hand side input vector.
+- ``acc``: The initial accumulator value.
+
+**Return value**
+
+Returns ``acc`` plus the integer dot product of ``lhs`` and ``rhs``.
+
+**Constraints**
+
+- ``T``, ``U``, and ``AccT`` must be integer types.
+
+**Performance considerations**
+
+- Packed 8-bit input vectors with 32-bit accumulators (same sign) use ``IDP4A`` on ``SM61`` and newer device targets.
+- Packed 16-bit by 8-bit input vectors with 32-bit accumulators (same sign) use ``IDP2A`` on ``SM61`` and newer device targets.
+- Other integer input and accumulator combinations use the scalar fallback.
+
+Example
+-------
+
+.. code:: cuda
+
+    #include <cuda/simd>
+    #include <cuda/std/array>
+    #include <cuda/std/cassert>
+    #include <cuda/std/cstdint>
+
+    namespace simd = cuda::std::simd;
+
+    __global__ void kernel()
+    {
+        using vec_t = simd::basic_vec<int8_t, simd::fixed_size<4>>;
+
+        cuda::std::array<int8_t, 4> lhs_values{1, 2, 3, 4};
+        cuda::std::array<int8_t, 4> rhs_values{5, 6, 7, 8};
+        vec_t lhs(lhs_values);
+        vec_t rhs(rhs_values);
+
+        int32_t result = cuda::simd::idot(lhs, rhs, int32_t{10});
+
+        assert(result == 80);
+    }
+
+    int main()
+    {
+        kernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+    }
diff --git a/libcudacxx/include/cuda/__simd/idot.h b/libcudacxx/include/cuda/__simd/idot.h
new file mode 100644
index 00000000000..c2d9a99102c
--- /dev/null
+++ b/libcudacxx/include/cuda/__simd/idot.h
@@ -0,0 +1,108 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___SIMD_IDOT_H
+#define _CUDA___SIMD_IDOT_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__internal/features.h>
+#include <cuda/std/__simd/basic_vec.h>
+#include <cuda/std/__type_traits/is_integer.h>
+#include <cuda/std/__type_traits/is_signed.h>
+#include <cuda/std/__type_traits/is_unsigned.h>
+#if _CCCL_HAS_SIMD_IDOT()
+#  include <cuda/__simd/simd_intrinsics_array.h>
+#  include <cuda/std/__simd/specializations/simd_intrinsics_array.h>
+#endif // _CCCL_HAS_SIMD_IDOT()
+
+#include <nv/target>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_SIMD
+
+_CCCL_TEMPLATE(typename _Tp, typename _Up, typename _Abi, typename _AccumT)
+_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::std::__cccl_is_integer_v<_Up>
+                 _CCCL_AND ::cuda::std::__cccl_is_integer_v<_AccumT>)
+[[nodiscard]] _CCCL_API constexpr _AccumT
+idot(const ::cuda::std::simd::basic_vec<_Tp, _Abi>& __lhs,
+     const ::cuda::std::simd::basic_vec<_Up, _Abi>& __rhs,
+     const _AccumT __acc) noexcept
+{
+#if _CCCL_HAS_SIMD_IDOT()
+  _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+  {
+    using ::cuda::std::is_signed_v;
+    using ::cuda::std::is_unsigned_v;
+    constexpr bool __is_unsigned_dot   = is_unsigned_v<_Tp> && is_unsigned_v<_Up> && is_unsigned_v<_AccumT>;
+    constexpr bool __is_signed_dot     = (is_signed_v<_Tp> || is_signed_v<_Up>) && is_signed_v<_AccumT>;
+    constexpr bool __has_matching_sign = __is_unsigned_dot || __is_signed_dot;
+
+    constexpr bool __is_dp4 = sizeof(_Tp) == 1 && sizeof(_Up) == 1 && sizeof(_AccumT) == 4 && __has_matching_sign;
+
+    constexpr bool __is_dp2_16bitx2_8bitx4 =
+      (sizeof(_Tp) == 2 && sizeof(_Up) == 1) && sizeof(_AccumT) == 4 && __has_matching_sign;
+    constexpr bool __is_dp2_8bitx4_16bitx2 =
+      (sizeof(_Tp) == 1 && sizeof(_Up) == 2) && sizeof(_AccumT) == 4 && __has_matching_sign;
+
+    if constexpr (__is_dp4)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                     const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_);
+                     const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_);
+                     return ::cuda::simd::__dp4a_8bit_x4<_Tp, _Up>(__lhs_u, __rhs_u, __acc);
+                   }))
+    }
+    else if constexpr (__is_dp2_16bitx2_8bitx4)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                     const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_);
+                     const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_);
+                     return ::cuda::simd::__dp2a_16bit_x2_8bit_x4<_Tp, _Up>(__lhs_u, __rhs_u, __acc);
+                   }))
+    }
+    else if constexpr (__is_dp2_8bitx4_16bitx2)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                     const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_);
+                     const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_);
+                     return ::cuda::simd::__dp2a_16bit_x2_8bit_x4<_Up, _Tp>(__rhs_u, __lhs_u, __acc);
+                   }))
+    }
+  }
+#endif // _CCCL_HAS_SIMD_IDOT()
+
+  _AccumT __result = __acc;
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (::cuda::std::simd::__simd_size_type __i = 0; __i < __lhs.__size; ++__i)
+  {
+    const auto __lhs_value = static_cast<_AccumT>(__lhs.__s_.__data[__i]);
+    const auto __rhs_value = static_cast<_AccumT>(__rhs.__s_.__data[__i]);
+    const auto __product   = static_cast<_AccumT>(__lhs_value * __rhs_value);
+    __result               = static_cast<_AccumT>(__result + __product);
+  }
+  return __result;
+}
+
+_CCCL_END_NAMESPACE_CUDA_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA___SIMD_IDOT_H
diff --git a/libcudacxx/include/cuda/__simd/simd_intrinsics.h b/libcudacxx/include/cuda/__simd/simd_intrinsics.h
index cb1902eaa62..82623d16479 100644
--- a/libcudacxx/include/cuda/__simd/simd_intrinsics.h
+++ b/libcudacxx/include/cuda/__simd/simd_intrinsics.h
@@ -21,7 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
-#if _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF()
+#if _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() || _CCCL_HAS_SIMD_IDOT()
 
 #  include <cuda/std/__internal/features.h>
 #  include <cuda/std/cstdint>
@@ -132,9 +132,223 @@ _CCCL_BEGIN_NAMESPACE_CUDA_SIMD
 
 #  endif // _CCCL_HAS_SIMD_VABSDIFF()
 
+#  if _CCCL_HAS_SIMD_IDOT()
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __dp4a_u8x4_u8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __rhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __acc) noexcept
+{
+#    if _CCCL_HAS_SIMD_IDOT_INTRINSICS()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, (return ::__dp4a(__lhs, __rhs, __acc);))
+#    elif _CCCL_HAS_SIMD_IDOT_PTX()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                 ::cuda::std::uint32_t __result{};
+                 asm("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc));
+                 return __result;
+               }))
+#    endif // _CCCL_HAS_SIMD_IDOT_INTRINSICS() || _CCCL_HAS_SIMD_IDOT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__dp4a_u8x4_u8x4: Unsupported architecture");
+  return ::cuda::std::uint32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp4a_s8x4_s8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __rhs,
+  [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept
+{
+#    if _CCCL_HAS_SIMD_IDOT_INTRINSICS()
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_61,
+    (return ::__dp4a(static_cast<::cuda::std::int32_t>(__lhs), static_cast<::cuda::std::int32_t>(__rhs), __acc);))
+#    elif _CCCL_HAS_SIMD_IDOT_PTX()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                 ::cuda::std::int32_t __result{};
+                 asm("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc));
+                 return __result;
+               }))
+#    endif // _CCCL_HAS_SIMD_IDOT_INTRINSICS() || _CCCL_HAS_SIMD_IDOT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__dp4a_s8x4_s8x4: Unsupported architecture");
+  return ::cuda::std::int32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp4a_u8x4_s8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __rhs,
+  [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept
+{
+#    if _CCCL_HAS_SIMD_IDOT_PTX()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                 ::cuda::std::int32_t __result{};
+                 asm("dp4a.u32.s32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc));
+                 return __result;
+               }))
+#    endif // _CCCL_HAS_SIMD_IDOT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__dp4a_u8x4_s8x4: Unsupported architecture");
+  return ::cuda::std::int32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp4a_s8x4_u8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __rhs,
+  [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept
+{
+#    if _CCCL_HAS_SIMD_IDOT_PTX()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                 ::cuda::std::int32_t __result{};
+                 asm("dp4a.s32.u32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc));
+                 return __result;
+               }))
+#    endif // _CCCL_HAS_SIMD_IDOT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__dp4a_s8x4_u8x4: Unsupported architecture");
+  return ::cuda::std::int32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __dp2a_lo_u16x2_u8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __rhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __acc) noexcept
+{
+#    if _CCCL_HAS_SIMD_IDOT_INTRINSICS()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, (return ::__dp2a_lo(__lhs, __rhs, __acc);))
+#    elif _CCCL_HAS_SIMD_IDOT_PTX()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                 ::cuda::std::uint32_t __result{};
+                 asm("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc));
+                 return __result;
+               }))
+#    endif // _CCCL_HAS_SIMD_IDOT_INTRINSICS() || _CCCL_HAS_SIMD_IDOT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__dp2a_lo_u16x2_u8x4: Unsupported architecture");
+  return ::cuda::std::uint32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __dp2a_hi_u16x2_u8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __rhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __acc) noexcept
+{
+#    if _CCCL_HAS_SIMD_IDOT_INTRINSICS()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, (return ::__dp2a_hi(__lhs, __rhs, __acc);))
+#    elif _CCCL_HAS_SIMD_IDOT_PTX()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                 ::cuda::std::uint32_t __result{};
+                 asm("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc));
+                 return __result;
+               }))
+#    endif // _CCCL_HAS_SIMD_IDOT_INTRINSICS() || _CCCL_HAS_SIMD_IDOT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__dp2a_hi_u16x2_u8x4: Unsupported architecture");
+  return ::cuda::std::uint32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp2a_lo_s16x2_s8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __rhs,
+  [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept
+{
+#    if _CCCL_HAS_SIMD_IDOT_INTRINSICS()
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_61,
+    (return ::__dp2a_lo(static_cast<::cuda::std::int32_t>(__lhs), static_cast<::cuda::std::int32_t>(__rhs), __acc);))
+#    elif _CCCL_HAS_SIMD_IDOT_PTX()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                 ::cuda::std::int32_t __result{};
+                 asm("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc));
+                 return __result;
+               }))
+#    endif // _CCCL_HAS_SIMD_IDOT_INTRINSICS() || _CCCL_HAS_SIMD_IDOT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__dp2a_lo_s16x2_s8x4: Unsupported architecture");
+  return ::cuda::std::int32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp2a_lo_u16x2_s8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __rhs,
+  [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept
+{
+#    if _CCCL_HAS_SIMD_IDOT_PTX()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                 ::cuda::std::int32_t __result{};
+                 asm("dp2a.lo.u32.s32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc));
+                 return __result;
+               }))
+#    endif // _CCCL_HAS_SIMD_IDOT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__dp2a_lo_u16x2_s8x4: Unsupported architecture");
+  return ::cuda::std::int32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp2a_lo_s16x2_u8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __rhs,
+  [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept
+{
+#    if _CCCL_HAS_SIMD_IDOT_PTX()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                 ::cuda::std::int32_t __result{};
+                 asm("dp2a.lo.s32.u32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc));
+                 return __result;
+               }))
+#    endif // _CCCL_HAS_SIMD_IDOT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__dp2a_lo_s16x2_u8x4: Unsupported architecture");
+  return ::cuda::std::int32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp2a_hi_s16x2_s8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __rhs,
+  [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept
+{
+#    if _CCCL_HAS_SIMD_IDOT_INTRINSICS()
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_61,
+    (return ::__dp2a_hi(static_cast<::cuda::std::int32_t>(__lhs), static_cast<::cuda::std::int32_t>(__rhs), __acc);))
+#    elif _CCCL_HAS_SIMD_IDOT_PTX()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                 ::cuda::std::int32_t __result{};
+                 asm("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc));
+                 return __result;
+               }))
+#    endif // _CCCL_HAS_SIMD_IDOT_INTRINSICS() || _CCCL_HAS_SIMD_IDOT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__dp2a_hi_s16x2_s8x4: Unsupported architecture");
+  return ::cuda::std::int32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp2a_hi_u16x2_s8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __rhs,
+  [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept
+{
+#    if _CCCL_HAS_SIMD_IDOT_PTX()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                 ::cuda::std::int32_t __result{};
+                 asm("dp2a.hi.u32.s32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc));
+                 return __result;
+               }))
+#    endif // _CCCL_HAS_SIMD_IDOT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__dp2a_hi_u16x2_s8x4: Unsupported architecture");
+  return ::cuda::std::int32_t{};
+}
+
+[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp2a_hi_s16x2_u8x4(
+  [[maybe_unused]] const ::cuda::std::uint32_t __lhs,
+  [[maybe_unused]] const ::cuda::std::uint32_t __rhs,
+  [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept
+{
+#    if _CCCL_HAS_SIMD_IDOT_PTX()
+  NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                 ::cuda::std::int32_t __result{};
+                 asm("dp2a.hi.s32.u32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc));
+                 return __result;
+               }))
+#    endif // _CCCL_HAS_SIMD_IDOT_PTX()
+  _CCCL_VERIFY(false, "cuda::__simd::__dp2a_hi_s16x2_u8x4: Unsupported architecture");
+  return ::cuda::std::int32_t{};
+}
+
+#  endif // _CCCL_HAS_SIMD_IDOT()
+
 _CCCL_END_NAMESPACE_CUDA_SIMD
 
 #  include <cuda/std/__cccl/epilogue.h>
 
-#endif // _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF()
+#endif // _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() || _CCCL_HAS_SIMD_IDOT()
 #endif // _CUDA___SIMD_SIMD_INTRINSICS_H
diff --git a/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h b/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h
index 4a056ba6292..f8765b85480 100644
--- a/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h
+++ b/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h
@@ -21,12 +21,13 @@
 #  pragma system_header
 #endif // no system header
 
-#if _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF()
+#if _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() || _CCCL_HAS_SIMD_IDOT()
 
 #  include <cuda/__simd/simd_intrinsics.h>
 #  include <cuda/std/__cstddef/types.h>
 #  include <cuda/std/__simd/specializations/simd_intrinsics_array.h>
 #  include <cuda/std/__type_traits/is_unsigned.h>
+#  include <cuda/std/cstdint>
 
 #  include <cuda/std/__cccl/prologue.h>
 
@@ -105,9 +106,102 @@ template <typename _Tp, ::cuda::std::size_t _Np>
 
 #  endif // _CCCL_HAS_SIMD_VABSDIFF()
 
+#  if _CCCL_HAS_SIMD_IDOT()
+
+template <typename _Tp, typename _Up, typename _AccumT, ::cuda::std::size_t _Np>
+[[nodiscard]] _CCCL_DEVICE_API _AccumT __dp4a_8bit_x4(
+  const ::cuda::std::simd::__array_u32_t<_Np>& __lhs_u,
+  const ::cuda::std::simd::__array_u32_t<_Np>& __rhs_u,
+  const _AccumT __acc) noexcept
+{
+  _AccumT __result = __acc;
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (::cuda::std::size_t __i = 0; __i < _Np; ++__i)
+  {
+    if constexpr (::cuda::std::is_unsigned_v<_Tp> && ::cuda::std::is_unsigned_v<_Up>)
+    {
+      __result = ::cuda::simd::__dp4a_u8x4_u8x4(__lhs_u[__i], __rhs_u[__i], __result);
+    }
+    else if constexpr (::cuda::std::is_unsigned_v<_Tp>)
+    {
+      __result = ::cuda::simd::__dp4a_u8x4_s8x4(__lhs_u[__i], __rhs_u[__i], __result);
+    }
+    else if constexpr (::cuda::std::is_unsigned_v<_Up>)
+    {
+      __result = ::cuda::simd::__dp4a_s8x4_u8x4(__lhs_u[__i], __rhs_u[__i], __result);
+    }
+    else
+    {
+      __result = ::cuda::simd::__dp4a_s8x4_s8x4(__lhs_u[__i], __rhs_u[__i], __result);
+    }
+  }
+  return __result;
+}
+
+template <typename _Tp, typename _Up, typename _AccumT, ::cuda::std::size_t _N_16Bit, ::cuda::std::size_t _N_8Bit>
+[[nodiscard]] _CCCL_DEVICE_API _AccumT __dp2a_16bit_x2_8bit_x4(
+  const ::cuda::std::simd::__array_u32_t<_N_16Bit>& __lhs_u16,
+  const ::cuda::std::simd::__array_u32_t<_N_8Bit>& __rhs_u8,
+  const _AccumT __acc) noexcept
+{
+  _AccumT __result = __acc;
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (::cuda::std::size_t __i = 0; __i < _N_16Bit; ++__i)
+  {
+    const auto __rhs_u = __rhs_u8[__i / 2];
+    if constexpr (::cuda::std::is_unsigned_v<_Tp> && ::cuda::std::is_unsigned_v<_Up>)
+    {
+      if (__i % 2 == 0)
+      {
+        __result = ::cuda::simd::__dp2a_lo_u16x2_u8x4(__lhs_u16[__i], __rhs_u, __result);
+      }
+      else
+      {
+        __result = ::cuda::simd::__dp2a_hi_u16x2_u8x4(__lhs_u16[__i], __rhs_u, __result);
+      }
+    }
+    else if constexpr (::cuda::std::is_unsigned_v<_Tp>)
+    {
+      if (__i % 2 == 0)
+      {
+        __result = ::cuda::simd::__dp2a_lo_u16x2_s8x4(__lhs_u16[__i], __rhs_u, __result);
+      }
+      else
+      {
+        __result = ::cuda::simd::__dp2a_hi_u16x2_s8x4(__lhs_u16[__i], __rhs_u, __result);
+      }
+    }
+    else if constexpr (::cuda::std::is_unsigned_v<_Up>)
+    {
+      if (__i % 2 == 0)
+      {
+        __result = ::cuda::simd::__dp2a_lo_s16x2_u8x4(__lhs_u16[__i], __rhs_u, __result);
+      }
+      else
+      {
+        __result = ::cuda::simd::__dp2a_hi_s16x2_u8x4(__lhs_u16[__i], __rhs_u, __result);
+      }
+    }
+    else
+    {
+      if (__i % 2 == 0)
+      {
+        __result = ::cuda::simd::__dp2a_lo_s16x2_s8x4(__lhs_u16[__i], __rhs_u, __result);
+      }
+      else
+      {
+        __result = ::cuda::simd::__dp2a_hi_s16x2_s8x4(__lhs_u16[__i], __rhs_u, __result);
+      }
+    }
+  }
+  return __result;
+}
+
+#  endif // _CCCL_HAS_SIMD_IDOT()
+
 _CCCL_END_NAMESPACE_CUDA_SIMD
 
 #  include <cuda/std/__cccl/epilogue.h>
 
-#endif // _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF()
+#endif // _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() || _CCCL_HAS_SIMD_IDOT()
 #endif // _CUDA___SIMD_SIMD_INTRINSICS_ARRAY_H
diff --git a/libcudacxx/include/cuda/simd b/libcudacxx/include/cuda/simd
index c6c4889bc30..9ede1dabd80 100644
--- a/libcudacxx/include/cuda/simd
+++ b/libcudacxx/include/cuda/simd
@@ -21,6 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__simd/idot.h>
 #include <cuda/__simd/saturating_add.h>
 #include <cuda/__simd/vabsdiff.h>
 #include <cuda/std/__simd_>
diff --git a/libcudacxx/include/cuda/std/__internal/features.h b/libcudacxx/include/cuda/std/__internal/features.h
index be22b4b9fb9..f2660f646e0 100644
--- a/libcudacxx/include/cuda/std/__internal/features.h
+++ b/libcudacxx/include/cuda/std/__internal/features.h
@@ -120,21 +120,12 @@
 
 #define _CCCL_HAS_SIMD_VABSDIFF() _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION()
 
-// TODO(fbusato): CTK 13.2 produces non-optimal code for SIMD SAT intrinsics
-#define _CCCL_HAS_SIMD_SAT_INTRINSICS() 0
-#define _CCCL_HAS_SIMD_SAT_PTX()        (__cccl_ptx_isa >= 920ULL)
-#define _CCCL_HAS_SIMD_SAT()                                                                 \
-  ((_CCCL_HAS_SIMD_SAT_PTX() || _CCCL_HAS_SIMD_SAT_INTRINSICS()) && _CCCL_CUDA_COMPILATION() \
+#define _CCCL_HAS_SIMD_IDOT_INTRINSICS() (_CCCL_CUDACC_AT_LEAST(12, 2) && _CCCL_HAS_CTK())
+#define _CCCL_HAS_SIMD_IDOT_PTX()        (__cccl_ptx_isa >= 500ULL)
+#define _CCCL_HAS_SIMD_IDOT()                                                                  \
+  ((_CCCL_HAS_SIMD_IDOT_INTRINSICS() || _CCCL_HAS_SIMD_IDOT_PTX()) && _CCCL_CUDA_COMPILATION() \
    && !_CCCL_TILE_COMPILATION())
 
-#define _CCCL_HAS_SIMD_VABSDIFF() _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION()
-
-// TODO(fbusato): CTK 13.2 produces non-optimal code for SIMD SAT intrinsics
-#define _CCCL_HAS_SIMD_SAT_INTRINSICS() 0
-#define _CCCL_HAS_SIMD_SAT_PTX()        (__cccl_ptx_isa >= 920ULL)
-#define _CCCL_HAS_SIMD_SAT() \
-  (_CCCL_HAS_SIMD_SAT_PTX() || _CCCL_HAS_SIMD_SAT_INTRINSICS()) && _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION()
-
 // Third party libraries
 
 #if (__has_include(<dlpack/dlpack.h>) || __has_include(<dlpack.h>)) && \
diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/idot.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/idot.pass.cpp
new file mode 100644
index 00000000000..2f3a7f84ed4
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/idot.pass.cpp
@@ -0,0 +1,196 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/__simd/idot.h>
+
+// template<class T, class U, class Abi, class AccT>
+//   constexpr AccT cuda::simd::idot(
+//     const basic_vec<T, Abi>& lhs, const basic_vec<U, Abi>& rhs, AccT acc) noexcept;
+
+#include <cuda/simd>
+#include <cuda/std/array>
+#include <cuda/std/cassert>
+#include <cuda/std/cstdint>
+#include <cuda/std/type_traits>
+#include <cuda/std/utility>
+
+#include "test_macros.h"
+
+namespace simd = cuda::std::simd;
+
+template <typename T, int N>
+using fixed_size_vec = simd::basic_vec<T, simd::fixed_size<N>>;
+
+template <typename LhsVec, typename RhsVec, typename AccT, typename = void>
+inline constexpr bool has_idot = false;
+
+template <typename LhsVec, typename RhsVec, typename AccT>
+inline constexpr bool
+  has_idot<LhsVec,
+           RhsVec,
+           AccT,
+           cuda::std::void_t<decltype(cuda::simd::idot(
+             cuda::std::declval<LhsVec>(), cuda::std::declval<RhsVec>(), cuda::std::declval<AccT>()))>> = true;
+
+template <typename T, typename U, typename AccT, int N>
+TEST_FUNC constexpr AccT
+scalar_idot(const cuda::std::array<T, N>& lhs_values, const cuda::std::array<U, N>& rhs_values, AccT acc)
+{
+  AccT result = acc;
+  for (int i = 0; i < N; ++i)
+  {
+    AccT lhs_value = static_cast<AccT>(lhs_values[i]);
+    AccT rhs_value = static_cast<AccT>(rhs_values[i]);
+    AccT product   = static_cast<AccT>(lhs_value * rhs_value);
+    result         = static_cast<AccT>(result + product);
+  }
+  return result;
+}
+
+template <typename T, typename U, typename AccT, int N>
+TEST_FUNC constexpr void test_values(cuda::std::array<T, N> lhs_values, cuda::std::array<U, N> rhs_values, AccT acc)
+{
+  using LhsVec = simd::basic_vec<T, simd::fixed_size<N>>;
+  using RhsVec = simd::basic_vec<U, simd::fixed_size<N>>;
+  LhsVec lhs(lhs_values);
+  RhsVec rhs(rhs_values);
+
+  static_assert(cuda::std::is_same_v<decltype(cuda::simd::idot(lhs, rhs, acc)), AccT>);
+  static_assert(noexcept(cuda::simd::idot(lhs, rhs, acc)));
+
+  AccT result   = cuda::simd::idot(lhs, rhs, acc);
+  AccT expected = scalar_idot<T, U, AccT, N>(lhs_values, rhs_values, acc);
+  assert(result == expected);
+}
+
+template <typename T, typename U, typename AccT, int N>
+TEST_FUNC constexpr void test_generated(AccT acc)
+{
+  cuda::std::array<T, N> lhs_values{};
+  cuda::std::array<U, N> rhs_values{};
+  for (int i = 0; i < N; ++i)
+  {
+    if constexpr (cuda::std::is_signed_v<T>)
+    {
+      lhs_values[i] = static_cast<T>((i % 5) - 2);
+    }
+    else
+    {
+      lhs_values[i] = static_cast<T>((i % 5) + 1);
+    }
+
+    if constexpr (cuda::std::is_signed_v<U>)
+    {
+      rhs_values[i] = static_cast<U>((i % 7) - 3);
+    }
+    else
+    {
+      rhs_values[i] = static_cast<U>((i % 7) + 2);
+    }
+  }
+  test_values<T, U, AccT, N>(lhs_values, rhs_values, acc);
+}
+
+TEST_FUNC constexpr void test_8bit_dp4a()
+{
+  {
+    cuda::std::array<int8_t, 4> lhs_values{-8, -3, 2, 7};
+    cuda::std::array<int8_t, 4> rhs_values{4, -5, 6, -7};
+    test_values<int8_t, int8_t, int32_t, 4>(lhs_values, rhs_values, int32_t{11});
+  }
+  {
+    cuda::std::array<uint8_t, 7> lhs_values{1, 2, 3, 4, 5, 6, 7};
+    cuda::std::array<uint8_t, 7> rhs_values{8, 7, 6, 5, 4, 3, 2};
+    test_values<uint8_t, uint8_t, uint32_t, 7>(lhs_values, rhs_values, uint32_t{13});
+  }
+  {
+    cuda::std::array<uint8_t, 5> lhs_values{1, 2, 3, 4, 5};
+    cuda::std::array<int8_t, 5> rhs_values{-1, 2, -3, 4, -5};
+    test_values<uint8_t, int8_t, int32_t, 5>(lhs_values, rhs_values, int32_t{-17});
+  }
+  {
+    cuda::std::array<int8_t, 3> lhs_values{-4, 5, -6};
+    cuda::std::array<uint8_t, 3> rhs_values{7, 8, 9};
+    test_values<int8_t, uint8_t, int32_t, 3>(lhs_values, rhs_values, int32_t{19});
+  }
+}
+
+TEST_FUNC constexpr void test_16bit_8bit_dp2a()
+{
+  {
+    cuda::std::array<int16_t, 5> lhs_values{-300, 20, 45, -12, 17};
+    cuda::std::array<int8_t, 5> rhs_values{3, -4, 5, -6, 7};
+    test_values<int16_t, int8_t, int32_t, 5>(lhs_values, rhs_values, int32_t{23});
+  }
+  {
+    cuda::std::array<int8_t, 5> lhs_values{3, -4, 5, -6, 7};
+    cuda::std::array<int16_t, 5> rhs_values{-300, 20, 45, -12, 17};
+    test_values<int8_t, int16_t, int32_t, 5>(lhs_values, rhs_values, int32_t{29});
+  }
+  {
+    cuda::std::array<uint16_t, 5> lhs_values{300, 20, 45, 12, 17};
+    cuda::std::array<uint8_t, 5> rhs_values{3, 4, 5, 6, 7};
+    test_values<uint16_t, uint8_t, uint32_t, 5>(lhs_values, rhs_values, uint32_t{31});
+  }
+  {
+    cuda::std::array<uint8_t, 5> lhs_values{3, 4, 5, 6, 7};
+    cuda::std::array<uint16_t, 5> rhs_values{300, 20, 45, 12, 17};
+    test_values<uint8_t, uint16_t, uint32_t, 5>(lhs_values, rhs_values, uint32_t{37});
+  }
+  {
+    cuda::std::array<int16_t, 5> lhs_values{-300, 20, 45, -12, 17};
+    cuda::std::array<uint8_t, 5> rhs_values{3, 200, 5, 255, 7};
+    test_values<int16_t, uint8_t, int32_t, 5>(lhs_values, rhs_values, int32_t{41});
+  }
+  {
+    cuda::std::array<uint8_t, 5> lhs_values{3, 200, 5, 255, 7};
+    cuda::std::array<int16_t, 5> rhs_values{-300, 20, 45, -12, 17};
+    test_values<uint8_t, int16_t, int32_t, 5>(lhs_values, rhs_values, int32_t{-47});
+  }
+  {
+    cuda::std::array<uint16_t, 5> lhs_values{300, 40000, 45, 65535, 17};
+    cuda::std::array<int8_t, 5> rhs_values{3, -4, 5, -6, 7};
+    test_values<uint16_t, int8_t, int32_t, 5>(lhs_values, rhs_values, int32_t{43});
+  }
+  {
+    cuda::std::array<int8_t, 5> lhs_values{3, -4, 5, -6, 7};
+    cuda::std::array<uint16_t, 5> rhs_values{300, 40000, 45, 65535, 17};
+    test_values<int8_t, uint16_t, int32_t, 5>(lhs_values, rhs_values, int32_t{-53});
+  }
+}
+
+TEST_FUNC constexpr bool test_all()
+{
+  static_assert(!has_idot<fixed_size_vec<float, 4>, fixed_size_vec<float, 4>, int>);
+  static_assert(!has_idot<fixed_size_vec<int, 4>, fixed_size_vec<float, 4>, int>);
+  static_assert(!has_idot<fixed_size_vec<int, 4>, fixed_size_vec<int, 4>, float>);
+
+  test_8bit_dp4a();
+  test_16bit_8bit_dp2a();
+
+  test_generated<short, short, int, 3>(5);
+  test_generated<int, unsigned, long long, 5>(-7);
+  test_generated<long, long long, long long, 4>(9);
+  test_generated<unsigned short, unsigned, unsigned long long, 6>(11);
+  test_generated<unsigned long, unsigned long long, unsigned long long, 3>(13);
+#if _CCCL_HAS_INT128()
+  test_generated<__int128_t, __uint128_t, __int128_t, 3>(__int128_t{17});
+  test_generated<__uint128_t, __uint128_t, __uint128_t, 5>(__uint128_t{19});
+#endif // _CCCL_HAS_INT128()
+
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test_all());
+  static_assert(test_all());
+  return 0;
+}
diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt
index ab3c686467e..68b884b04a9 100644
--- a/libcudacxx/test/simd_codegen/CMakeLists.txt
+++ b/libcudacxx/test/simd_codegen/CMakeLists.txt
@@ -34,6 +34,7 @@ if (NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
   file(
     GLOB libcudacxx_simd_codegen_tests
     "floating_point/*.cu"
+    "idot/*.cu"
     "integer/*.cu"
     "vabsdiff/*.cu"
   )
diff --git a/libcudacxx/test/simd_codegen/idot/idp2.cu b/libcudacxx/test/simd_codegen/idot/idp2.cu
new file mode 100644
index 00000000000..0073adeacea
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/idot/idp2.cu
@@ -0,0 +1,95 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/simd>
+#include <cuda/std/cstdint>
+
+namespace simd = cuda::std::simd;
+
+using Vec_s8_x4  = simd::basic_vec<cuda::std::int8_t, simd::fixed_size<4>>;
+using Vec_u8_x4  = simd::basic_vec<cuda::std::uint8_t, simd::fixed_size<4>>;
+using Vec_s16_x4 = simd::basic_vec<cuda::std::int16_t, simd::fixed_size<4>>;
+using Vec_u16_x4 = simd::basic_vec<cuda::std::uint16_t, simd::fixed_size<4>>;
+
+__device__ cuda::std::int32_t test_idot_s16_s8(Vec_s16_x4 lhs, Vec_s8_x4 rhs, cuda::std::int32_t acc)
+{
+  return cuda::simd::idot(lhs, rhs, acc);
+}
+
+__device__ cuda::std::int32_t test_idot_s8_s16(Vec_s8_x4 lhs, Vec_s16_x4 rhs, cuda::std::int32_t acc)
+{
+  return cuda::simd::idot(lhs, rhs, acc);
+}
+
+__device__ cuda::std::int32_t test_idot_s16_u8(Vec_s16_x4 lhs, Vec_u8_x4 rhs, cuda::std::int32_t acc)
+{
+  return cuda::simd::idot(lhs, rhs, acc);
+}
+
+__device__ cuda::std::int32_t test_idot_u8_s16(Vec_u8_x4 lhs, Vec_s16_x4 rhs, cuda::std::int32_t acc)
+{
+  return cuda::simd::idot(lhs, rhs, acc);
+}
+
+__device__ cuda::std::int32_t test_idot_u16_s8(Vec_u16_x4 lhs, Vec_s8_x4 rhs, cuda::std::int32_t acc)
+{
+  return cuda::simd::idot(lhs, rhs, acc);
+}
+
+__device__ cuda::std::int32_t test_idot_s8_u16(Vec_s8_x4 lhs, Vec_u16_x4 rhs, cuda::std::int32_t acc)
+{
+  return cuda::simd::idot(lhs, rhs, acc);
+}
+
+__device__ cuda::std::uint32_t test_idot_u16_u8(Vec_u16_x4 lhs, Vec_u8_x4 rhs, cuda::std::uint32_t acc)
+{
+  return cuda::simd::idot(lhs, rhs, acc);
+}
+
+__device__ cuda::std::uint32_t test_idot_u8_u16(Vec_u8_x4 lhs, Vec_u16_x4 rhs, cuda::std::uint32_t acc)
+{
+  return cuda::simd::idot(lhs, rhs, acc);
+}
+
+/*
+
+; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_s16_s8.*}}
+; SMXX-DAG: {{.*IDP\.2A.*LO.*}}
+; SMXX-DAG: {{.*IDP\.2A.*HI.*}}
+
+; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_s8_s16.*}}
+; SMXX-DAG: {{.*IDP\.2A.*LO.*}}
+; SMXX-DAG: {{.*IDP\.2A.*HI.*}}
+
+; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_s16_u8.*}}
+; SMXX-DAG: {{.*IDP\.2A.*LO.*}}
+; SMXX-DAG: {{.*IDP\.2A.*HI.*}}
+
+; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_u8_s16.*}}
+; SMXX-DAG: {{.*IDP\.2A.*LO.*}}
+; SMXX-DAG: {{.*IDP\.2A.*HI.*}}
+
+; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_u16_s8.*}}
+; SMXX-DAG: {{.*IDP\.2A.*LO.*}}
+; SMXX-DAG: {{.*IDP\.2A.*HI.*}}
+
+; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_s8_u16.*}}
+; SMXX-DAG: {{.*IDP\.2A.*LO.*}}
+; SMXX-DAG: {{.*IDP\.2A.*HI.*}}
+
+; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_u16_u8.*}}
+; SMXX-DAG: {{.*IDP\.2A.*LO.*}}
+; SMXX-DAG: {{.*IDP\.2A.*HI.*}}
+
+; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_u8_u16.*}}
+; SMXX-DAG: {{.*IDP\.2A.*LO.*}}
+; SMXX-DAG: {{.*IDP\.2A.*HI.*}}
+
+*/
diff --git a/libcudacxx/test/simd_codegen/idot/idp4.cu b/libcudacxx/test/simd_codegen/idot/idp4.cu
new file mode 100644
index 00000000000..836dfc376ba
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/idot/idp4.cu
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/simd>
+#include <cuda/std/cstdint>
+
+namespace simd = cuda::std::simd;
+
+using Vec_s8_x4 = simd::basic_vec<cuda::std::int8_t, simd::fixed_size<4>>;
+using Vec_u8_x4 = simd::basic_vec<cuda::std::uint8_t, simd::fixed_size<4>>;
+
+__device__ cuda::std::int32_t test_idot_s8_s8(Vec_s8_x4 lhs, Vec_s8_x4 rhs, cuda::std::int32_t acc)
+{
+  return cuda::simd::idot(lhs, rhs, acc);
+}
+
+__device__ cuda::std::uint32_t test_idot_u8_u8(Vec_u8_x4 lhs, Vec_u8_x4 rhs, cuda::std::uint32_t acc)
+{
+  return cuda::simd::idot(lhs, rhs, acc);
+}
+
+__device__ cuda::std::int32_t test_idot_u8_s8(Vec_u8_x4 lhs, Vec_s8_x4 rhs, cuda::std::int32_t acc)
+{
+  return cuda::simd::idot(lhs, rhs, acc);
+}
+
+__device__ cuda::std::int32_t test_idot_s8_u8(Vec_s8_x4 lhs, Vec_u8_x4 rhs, cuda::std::int32_t acc)
+{
+  return cuda::simd::idot(lhs, rhs, acc);
+}
+
+/*
+
+; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_s8_s8.*}}
+; SMXX-DAG: {{.*IDP\.4A\.S8\.S8.*}}
+
+; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_u8_u8.*}}
+; SMXX-DAG: {{.*IDP\.4A\.U8\.U8.*}}
+
+; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_u8_s8.*}}
+; SMXX-DAG: {{.*IDP\.4A\.U8\.S8.*}}
+
+; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_s8_u8.*}}
+; SMXX-DAG: {{.*IDP\.4A\.S8\.U8.*}}
+
+*/