NVIDIA · fbusato · May 19, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
@@ -20,6 +20,7 @@ Extended API
    extended_api/type_traits
    extended_api/vector_tuple_protocol
    extended_api/numeric
+   extended_api/simd
    extended_api/random
    extended_api/memory
    extended_api/memory_resource

@@ -0,0 +1,36 @@
+.. _libcudacxx-extended-api-simd:
+
+SIMD
+====
+
+.. toctree::
+   :hidden:
+   :maxdepth: 1
+
+   simd/saturating_add
+   simd/abs_diff
+   simd/idot
+
+.. list-table::
+   :widths: 25 45 30 30
+   :header-rows: 1
+
+   * - **Header**
+     - **Content**
+     - **CCCL Availability**
+     - **CUDA Toolkit Availability**
+
+   * - :ref:`cuda::simd::saturating_add <libcudacxx-extended-api-simd-saturating-add>`
+     - Performs element-wise saturating addition of two ``basic_vec`` objects
+     - CCCL 3.5.0
+     - CUDA 13.5
+
+   * - :ref:`cuda::simd::abs_diff <libcudacxx-extended-api-simd-abs-diff>`
+     - Performs element-wise absolute difference of two integer ``basic_vec`` objects
+     - CCCL 3.5.0
+     - CUDA 13.5
+
+   * - :ref:`cuda::simd::idot <libcudacxx-extended-api-simd-idot>`
+     - Computes the integer dot product of two ``basic_vec`` objects and an accumulator
+     - CCCL 3.5.0
+     - CUDA 13.5
@@ -0,0 +1,84 @@
+.. _libcudacxx-extended-api-simd-abs-diff:
+
+``cuda::simd::abs_diff``
+========================
+
+Defined in the ``<cuda/simd>`` header.
+
+.. code:: cuda
+
+   namespace cuda::simd {
+
+   template <class T, class Abi>
+   [[nodiscard]] __host__ __device__ constexpr
+   cuda::std::simd::basic_vec<cuda::std::make_unsigned_t<T>, Abi> abs_diff(
+     const cuda::std::simd::basic_vec<T, Abi>& lhs,
+     const cuda::std::simd::basic_vec<T, Abi>& rhs) noexcept;
+
+   } // namespace cuda::simd
+
+The function ``cuda::simd::abs_diff`` performs element-wise absolute difference of two integer ``cuda::std::simd::basic_vec`` objects.
+
+For each element ``i`` in the input vectors, the result is equivalent to:
+
+.. code:: cuda
+
+   abs(lhs[i] - rhs[i])
+
+The return type is always an *unsigned* ``basic_vec`` with the same ABI as the input vectors.
+
+**Parameters**
+
+- ``lhs``: The left-hand side input vector.
+- ``rhs``: The right-hand side input vector.
+
+**Return value**
+
+Returns a ``cuda::std::simd::basic_vec<cuda::std::make_unsigned_t<T>, Abi>`` where each element contains the unsigned absolute difference of the corresponding elements in ``lhs`` and ``rhs``.
+
+**Constraints**
+
+- ``T`` must be an integer type.
+
+**Performance considerations**
+
+- Packed 8-bit integer vectors perform absolute difference using:
+
+  - ``VABSDIFF4`` on ``SM80``, ``SM86``, ``SM87``, ``SM89``, ``SM90``, ``SM100``, ``SM103``, and ``SM110``.
+  - ``VIMNMX.S8x4/U8x4`` on ``SM120f``.
+
+Example
+-------
+
+.. code:: cuda
+
+    #include <cuda/simd>
+    #include <cuda/std/array>
+    #include <cuda/std/cassert>
+    #include <cuda/std/cstdint>
+
+    namespace simd = cuda::std::simd;
+
+    __global__ void kernel()
+    {
+        using vec_t        = simd::basic_vec<cuda::std::int8_t, simd::fixed_size<4>>;
+        using result_vec_t = simd::basic_vec<cuda::std::uint8_t, simd::fixed_size<4>>;
+
+        cuda::std::array<cuda::std::int8_t, 4> lhs_values{-128, 10, 20, 30};
+        cuda::std::array<cuda::std::int8_t, 4> rhs_values{127, 20, -30, 40};
+
+        vec_t lhs(lhs_values);
+        vec_t rhs(rhs_values);
+        result_vec_t result = cuda::simd::abs_diff(lhs, rhs);
+
+        assert(result[0] == 255);
+        assert(result[1] == 10);
+        assert(result[2] == 50);
+        assert(result[3] == 10);
+    }
+
+    int main()
+    {
+        kernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+    }
@@ -0,0 +1,79 @@
+.. _libcudacxx-extended-api-simd-idot:
+
+``cuda::simd::idot``
+====================
+
+Defined in the ``<cuda/simd>`` header.
+
+.. code:: cuda
+
+   namespace cuda::simd {
+
+   template <class T, class U, class Abi, class AccT>
+   [[nodiscard]] __host__ __device__ constexpr
+   AccT idot(
+     const cuda::std::simd::basic_vec<T, Abi>& lhs,
+     const cuda::std::simd::basic_vec<U, Abi>& rhs,
+     AccT acc) noexcept;
+
+   } // namespace cuda::simd
+
+The function ``cuda::simd::idot`` computes the dot product of two integer ``cuda::std::simd::basic_vec`` objects and adds the result to an accumulator.
+
+For each element ``i`` in the input vectors, the result is equivalent to:
+
+.. code:: cuda
+
+   acc += static_cast<AccT>(lhs[i]) * static_cast<AccT>(rhs[i])
+
+**Parameters**
+
+- ``lhs``: The left-hand side input vector.
+- ``rhs``: The right-hand side input vector.
+- ``acc``: The initial accumulator value.
+
+**Return value**
+
+Returns ``acc`` plus the integer dot product of ``lhs`` and ``rhs``.
+
+**Constraints**
+
+- ``T``, ``U``, and ``AccT`` must be integer types.
+
+**Performance considerations**
+
+- Packed 8-bit input vectors with 32-bit accumulators (same sign) use ``IDP4A`` on ``SM61`` and newer device targets.
+- Packed 16-bit by 8-bit input vectors with 32-bit accumulators (same sign) use ``IDP2A`` on ``SM61`` and newer device targets.
+- Other integer input and accumulator combinations use the scalar fallback.
+
+Example
+-------
+
+.. code:: cuda
+
+    #include <cuda/simd>
+    #include <cuda/std/array>
+    #include <cuda/std/cassert>
+    #include <cuda/std/cstdint>
+
+    namespace simd = cuda::std::simd;
+
+    __global__ void kernel()
+    {
+        using vec_t = simd::basic_vec<int8_t, simd::fixed_size<4>>;
+
+        cuda::std::array<int8_t, 4> lhs_values{1, 2, 3, 4};
+        cuda::std::array<int8_t, 4> rhs_values{5, 6, 7, 8};
+        vec_t lhs(lhs_values);
+        vec_t rhs(rhs_values);
+
+        int32_t result = cuda::simd::idot(lhs, rhs, int32_t{10});
+
+        assert(result == 80);
+    }
+
+    int main()
+    {
+        kernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+    }
@@ -0,0 +1,78 @@
+.. _libcudacxx-extended-api-simd-saturating-add:
+
+``cuda::simd::saturating_add``
+==============================
+
+Defined in the ``<cuda/simd>`` header.
+
+.. code:: cuda
+
+   namespace cuda::simd {
+
+   template <class T, class Abi>
+   [[nodiscard]] __host__ __device__ constexpr
+   cuda::std::simd::basic_vec<T, Abi> saturating_add(
+     const cuda::std::simd::basic_vec<T, Abi>& lhs,
+     const cuda::std::simd::basic_vec<T, Abi>& rhs) noexcept;
+
+   } // namespace cuda::simd
+
+The function ``cuda::simd::saturating_add`` performs element-wise saturating addition of two ``cuda::std::simd::basic_vec`` objects.
+
+For each element ``i`` in the input vectors, the result is equivalent to:
+
+.. code:: cuda
+
+   cuda::std::saturating_add(lhs[i], rhs[i])
+
+**Parameters**
+
+- ``lhs``: The left-hand side input vector.
+- ``rhs``: The right-hand side input vector.
+
+**Return value**
+
+Returns a ``cuda::std::simd::basic_vec<T, Abi>`` where each element contains the saturated sum of the corresponding elements in ``lhs`` and ``rhs``.
+
+**Constraints**
+
+- ``T`` must be an `integer type <https://eel.is/c++draft/basic.fundamental#1>`__.
+
+**Performance considerations**
+
+- Packed 8-bit and 16-bit integer vectors perform saturating addition using ``VIADD.S8x4``, ``VIADD.U8x4``, ``VIADD.S16x2``, ``VIADD.U16x2`` on ``SM120f``.
+
+Example
+-------
+
+.. code:: cuda
+
+    #include <cuda/simd>
+    #include <cuda/std/array>
+    #include <cuda/std/cassert>
+    #include <cuda/std/cstdint>
+
+    namespace simd = cuda::std::simd;
+
+    __global__ void kernel()
+    {
+        using vec_t = simd::basic_vec<uint8_t, simd::fixed_size<4>>;
+
+        cuda::std::array<uint8_t, 4> lhs_values{250, 10, 20, 30};
+        cuda::std::array<uint8_t, 4> rhs_values{10, 20, 30, 40};
+        vec_t lhs(lhs_values);
+        vec_t rhs(rhs_values);
+
+        vec_t result = cuda::simd::saturating_add(lhs, rhs);
+
+        assert(result[0] == 255);
+        assert(result[1] == 30);
+        assert(result[2] == 50);
+        assert(result[3] == 70);
+    }
+
+    int main()
+    {
+        kernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+    }
@@ -0,0 +1,108 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___SIMD_IDOT_H
+#define _CUDA___SIMD_IDOT_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__internal/features.h>
+#include <cuda/std/__simd/basic_vec.h>
+#include <cuda/std/__type_traits/is_integer.h>
+#include <cuda/std/__type_traits/is_signed.h>
+#include <cuda/std/__type_traits/is_unsigned.h>
+#if _CCCL_HAS_SIMD_IDOT()
+#  include <cuda/__simd/simd_intrinsics_array.h>
+#  include <cuda/std/__simd/specializations/simd_intrinsics_array.h>
+#endif // _CCCL_HAS_SIMD_IDOT()
+
+#include <nv/target>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_SIMD
+
+_CCCL_TEMPLATE(typename _Tp, typename _Up, typename _Abi, typename _AccumT)
+_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::std::__cccl_is_integer_v<_Up>
+                 _CCCL_AND ::cuda::std::__cccl_is_integer_v<_AccumT>)
+[[nodiscard]] _CCCL_API constexpr _AccumT
+idot(const ::cuda::std::simd::basic_vec<_Tp, _Abi>& __lhs,
+     const ::cuda::std::simd::basic_vec<_Up, _Abi>& __rhs,
+     const _AccumT __acc) noexcept
+{
+#if _CCCL_HAS_SIMD_IDOT()
+  _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+  {
+    using ::cuda::std::is_signed_v;
+    using ::cuda::std::is_unsigned_v;
+    constexpr bool __is_unsigned_dot   = is_unsigned_v<_Tp> && is_unsigned_v<_Up> && is_unsigned_v<_AccumT>;
+    constexpr bool __is_signed_dot     = (is_signed_v<_Tp> || is_signed_v<_Up>) && is_signed_v<_AccumT>;
+    constexpr bool __has_matching_sign = __is_unsigned_dot || __is_signed_dot;
+
+    constexpr bool __is_dp4 = sizeof(_Tp) == 1 && sizeof(_Up) == 1 && sizeof(_AccumT) == 4 && __has_matching_sign;
+
+    constexpr bool __is_dp2_16bitx2_8bitx4 =
+      (sizeof(_Tp) == 2 && sizeof(_Up) == 1) && sizeof(_AccumT) == 4 && __has_matching_sign;
+    constexpr bool __is_dp2_8bitx4_16bitx2 =
+      (sizeof(_Tp) == 1 && sizeof(_Up) == 2) && sizeof(_AccumT) == 4 && __has_matching_sign;
+
+    if constexpr (__is_dp4)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                     const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_);
+                     const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_);
+                     return ::cuda::simd::__dp4a_8bit_x4<_Tp, _Up>(__lhs_u, __rhs_u, __acc);
+                   }))
+    }
+    else if constexpr (__is_dp2_16bitx2_8bitx4)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                     const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_);
+                     const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_);
+                     return ::cuda::simd::__dp2a_16bit_x2_8bit_x4<_Tp, _Up>(__lhs_u, __rhs_u, __acc);
+                   }))
+    }
+    else if constexpr (__is_dp2_8bitx4_16bitx2)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_61, ({
+                     const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_);
+                     const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_);
+                     return ::cuda::simd::__dp2a_16bit_x2_8bit_x4<_Up, _Tp>(__rhs_u, __lhs_u, __acc);
+                   }))
+    }
+  }
+#endif // _CCCL_HAS_SIMD_IDOT()
+
+  _AccumT __result = __acc;
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (::cuda::std::simd::__simd_size_type __i = 0; __i < __lhs.__size; ++__i)
+  {
+    const auto __lhs_value = static_cast<_AccumT>(__lhs.__s_.__data[__i]);
+    const auto __rhs_value = static_cast<_AccumT>(__rhs.__s_.__data[__i]);
+    const auto __product   = static_cast<_AccumT>(__lhs_value * __rhs_value);
+    __result               = static_cast<_AccumT>(__result + __product);
+  }
+  return __result;
+}
+
+_CCCL_END_NAMESPACE_CUDA_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA___SIMD_IDOT_H