Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/libcudacxx/extended_api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Extended API
extended_api/type_traits
extended_api/vector_tuple_protocol
extended_api/numeric
extended_api/simd
extended_api/random
extended_api/memory
extended_api/memory_resource
Expand Down
36 changes: 36 additions & 0 deletions docs/libcudacxx/extended_api/simd.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
.. _libcudacxx-extended-api-simd:

SIMD
====

.. toctree::
:hidden:
:maxdepth: 1

simd/saturating_add
simd/abs_diff
simd/idot

.. list-table::
:widths: 25 45 30 30
:header-rows: 1

* - **Header**
- **Content**
- **CCCL Availability**
- **CUDA Toolkit Availability**

* - :ref:`cuda::simd::saturating_add <libcudacxx-extended-api-simd-saturating-add>`
- Performs element-wise saturating addition of two ``basic_vec`` objects
- CCCL 3.5.0
- CUDA 13.5

* - :ref:`cuda::simd::abs_diff <libcudacxx-extended-api-simd-abs-diff>`
- Performs element-wise absolute difference of two integer ``basic_vec`` objects
- CCCL 3.5.0
- CUDA 13.5

* - :ref:`cuda::simd::idot <libcudacxx-extended-api-simd-idot>`
- Computes the integer dot product of two ``basic_vec`` objects and an accumulator
- CCCL 3.5.0
- CUDA 13.5
84 changes: 84 additions & 0 deletions docs/libcudacxx/extended_api/simd/abs_diff.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
.. _libcudacxx-extended-api-simd-abs-diff:

``cuda::simd::abs_diff``
========================

Defined in the ``<cuda/simd>`` header.

.. code:: cuda

namespace cuda::simd {

template <class T, class Abi>
[[nodiscard]] __host__ __device__ constexpr
cuda::std::simd::basic_vec<cuda::std::make_unsigned_t<T>, Abi> abs_diff(
const cuda::std::simd::basic_vec<T, Abi>& lhs,
const cuda::std::simd::basic_vec<T, Abi>& rhs) noexcept;

} // namespace cuda::simd

The function ``cuda::simd::abs_diff`` performs element-wise absolute difference of two integer ``cuda::std::simd::basic_vec`` objects.

For each element ``i`` in the input vectors, the result is equivalent to:

.. code:: cuda

abs(lhs[i] - rhs[i])

The return type is always an *unsigned* ``basic_vec`` with the same ABI as the input vectors.

**Parameters**

- ``lhs``: The left-hand side input vector.
- ``rhs``: The right-hand side input vector.

**Return value**

Returns a ``cuda::std::simd::basic_vec<cuda::std::make_unsigned_t<T>, Abi>`` where each element contains the unsigned absolute difference of the corresponding elements in ``lhs`` and ``rhs``.

**Constraints**

- ``T`` must be an integer type.

**Performance considerations**

- Packed 8-bit integer vectors perform absolute difference using:

- ``VABSDIFF4`` on ``SM80``, ``SM86``, ``SM87``, ``SM89``, ``SM90``, ``SM100``, ``SM103``, and ``SM110``.
- ``VIMNMX.S8x4/U8x4`` on ``SM120f``.

Example
-------

.. code:: cuda

#include <cuda/simd>
#include <cuda/std/array>
#include <cuda/std/cassert>
#include <cuda/std/cstdint>

namespace simd = cuda::std::simd;

__global__ void kernel()
{
using vec_t = simd::basic_vec<cuda::std::int8_t, simd::fixed_size<4>>;
using result_vec_t = simd::basic_vec<cuda::std::uint8_t, simd::fixed_size<4>>;

cuda::std::array<cuda::std::int8_t, 4> lhs_values{-128, 10, 20, 30};
cuda::std::array<cuda::std::int8_t, 4> rhs_values{127, 20, -30, 40};

vec_t lhs(lhs_values);
vec_t rhs(rhs_values);
result_vec_t result = cuda::simd::abs_diff(lhs, rhs);

assert(result[0] == 255);
assert(result[1] == 10);
assert(result[2] == 50);
assert(result[3] == 10);
}

int main()
{
kernel<<<1, 1>>>();
cudaDeviceSynchronize();
}
79 changes: 79 additions & 0 deletions docs/libcudacxx/extended_api/simd/idot.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
.. _libcudacxx-extended-api-simd-idot:

``cuda::simd::idot``
====================

Defined in the ``<cuda/simd>`` header.

.. code:: cuda

namespace cuda::simd {

template <class T, class U, class Abi, class AccT>
[[nodiscard]] __host__ __device__ constexpr
AccT idot(
const cuda::std::simd::basic_vec<T, Abi>& lhs,
const cuda::std::simd::basic_vec<U, Abi>& rhs,
AccT acc) noexcept;

} // namespace cuda::simd

The function ``cuda::simd::idot`` computes the dot product of two integer ``cuda::std::simd::basic_vec`` objects and adds the result to an accumulator.

For each element ``i`` in the input vectors, the result is equivalent to:

.. code:: cuda

acc += static_cast<AccT>(lhs[i]) * static_cast<AccT>(rhs[i])

**Parameters**

- ``lhs``: The left-hand side input vector.
- ``rhs``: The right-hand side input vector.
- ``acc``: The initial accumulator value.

**Return value**

Returns ``acc`` plus the integer dot product of ``lhs`` and ``rhs``.

**Constraints**

- ``T``, ``U``, and ``AccT`` must be integer types.

**Performance considerations**

- Packed 8-bit input vectors with 32-bit accumulators (same sign) use ``IDP4A`` on ``SM61`` and newer device targets.
- Packed 16-bit by 8-bit input vectors with 32-bit accumulators (same sign) use ``IDP2A`` on ``SM61`` and newer device targets.
- Other integer input and accumulator combinations use the scalar fallback.

Example
-------

.. code:: cuda

#include <cuda/simd>
#include <cuda/std/array>
#include <cuda/std/cassert>
#include <cuda/std/cstdint>

namespace simd = cuda::std::simd;

__global__ void kernel()
{
using vec_t = simd::basic_vec<int8_t, simd::fixed_size<4>>;

cuda::std::array<int8_t, 4> lhs_values{1, 2, 3, 4};
cuda::std::array<int8_t, 4> rhs_values{5, 6, 7, 8};
vec_t lhs(lhs_values);
vec_t rhs(rhs_values);

int32_t result = cuda::simd::idot(lhs, rhs, int32_t{10});

assert(result == 80);
}

int main()
{
kernel<<<1, 1>>>();
cudaDeviceSynchronize();
}
78 changes: 78 additions & 0 deletions docs/libcudacxx/extended_api/simd/saturating_add.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
.. _libcudacxx-extended-api-simd-saturating-add:

``cuda::simd::saturating_add``
==============================

Defined in the ``<cuda/simd>`` header.

.. code:: cuda

namespace cuda::simd {

template <class T, class Abi>
[[nodiscard]] __host__ __device__ constexpr
cuda::std::simd::basic_vec<T, Abi> saturating_add(
const cuda::std::simd::basic_vec<T, Abi>& lhs,
const cuda::std::simd::basic_vec<T, Abi>& rhs) noexcept;

} // namespace cuda::simd

The function ``cuda::simd::saturating_add`` performs element-wise saturating addition of two ``cuda::std::simd::basic_vec`` objects.

For each element ``i`` in the input vectors, the result is equivalent to:

.. code:: cuda

cuda::std::saturating_add(lhs[i], rhs[i])

**Parameters**

- ``lhs``: The left-hand side input vector.
- ``rhs``: The right-hand side input vector.

**Return value**

Returns a ``cuda::std::simd::basic_vec<T, Abi>`` where each element contains the saturated sum of the corresponding elements in ``lhs`` and ``rhs``.

**Constraints**

- ``T`` must be an `integer type <https://eel.is/c++draft/basic.fundamental#1>`__.

**Performance considerations**

- Packed 8-bit and 16-bit integer vectors perform saturating addition using ``VIADD.S8x4``, ``VIADD.U8x4``, ``VIADD.S16x2``, ``VIADD.U16x2`` on ``SM120f``.

Example
-------

.. code:: cuda

#include <cuda/simd>
#include <cuda/std/array>
#include <cuda/std/cassert>
#include <cuda/std/cstdint>

namespace simd = cuda::std::simd;

__global__ void kernel()
{
using vec_t = simd::basic_vec<uint8_t, simd::fixed_size<4>>;

cuda::std::array<uint8_t, 4> lhs_values{250, 10, 20, 30};
cuda::std::array<uint8_t, 4> rhs_values{10, 20, 30, 40};
vec_t lhs(lhs_values);
vec_t rhs(rhs_values);

vec_t result = cuda::simd::saturating_add(lhs, rhs);

assert(result[0] == 255);
assert(result[1] == 30);
assert(result[2] == 50);
assert(result[3] == 70);
}

int main()
{
kernel<<<1, 1>>>();
cudaDeviceSynchronize();
}
108 changes: 108 additions & 0 deletions libcudacxx/include/cuda/__simd/idot.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++ in the CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___SIMD_IDOT_H
#define _CUDA___SIMD_IDOT_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/__internal/features.h>
#include <cuda/std/__simd/basic_vec.h>
#include <cuda/std/__type_traits/is_integer.h>
#include <cuda/std/__type_traits/is_signed.h>
#include <cuda/std/__type_traits/is_unsigned.h>
#if _CCCL_HAS_SIMD_IDOT()
# include <cuda/__simd/simd_intrinsics_array.h>
# include <cuda/std/__simd/specializations/simd_intrinsics_array.h>
#endif // _CCCL_HAS_SIMD_IDOT()

#include <nv/target>

#include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_SIMD

_CCCL_TEMPLATE(typename _Tp, typename _Up, typename _Abi, typename _AccumT)
_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::std::__cccl_is_integer_v<_Up>
_CCCL_AND ::cuda::std::__cccl_is_integer_v<_AccumT>)
[[nodiscard]] _CCCL_API constexpr _AccumT
idot(const ::cuda::std::simd::basic_vec<_Tp, _Abi>& __lhs,
const ::cuda::std::simd::basic_vec<_Up, _Abi>& __rhs,
const _AccumT __acc) noexcept
{
#if _CCCL_HAS_SIMD_IDOT()
_CCCL_IF_NOT_CONSTEVAL_DEFAULT
{
using ::cuda::std::is_signed_v;
using ::cuda::std::is_unsigned_v;
constexpr bool __is_unsigned_dot = is_unsigned_v<_Tp> && is_unsigned_v<_Up> && is_unsigned_v<_AccumT>;
constexpr bool __is_signed_dot = (is_signed_v<_Tp> || is_signed_v<_Up>) && is_signed_v<_AccumT>;
constexpr bool __has_matching_sign = __is_unsigned_dot || __is_signed_dot;

constexpr bool __is_dp4 = sizeof(_Tp) == 1 && sizeof(_Up) == 1 && sizeof(_AccumT) == 4 && __has_matching_sign;

constexpr bool __is_dp2_16bitx2_8bitx4 =
(sizeof(_Tp) == 2 && sizeof(_Up) == 1) && sizeof(_AccumT) == 4 && __has_matching_sign;
constexpr bool __is_dp2_8bitx4_16bitx2 =
(sizeof(_Tp) == 1 && sizeof(_Up) == 2) && sizeof(_AccumT) == 4 && __has_matching_sign;

if constexpr (__is_dp4)
{
NV_IF_TARGET(NV_PROVIDES_SM_61, ({
const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_);
const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_);
return ::cuda::simd::__dp4a_8bit_x4<_Tp, _Up>(__lhs_u, __rhs_u, __acc);
}))
}
else if constexpr (__is_dp2_16bitx2_8bitx4)
{
NV_IF_TARGET(NV_PROVIDES_SM_61, ({
const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_);
const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_);
return ::cuda::simd::__dp2a_16bit_x2_8bit_x4<_Tp, _Up>(__lhs_u, __rhs_u, __acc);
}))
}
else if constexpr (__is_dp2_8bitx4_16bitx2)
{
NV_IF_TARGET(NV_PROVIDES_SM_61, ({
const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_);
const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_);
return ::cuda::simd::__dp2a_16bit_x2_8bit_x4<_Up, _Tp>(__rhs_u, __lhs_u, __acc);
}))
}
}
#endif // _CCCL_HAS_SIMD_IDOT()

_AccumT __result = __acc;
_CCCL_PRAGMA_UNROLL_FULL()
for (::cuda::std::simd::__simd_size_type __i = 0; __i < __lhs.__size; ++__i)
{
const auto __lhs_value = static_cast<_AccumT>(__lhs.__s_.__data[__i]);
const auto __rhs_value = static_cast<_AccumT>(__rhs.__s_.__data[__i]);
const auto __product = static_cast<_AccumT>(__lhs_value * __rhs_value);
__result = static_cast<_AccumT>(__result + __product);
}
return __result;
}

_CCCL_END_NAMESPACE_CUDA_SIMD

#include <cuda/std/__cccl/epilogue.h>

#endif // _CUDA___SIMD_IDOT_H
Loading
Loading