From 82dd5266b9bb2e5ccca60ad3672a127db79329f9 Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 19 May 2026 15:40:55 -0700 Subject: [PATCH 1/4] `cuda::std::simd` Optimize small integer operations --- libcudacxx/include/cuda/std/__fwd/simd.h | 1 + .../include/cuda/std/__internal/features.h | 7 +- .../include/cuda/std/__simd/basic_vec.h | 1 + .../specializations/fixed_size_integral_vec.h | 223 ++++++++++++++++++ .../specializations/fixed_size_storage.h | 10 +- .../__simd/specializations/simd_intrinsics.h | 93 ++++++++ .../specializations/simd_intrinsics_array.h | 133 +++++++++++ .../include/cuda/std/__simd/type_traits.h | 13 +- .../simd/simd.traits/alignment.pass.cpp | 18 +- libcudacxx/test/simd_codegen/CMakeLists.txt | 87 ++++--- .../{ => floating_point}/decrement_f32x2.cu | 16 +- .../simd_codegen/floating_point/fma_bf16.cu | 38 +++ .../simd_codegen/floating_point/fma_f16.cu | 38 +++ .../{ => floating_point}/increment_f32x2.cu | 16 +- .../{ => floating_point}/less_bf16.cu | 26 +- .../{ => floating_point}/less_f16.cu | 26 +- .../floating_point/minus_f32x2.cu | 28 +++ .../floating_point/multiplies_bf16.cu | 38 +++ .../{ => floating_point}/multiplies_f16.cu | 25 +- .../simd_codegen/floating_point/plus_bf16.cu | 38 +++ .../{ => floating_point}/plus_f16.cu | 19 +- .../simd_codegen/floating_point/plus_f32x2.cu | 28 +++ .../{ => floating_point}/unary_minus_f32x2.cu | 17 +- libcudacxx/test/simd_codegen/fma_bf16.cu | 54 ----- libcudacxx/test/simd_codegen/fma_f16.cu | 53 ----- .../simd_codegen/integer/arithmetic_u16x2.cu | 66 ++++++ .../simd_codegen/integer/arithmetic_u8x4.cu | 61 +++++ .../integer/bitwise_u16x2_u8x4.cu | 95 ++++++++ libcudacxx/test/simd_codegen/minus_f32x2.cu | 39 --- .../test/simd_codegen/multiplies_bf16.cu | 52 ---- libcudacxx/test/simd_codegen/plus_bf16.cu | 52 ---- libcudacxx/test/simd_codegen/plus_f32x2.cu | 39 --- 32 files changed, 1010 insertions(+), 440 deletions(-) create mode 100644 libcudacxx/include/cuda/std/__simd/specializations/fixed_size_integral_vec.h create mode 100644 libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h create mode 100644 libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics_array.h rename libcudacxx/test/simd_codegen/{ => floating_point}/decrement_f32x2.cu (59%) create mode 100644 libcudacxx/test/simd_codegen/floating_point/fma_bf16.cu create mode 100644 libcudacxx/test/simd_codegen/floating_point/fma_f16.cu rename libcudacxx/test/simd_codegen/{ => floating_point}/increment_f32x2.cu (59%) rename libcudacxx/test/simd_codegen/{ => floating_point}/less_bf16.cu (51%) rename libcudacxx/test/simd_codegen/{ => floating_point}/less_f16.cu (51%) create mode 100644 libcudacxx/test/simd_codegen/floating_point/minus_f32x2.cu create mode 100644 libcudacxx/test/simd_codegen/floating_point/multiplies_bf16.cu rename libcudacxx/test/simd_codegen/{ => floating_point}/multiplies_f16.cu (50%) create mode 100644 libcudacxx/test/simd_codegen/floating_point/plus_bf16.cu rename libcudacxx/test/simd_codegen/{ => floating_point}/plus_f16.cu (53%) create mode 100644 libcudacxx/test/simd_codegen/floating_point/plus_f32x2.cu rename libcudacxx/test/simd_codegen/{ => floating_point}/unary_minus_f32x2.cu (56%) delete mode 100644 libcudacxx/test/simd_codegen/fma_bf16.cu delete mode 100644 libcudacxx/test/simd_codegen/fma_f16.cu create mode 100644 libcudacxx/test/simd_codegen/integer/arithmetic_u16x2.cu create mode 100644 libcudacxx/test/simd_codegen/integer/arithmetic_u8x4.cu create mode 100644 libcudacxx/test/simd_codegen/integer/bitwise_u16x2_u8x4.cu delete mode 100644 libcudacxx/test/simd_codegen/minus_f32x2.cu delete mode 100644 libcudacxx/test/simd_codegen/multiplies_bf16.cu delete mode 100644 libcudacxx/test/simd_codegen/plus_bf16.cu delete mode 100644 libcudacxx/test/simd_codegen/plus_f32x2.cu diff --git a/libcudacxx/include/cuda/std/__fwd/simd.h b/libcudacxx/include/cuda/std/__fwd/simd.h index 1ce9102d9c0..02f26d4d335 100644 --- a/libcudacxx/include/cuda/std/__fwd/simd.h +++ b/libcudacxx/include/cuda/std/__fwd/simd.h @@ -50,6 +50,7 @@ enum class __simd_operations_kind { __default, __fixed_size_float, + __fixed_size_integral, }; template diff --git a/libcudacxx/include/cuda/std/__internal/features.h b/libcudacxx/include/cuda/std/__internal/features.h index 79ad0ec42b1..3b8d43c521f 100644 --- a/libcudacxx/include/cuda/std/__internal/features.h +++ b/libcudacxx/include/cuda/std/__internal/features.h @@ -103,8 +103,13 @@ #define _CCCL_HAS_SIMD_F32X2_INTRINSICS() (_CCCL_CUDACC_AT_LEAST(12, 8) && _CCCL_HAS_CTK() && !_CCCL_COMPILER(CLANG)) #define _CCCL_HAS_SIMD_F32X2_PTX() (__cccl_ptx_isa >= 860ULL) +#define _CCCL_HAS_SIMD_F32X2() \ + (_CCCL_HAS_SIMD_F32X2_INTRINSICS() || _CCCL_HAS_SIMD_F32X2_PTX()) && !_CCCL_TILE_COMPILATION() -#define _CCCL_HAS_SIMD_F32X2() (_CCCL_HAS_SIMD_F32X2_INTRINSICS() || _CCCL_HAS_SIMD_F32X2_PTX()) +#define _CCCL_HAS_SIMD_8BIT_INTRINSICS() 0 // TODO(fbusato): CTK 13.2 produces non-optimal code for 8-bit SIMD instrs. +#define _CCCL_HAS_SIMD_8BIT_PTX() (__cccl_ptx_isa >= 920ULL) +#define _CCCL_HAS_SIMD_8BIT() \ + ((_CCCL_HAS_SIMD_8BIT_PTX() || _CCCL_HAS_SIMD_8BIT_INTRINSICS()) && !_CCCL_TILE_COMPILATION()) // Third party libraries diff --git a/libcudacxx/include/cuda/std/__simd/basic_vec.h b/libcudacxx/include/cuda/std/__simd/basic_vec.h index f16dfc9fdc2..37824dbc48c 100644 --- a/libcudacxx/include/cuda/std/__simd/basic_vec.h +++ b/libcudacxx/include/cuda/std/__simd/basic_vec.h @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_integral_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_integral_vec.h new file mode 100644 index 00000000000..abdb1c9dd05 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_integral_vec.h @@ -0,0 +1,223 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_INTEGRAL_VEC_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_INTEGRAL_VEC_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// automatic vectorization for small integers is not supported (until CUDA 13.2) +// TODO(fbusato): remove this path once the feature is supported +// TODO(fbusato): extend to other GPU archs in the future + +#include +#include +#include +#include +#include +#include + +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +template +inline constexpr bool __is_fixed_size_small_integral_v = + is_integral_v<_Tp> && sizeof(_Tp) < sizeof(uint32_t) && _Np >= 2; + +inline constexpr auto __simd_operations_small_integral = __simd_operations_kind::__fixed_size_integral; + +template +inline constexpr __simd_operations_kind __simd_operations_kind_v<_Tp, __fixed_size<_Np>> = + __is_fixed_size_small_integral_v<_Tp, _Np> ? __simd_operations_small_integral : __simd_operations_kind::__default; + +#define _CCCL_SIMD_FIXED_SIZE_INTEGRAL_BINARY_BITWISE(_NAME, _OP) \ + [[nodiscard]] _CCCL_API static constexpr __simd_storage_t _NAME( \ + const __simd_storage_t& __lhs, const __simd_storage_t& __rhs) noexcept \ + { \ + _CCCL_IF_NOT_CONSTEVAL_DEFAULT \ + { \ + __unsigned_storage_t __result_u{}; \ + const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs); \ + const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs); \ + _CCCL_PRAGMA_UNROLL_FULL() \ + for (__simd_size_type __i = 0; __i < __usize; ++__i) \ + { \ + __result_u[__i] = __lhs_u[__i] _OP __rhs_u[__i]; \ + } \ + return ::cuda::std::simd::__copy_from_unsigned_storage<__simd_storage_t>(__result_u); \ + } \ + return __base::_NAME(__lhs, __rhs); \ + } + +// Simd operations for fixed_size ABI with small integral element types. +template +struct __simd_operations<_Tp, __fixed_size<_Np>, __simd_operations_small_integral> : __fixed_size_operations<_Tp, _Np> +{ + using __base = __fixed_size_operations<_Tp, _Np>; + using __simd_storage_t = __simd_storage<_Tp, __fixed_size<_Np>>; + + // all computation is done on uint32_t, so the alignment must be at least the alignment of uint32_t + static_assert(alignof(__simd_storage_t) >= alignof(uint32_t)); + + static constexpr __simd_size_type __ratio = sizeof(uint32_t) / sizeof(_Tp); + static constexpr __simd_size_type __usize = ::cuda::ceil_div(_Np, __ratio); + using __unsigned_storage_t = array; + + [[nodiscard]] _CCCL_API static constexpr __simd_storage_t __bitwise_not(const __simd_storage_t& __s) noexcept + { + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + auto __udata = ::cuda::std::simd::__to_unsigned_storage(__s); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __usize; ++__i) + { + __udata[__i] = ~__udata[__i]; + } + return ::cuda::std::simd::__copy_from_unsigned_storage<__simd_storage_t>(__udata); + } + return __fixed_size_operations<_Tp, _Np>::__bitwise_not(__s); + } + + _CCCL_SIMD_FIXED_SIZE_INTEGRAL_BINARY_BITWISE(__bitwise_and, &) + _CCCL_SIMD_FIXED_SIZE_INTEGRAL_BINARY_BITWISE(__bitwise_or, |) + _CCCL_SIMD_FIXED_SIZE_INTEGRAL_BINARY_BITWISE(__bitwise_xor, ^) + +#if _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION() + // Unary arithmetic operations + + // x++ = x + 1 + _CCCL_API static constexpr void __increment(__simd_storage_t& __s) noexcept + { + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + [[maybe_unused]] constexpr __simd_storage_t __one = __base::__broadcast(1); + if constexpr (sizeof(_Tp) == 2) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, (__s = __plus(__s, __one); return;)) + } +# if _CCCL_HAS_SIMD_8BIT() + else if constexpr (sizeof(_Tp) == 1) + { + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (__s = __plus(__s, __one); return;)) + } +# endif // _CCCL_HAS_SIMD_8BIT() + } + __base::__increment(__s); + } + + // x-- = x - 1 + _CCCL_API static constexpr void __decrement(__simd_storage_t& __s) noexcept + { + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + [[maybe_unused]] constexpr __simd_storage_t __minus_one = __base::__broadcast(static_cast<_Tp>(-1)); + if constexpr (sizeof(_Tp) == 2) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, (__s = __plus(__s, __minus_one); return;)) + } +# if _CCCL_HAS_SIMD_8BIT() + else if constexpr (sizeof(_Tp) == 1) + { + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (__s = __plus(__s, __minus_one); return;)) + } +# endif // _CCCL_HAS_SIMD_8BIT() + } + __base::__decrement(__s); + } + + // -x = ~x + 1 + [[nodiscard]] + _CCCL_API static constexpr __simd_storage_t __unary_minus(const __simd_storage_t& __s) noexcept + { + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + [[maybe_unused]] constexpr __simd_storage_t __one = __base::__broadcast(1); + if constexpr (sizeof(_Tp) == 2) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, (return __plus(__bitwise_not(__s), __one);)) + } +# if _CCCL_HAS_SIMD_8BIT() + else if constexpr (sizeof(_Tp) == 1) + { + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return __plus(__bitwise_not(__s), __one);)) + } +# endif // _CCCL_HAS_SIMD_8BIT() + } + return __base::__unary_minus(__s); + } + + // Binary arithmetic operations + + [[nodiscard]] + _CCCL_API static constexpr __simd_storage_t + __plus(const __simd_storage_t& __lhs, const __simd_storage_t& __rhs) noexcept + { + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + [[maybe_unused]] const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs); + [[maybe_unused]] const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs); + if constexpr (sizeof(_Tp) == 2) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, + (return ::cuda::std::simd::__copy_from_unsigned_storage<__simd_storage_t>( + ::cuda::std::simd::__vadd_16bit_x2<_Tp>(__lhs_u, __rhs_u));)) + } +# if _CCCL_HAS_SIMD_8BIT() + else if constexpr (sizeof(_Tp) == 1) + { + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, + (return ::cuda::std::simd::__copy_from_unsigned_storage<__simd_storage_t>( + ::cuda::std::simd::__vadd_8bit_x4<_Tp>(__lhs_u, __rhs_u));)) + } +# endif // _CCCL_HAS_SIMD_8BIT() + } + return __fixed_size_operations<_Tp, _Np>::__plus(__lhs, __rhs); + } + + [[nodiscard]] + _CCCL_API static constexpr __simd_storage_t + __minus(const __simd_storage_t& __lhs, const __simd_storage_t& __rhs) noexcept + { + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (sizeof(_Tp) == 2) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, (return __plus(__lhs, __unary_minus(__rhs));)) + } +# if _CCCL_HAS_SIMD_8BIT() + else if constexpr (sizeof(_Tp) == 1) + { + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return __plus(__lhs, __unary_minus(__rhs));)) + } +# endif // _CCCL_HAS_SIMD_8BIT() + } + return __base::__minus(__lhs, __rhs); + } +#endif // _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION() +}; + +#undef _CCCL_SIMD_FIXED_SIZE_INTEGRAL_BINARY_BITWISE + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_INTEGRAL_VEC_H diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h index 01723e33ddf..fcbc63d933d 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h @@ -22,6 +22,7 @@ #endif // no system header #include +#include #include #include @@ -36,9 +37,16 @@ struct __fixed_size static constexpr __simd_size_type __simd_size = _Np; }; +// SIMD storage never directly interacts with memory. Users must use load/store API for that purpose. +// However, SIMD storage could spill from register to cache/memory. This could break the alignment of the data for +// vectorized instructions. For this reason, we align the SIMD storage to at least 8 bytes (max SIMD instruction size). +// 8 bytes is a negligible constraint in case of spilling. +template +inline constexpr size_t __simd_storage_alignment_v = ::cuda::std::max(alignof(_Tp), size_t{8}); + // Element-per-slot simd storage for fixed_size ABI template -struct __simd_storage<_Tp, __fixed_size<_Np>> +struct alignas(__simd_storage_alignment_v<_Tp, _Np>) __simd_storage<_Tp, __fixed_size<_Np>> { using value_type = _Tp; diff --git a/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h b/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h new file mode 100644 index 00000000000..6e7cd7caffe --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h @@ -0,0 +1,93 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_SIMD_INTRINSICS_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_SIMD_INTRINSICS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CUDA_COMPILATION() + +# include + +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +[[nodiscard]] _CCCL_DEVICE_API inline uint32_t +__vadd_u16x2([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint32_t __rhs) noexcept +{ + NV_IF_TARGET(NV_PROVIDES_SM_90, + (return ::__vadd2(__lhs, __rhs);), // + (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_u16x2: Unsupported architecture"); return uint32_t{};)); +} + +[[nodiscard]] _CCCL_DEVICE_API inline uint32_t +__vadd_s16x2([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint32_t __rhs) noexcept +{ + // prevent MSVC warning + NV_IF_TARGET(NV_PROVIDES_SM_90, + ({ + uint32_t __result{}; + asm("add.s16x2 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs)); + return __result; + }), + (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_s16x2: Unsupported architecture"); return uint32_t{};)); +} + +# if _CCCL_HAS_SIMD_8BIT() + +[[nodiscard]] _CCCL_DEVICE_API inline uint32_t +__vadd_u8x4([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint32_t __rhs) noexcept +{ +# if _CCCL_HAS_SIMD_8BIT_INTRINSICS() + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, + (return ::__vadd4(__lhs, __rhs);), // + (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_u8x4: Unsupported architecture"); return uint32_t{};)); +# else // ^^^ _CCCL_HAS_SIMD_8BIT_INTRINSICS() ^^^ / vvv !_CCCL_HAS_SIMD_8BIT_INTRINSICS() vvv + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, + ({ + uint32_t __result{}; + asm("add.u8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs)); + return __result; + }), + (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_u8x4: Unsupported architecture"); return uint32_t{};)); +# endif // _CCCL_HAS_SIMD_8BIT() +} + +[[nodiscard]] _CCCL_DEVICE_API inline uint32_t +__vadd_s8x4([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint32_t __rhs) noexcept +{ + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, + ({ + uint32_t __result{}; + asm("add.s8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs)); + return __result; + }), + (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_s8x4: Unsupported architecture"); return uint32_t{};)); +} + +# endif // _CCCL_HAS_SIMD_8BIT() + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +# include +#endif // _CCCL_CUDA_COMPILATION() +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_SIMD_INTRINSICS_H diff --git a/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics_array.h b/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics_array.h new file mode 100644 index 00000000000..aa55459da48 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics_array.h @@ -0,0 +1,133 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_SIMD_INTRINSICS_ARRAY_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_SIMD_INTRINSICS_ARRAY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +//---------------------------------------------------------------------------------------------------------------------- +// conversion utilities + +template +inline constexpr size_t __simd_storage_size_u32 = 0; + +template +inline constexpr size_t __simd_storage_size_u32<__simd_storage<_Tp, __fixed_size<_Np>>> = + ::cuda::ceil_div(_Np, sizeof(uint32_t) / sizeof(_Tp)); + +template +using __array_u32_t = array; + +template +using __simd_storage_u32_t = __array_u32_t<__simd_storage_size_u32<_SimdStorage>>; + +template +inline constexpr size_t __simd_storage_copy_size_u32 = 0; + +template +inline constexpr size_t __simd_storage_copy_size_u32<__simd_storage<_Tp, __fixed_size<_Np>>> = _Np * sizeof(_Tp); + +template > +[[nodiscard]] _CCCL_API constexpr _SimdStorageU32 __to_unsigned_storage(const _SimdStorage& __s) noexcept +{ + _SimdStorageU32 __tmp{}; + const auto __input_data = ::cuda::std::assume_aligned(__s.__data); + ::cuda::std::memcpy(__tmp.data(), __input_data, __simd_storage_copy_size_u32<_SimdStorage>); + return __tmp; +} + +template > +[[nodiscard]] _CCCL_API constexpr _SimdStorage __copy_from_unsigned_storage(const _SimdStorageU32& __tmp) noexcept +{ + _SimdStorage __result{}; + const auto __result_ptr = ::cuda::std::assume_aligned(__result.__data); + ::cuda::std::memcpy(__result_ptr, __tmp.data(), __simd_storage_copy_size_u32<_SimdStorage>); + return __result; +} + +//---------------------------------------------------------------------------------------------------------------------- +// device-only functions + +#if _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION() + +template +[[nodiscard]] _CCCL_DEVICE_API constexpr __array_u32_t<_Np> +__vadd_16bit_x2(const __array_u32_t<_Np>& __lhs_u, const __array_u32_t<_Np>& __rhs_u) noexcept +{ + __array_u32_t<_Np> __result_u; + _CCCL_PRAGMA_UNROLL_FULL() + for (size_t __i = 0; __i < _Np; ++__i) + { + if constexpr (is_unsigned_v<_Tp>) + { + __result_u[__i] = ::cuda::std::simd::__vadd_u16x2(__lhs_u[__i], __rhs_u[__i]); + } + else + { + __result_u[__i] = ::cuda::std::simd::__vadd_s16x2(__lhs_u[__i], __rhs_u[__i]); + } + } + return __result_u; +} + +# if _CCCL_HAS_SIMD_8BIT() + +template +[[nodiscard]] _CCCL_DEVICE_API constexpr __array_u32_t<_Np> +__vadd_8bit_x4(const __array_u32_t<_Np>& __lhs_u, const __array_u32_t<_Np>& __rhs_u) noexcept +{ + __array_u32_t<_Np> __result_u; + _CCCL_PRAGMA_UNROLL_FULL() + for (size_t __i = 0; __i < _Np; ++__i) + { + if constexpr (is_unsigned_v<_Tp>) + { + __result_u[__i] = ::cuda::std::simd::__vadd_u8x4(__lhs_u[__i], __rhs_u[__i]); + } + else + { + __result_u[__i] = ::cuda::std::simd::__vadd_s8x4(__lhs_u[__i], __rhs_u[__i]); + } + } + return __result_u; +} + +# endif // _CCCL_HAS_SIMD_8BIT() + +#endif // _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION() + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_SIMD_INTRINSICS_ARRAY_H diff --git a/libcudacxx/include/cuda/std/__simd/type_traits.h b/libcudacxx/include/cuda/std/__simd/type_traits.h index 8f350b8b938..3929ac9bb70 100644 --- a/libcudacxx/include/cuda/std/__simd/type_traits.h +++ b/libcudacxx/include/cuda/std/__simd/type_traits.h @@ -32,16 +32,19 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD +inline constexpr size_t __optimal_cuda_alignment = _CCCL_CTK_AT_LEAST(12, 9) ? 32 : 16; + +// The best alignment for a pointer to a SIMD type is the maximum of the type's alignment and the optimal CUDA +// alignment. +template +inline constexpr size_t __simd_pointer_alignment_v = ::cuda::std::max(alignof(_Tp), __optimal_cuda_alignment); + // [simd.traits], alignment template struct alignment; template -struct alignment, _Up> - : integral_constant * alignof(_Up)) - ? __simd_size_v<_Tp, _Abi> * alignof(_Up) - : alignof(_Up)> +struct alignment, _Up> : integral_constant> { static_assert(__is_vectorizable_v<_Up>, "U must be a vectorizable type"); }; diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/alignment.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/alignment.pass.cpp index e237fc28fd3..dc8cd76ddaf 100644 --- a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/alignment.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.traits/alignment.pass.cpp @@ -26,27 +26,29 @@ namespace simd = cuda::std::simd; -template +inline constexpr size_t optimal_cuda_alignment = _CCCL_CTK_AT_LEAST(12, 9) ? 32 : 16; + +template TEST_FUNC void test_default_u() { using V = simd::basic_vec>; - static_assert(simd::alignment::value == ExpectedAlign); - static_assert(simd::alignment_v == ExpectedAlign); + static_assert(simd::alignment::value == optimal_cuda_alignment); + static_assert(simd::alignment_v == optimal_cuda_alignment); } -template +template TEST_FUNC void test_explicit_u() { using V = simd::basic_vec>; - static_assert(simd::alignment::value == ExpectedAlign); - static_assert(simd::alignment_v == ExpectedAlign); + static_assert(simd::alignment::value == optimal_cuda_alignment); + static_assert(simd::alignment_v == optimal_cuda_alignment); } template TEST_FUNC void test_type() { test_default_u(); - test_default_u(); + test_default_u(); test_default_u(); test_default_u(); test_default_u(); @@ -73,7 +75,7 @@ TEST_FUNC void test() // explicit U different from value_type test_explicit_u(); - test_explicit_u(); + test_explicit_u(); test_explicit_u(); test_explicit_u(); test_explicit_u(); diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt index 7a82933ba97..4c073d6069f 100644 --- a/libcudacxx/test/simd_codegen/CMakeLists.txt +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -21,12 +21,17 @@ else() return() endif() +if ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") + message("-- clang-cuda does not support the -dc simd codegen tests") + return() +endif() + find_program(cuobjdump "cuobjdump" REQUIRED) find_program(bash "bash" REQUIRED) set(libcudacxx_simd_codegen_tests) if (NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") - file(GLOB libcudacxx_simd_codegen_tests "*.cu") + file(GLOB libcudacxx_simd_codegen_tests "floating_point/*.cu" "integer/*.cu") endif() set(simd_codegen_cuda_archs 80 90) @@ -34,47 +39,75 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) list(APPEND simd_codegen_cuda_archs 100 120) endif() -function(simd_codegen_add_tests test_path) - cmake_path(GET test_path FILENAME test_file) - cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) +set(simd_codegen_arch_specific_cuda_archs ${simd_codegen_cuda_archs}) +if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9) + list(APPEND simd_codegen_arch_specific_cuda_archs 120f) +endif() + +function(simd_codegen_get_check_prefixes out_var test_contents arch) + set(check_prefixes "SMXX") + set(arch_prefix "SM${arch}") - file(READ "${test_path}" test_contents) string( REGEX MATCH - "SM[0-9][0-9]*" - has_arch_specific_prefix + ";[ \t]*${arch_prefix}(:|-)" + has_arch_prefix "${test_contents}" ) - - # search for SMXX prefixes in the test contents - set(test_archs) - if (has_arch_specific_prefix) - foreach (arch IN LISTS simd_codegen_cuda_archs) - string(FIND "${test_contents}" "SM${arch}" arch_specific_prefix) - if (NOT arch_specific_prefix EQUAL -1) - list(APPEND test_archs "${arch}") - endif() - endforeach() - else() - set(test_archs ${simd_codegen_cuda_archs}) + if (has_arch_prefix) + string(APPEND check_prefixes ",${arch_prefix}") endif() - # Run tests for each architecture specified in the test file - foreach (arch IN LISTS test_archs) - set(target_name "simd_codegen_sm${arch}_${test_name}") - set(check_prefixes "SMXX") - if (has_arch_specific_prefix) - string(APPEND check_prefixes ",SM${arch}") + if (arch MATCHES "^1[0-9][0-9][af]?$") + string(REGEX MATCH ";[ \t]*SM1XX(:|-)" has_sm1xx_prefix "${test_contents}") + if (has_sm1xx_prefix) + string(APPEND check_prefixes ",SM1XX") endif() + endif() - add_library(${target_name} STATIC "${test_path}") + set(${out_var} "${check_prefixes}" PARENT_SCOPE) +endfunction() +function(simd_codegen_set_cuda_arch target_name arch) + if (arch MATCHES "[af]$") + set_target_properties(${target_name} PROPERTIES CUDA_ARCHITECTURES OFF) + target_compile_options( + ${target_name} + PRIVATE "--generate-code=arch=compute_${arch},code=sm_${arch}" + ) + else() set_target_properties( ${target_name} PROPERTIES CUDA_ARCHITECTURES "${arch}" ) + endif() +endfunction() + +function(simd_codegen_add_tests test_path) + cmake_path(GET test_path FILENAME test_file) + cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) + + file(READ "${test_path}" test_contents) + + set(test_archs) + foreach (arch IN LISTS simd_codegen_arch_specific_cuda_archs) + simd_codegen_get_check_prefixes(check_prefixes "${test_contents}" "${arch}") + if (NOT "${check_prefixes}" STREQUAL "SMXX") + list(APPEND test_archs "${arch}") + endif() + endforeach() + if (NOT test_archs) + set(test_archs ${simd_codegen_cuda_archs}) + endif() - target_compile_options(${target_name} PRIVATE "-Wno-comment") + # Run tests for each architecture specified in the test file + foreach (arch IN LISTS test_archs) + set(target_name "simd_codegen_sm${arch}_${test_name}") + simd_codegen_get_check_prefixes(check_prefixes "${test_contents}" "${arch}") + + add_library(${target_name} STATIC "${test_path}") + simd_codegen_set_cuda_arch(${target_name} "${arch}") + target_compile_options(${target_name} PRIVATE "-Wno-comment" "-dc") target_include_directories( ${target_name} diff --git a/libcudacxx/test/simd_codegen/decrement_f32x2.cu b/libcudacxx/test/simd_codegen/floating_point/decrement_f32x2.cu similarity index 59% rename from libcudacxx/test/simd_codegen/decrement_f32x2.cu rename to libcudacxx/test/simd_codegen/floating_point/decrement_f32x2.cu index 41ec986c9c8..3ddc9e92282 100644 --- a/libcudacxx/test/simd_codegen/decrement_f32x2.cu +++ b/libcudacxx/test/simd_codegen/floating_point/decrement_f32x2.cu @@ -9,28 +9,20 @@ //===----------------------------------------------------------------------===// #include // IWYU pragma: keep -#include namespace simd = cuda::std::simd; -using Vec_f32_4 = simd::basic_vec>; +using Vec_f32_x4 = simd::basic_vec>; -extern "C" __global__ void test_operator_decrement_f32_4(const float* in, float* out) +__device__ Vec_f32_x4 test_operator_decrement_f32_x4(Vec_f32_x4 vec) { - const cuda::std::array values{in[0], in[1], in[2], in[3]}; - - Vec_f32_4 vec(values); --vec; - - out[0] = vec[0]; - out[1] = vec[1]; - out[2] = vec[2]; - out[3] = vec[3]; + return vec; } /* -; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_decrement_f32_4 +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_decrement_f32_x4.*}} ; SM100: {{.*FADD2.*}} ; SM100: {{.*FADD2.*}} diff --git a/libcudacxx/test/simd_codegen/floating_point/fma_bf16.cu b/libcudacxx/test/simd_codegen/floating_point/fma_bf16.cu new file mode 100644 index 00000000000..31aed77de22 --- /dev/null +++ b/libcudacxx/test/simd_codegen/floating_point/fma_bf16.cu @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep + +#if _CCCL_HAS_NVBF16() + +# include + +namespace simd = cuda::std::simd; + +using Vec_bf16_x4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +__device__ Vec_bf16_x4 test_fma_bf16_x4(Vec_bf16_x4 lhs, Vec_bf16_x4 rhs, Vec_bf16_x4 add) +{ + return lhs * rhs + add; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_fma_bf16_x4.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM1XX: {{.*HFMA2.*BF16.*}} +; SM1XX: {{.*HFMA2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/floating_point/fma_f16.cu b/libcudacxx/test/simd_codegen/floating_point/fma_f16.cu new file mode 100644 index 00000000000..f1e3b235e73 --- /dev/null +++ b/libcudacxx/test/simd_codegen/floating_point/fma_f16.cu @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep + +#if _CCCL_HAS_NVFP16() + +# include + +namespace simd = cuda::std::simd; + +using Vec_f16_x4 = simd::basic_vec<__half, simd::fixed_size<4>>; + +__device__ Vec_f16_x4 test_fma_f16_x4(Vec_f16_x4 lhs, Vec_f16_x4 rhs, Vec_f16_x4 add) +{ + return lhs * rhs + add; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_fma_f16_x4.*}} +; SM80: {{.*HFMA2.*}} +; SM80: {{.*HFMA2.*}} +; SM90: {{.*HFMA2.*}} +; SM90: {{.*HFMA2.*}} +; SM1XX: {{.*HFMA2.*}} +; SM1XX: {{.*HFMA2.*}} + +*/ + +#endif // _CCCL_HAS_NVFP16() diff --git a/libcudacxx/test/simd_codegen/increment_f32x2.cu b/libcudacxx/test/simd_codegen/floating_point/increment_f32x2.cu similarity index 59% rename from libcudacxx/test/simd_codegen/increment_f32x2.cu rename to libcudacxx/test/simd_codegen/floating_point/increment_f32x2.cu index 9c696f11e40..d61770821fb 100644 --- a/libcudacxx/test/simd_codegen/increment_f32x2.cu +++ b/libcudacxx/test/simd_codegen/floating_point/increment_f32x2.cu @@ -9,28 +9,20 @@ //===----------------------------------------------------------------------===// #include // IWYU pragma: keep -#include namespace simd = cuda::std::simd; -using Vec_f32_4 = simd::basic_vec>; +using Vec_f32_x4 = simd::basic_vec>; -extern "C" __global__ void test_operator_increment_f32_4(const float* in, float* out) +__device__ Vec_f32_x4 test_operator_increment_f32_x4(Vec_f32_x4 vec) { - const cuda::std::array values{in[0], in[1], in[2], in[3]}; - - Vec_f32_4 vec(values); ++vec; - - out[0] = vec[0]; - out[1] = vec[1]; - out[2] = vec[2]; - out[3] = vec[3]; + return vec; } /* -; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_increment_f32_4 +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_increment_f32_x4.*}} ; SM100: {{.*FADD2.*}} ; SM100: {{.*FADD2.*}} diff --git a/libcudacxx/test/simd_codegen/less_bf16.cu b/libcudacxx/test/simd_codegen/floating_point/less_bf16.cu similarity index 51% rename from libcudacxx/test/simd_codegen/less_bf16.cu rename to libcudacxx/test/simd_codegen/floating_point/less_bf16.cu index fff63e06d1c..7fec5668f19 100644 --- a/libcudacxx/test/simd_codegen/less_bf16.cu +++ b/libcudacxx/test/simd_codegen/floating_point/less_bf16.cu @@ -9,7 +9,6 @@ //===----------------------------------------------------------------------===// #include // IWYU pragma: keep -#include #if _CCCL_HAS_NVBF16() @@ -17,36 +16,25 @@ namespace simd = cuda::std::simd; -using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; +using Vec_bf16_x4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; +using Mask_bf16_x4 = Vec_bf16_x4::mask_type; -extern "C" __global__ void test_less_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, bool* out) +__device__ Mask_bf16_x4 test_less_bf16_x4(Vec_bf16_x4 lhs, Vec_bf16_x4 rhs) { - const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; - const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; - - const Vec_bf16_4 lhs_vec(lhs_values); - const Vec_bf16_4 rhs_vec(rhs_values); - const auto result = lhs_vec < rhs_vec; - - out[0] = result[0]; - out[1] = result[1]; - out[2] = result[2]; - out[3] = result[3]; + return lhs < rhs; } /* -; SMXX-LABEL: {{[[:space:]]*}}Function : test_less_bf16_4 +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_less_bf16_x4.*}} ; SM80: {{.*FSETP\.LT.*}} ; SM80: {{.*FSETP\.LT.*}} ; SM80: {{.*FSETP\.LT.*}} ; SM80: {{.*FSETP\.LT.*}} ; SM90: {{.*HSETP2.*BF16.*}} ; SM90: {{.*HSETP2.*BF16.*}} -; SM100: {{.*HSETP2.*BF16.*}} -; SM100: {{.*HSETP2.*BF16.*}} -; SM120: {{.*HSETP2.*BF16.*}} -; SM120: {{.*HSETP2.*BF16.*}} +; SM1XX: {{.*HSETP2.*BF16.*}} +; SM1XX: {{.*HSETP2.*BF16.*}} */ diff --git a/libcudacxx/test/simd_codegen/less_f16.cu b/libcudacxx/test/simd_codegen/floating_point/less_f16.cu similarity index 51% rename from libcudacxx/test/simd_codegen/less_f16.cu rename to libcudacxx/test/simd_codegen/floating_point/less_f16.cu index beef111b99e..62530ad19ce 100644 --- a/libcudacxx/test/simd_codegen/less_f16.cu +++ b/libcudacxx/test/simd_codegen/floating_point/less_f16.cu @@ -9,7 +9,6 @@ //===----------------------------------------------------------------------===// #include // IWYU pragma: keep -#include #if _CCCL_HAS_NVFP16() @@ -17,34 +16,23 @@ namespace simd = cuda::std::simd; -using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; +using Vec_f16_x4 = simd::basic_vec<__half, simd::fixed_size<4>>; +using Mask_f16_x4 = Vec_f16_x4::mask_type; -extern "C" __global__ void test_less_f16_4(const __half* lhs, const __half* rhs, bool* out) +__device__ Mask_f16_x4 test_less_f16_x4(Vec_f16_x4 lhs, Vec_f16_x4 rhs) { - const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; - const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; - - const Vec_f16_4 lhs_vec(lhs_values); - const Vec_f16_4 rhs_vec(rhs_values); - const auto result = lhs_vec < rhs_vec; - - out[0] = result[0]; - out[1] = result[1]; - out[2] = result[2]; - out[3] = result[3]; + return lhs < rhs; } /* -; SMXX-LABEL: {{[[:space:]]*}}Function : test_less_f16_4 +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_less_f16_x4.*}} ; SM80: {{.*HSETP2.*}} ; SM80: {{.*HSETP2.*}} ; SM90: {{.*HSETP2.*}} ; SM90: {{.*HSETP2.*}} -; SM100: {{.*HSETP2.*}} -; SM100: {{.*HSETP2.*}} -; SM120: {{.*HSETP2.*}} -; SM120: {{.*HSETP2.*}} +; SM1XX: {{.*HSETP2.*}} +; SM1XX: {{.*HSETP2.*}} */ diff --git a/libcudacxx/test/simd_codegen/floating_point/minus_f32x2.cu b/libcudacxx/test/simd_codegen/floating_point/minus_f32x2.cu new file mode 100644 index 00000000000..d96f073899e --- /dev/null +++ b/libcudacxx/test/simd_codegen/floating_point/minus_f32x2.cu @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep + +namespace simd = cuda::std::simd; + +using Vec_f32_x4 = simd::basic_vec>; + +__device__ Vec_f32_x4 test_operator_minus_f32_x4(Vec_f32_x4 lhs, Vec_f32_x4 rhs) +{ + return lhs - rhs; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_minus_f32_x4.*}} +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ diff --git a/libcudacxx/test/simd_codegen/floating_point/multiplies_bf16.cu b/libcudacxx/test/simd_codegen/floating_point/multiplies_bf16.cu new file mode 100644 index 00000000000..3e103287e14 --- /dev/null +++ b/libcudacxx/test/simd_codegen/floating_point/multiplies_bf16.cu @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep + +#if _CCCL_HAS_NVBF16() + +# include + +namespace simd = cuda::std::simd; + +using Vec_bf16_x4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +__device__ Vec_bf16_x4 test_operator_multiplies_bf16_x4(Vec_bf16_x4 lhs, Vec_bf16_x4 rhs) +{ + return lhs * rhs; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_multiplies_bf16_x4.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HMUL2.*BF16.*}} +; SM1XX: {{.*HFMA2.*BF16.*}} +; SM1XX: {{.*HMUL2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/multiplies_f16.cu b/libcudacxx/test/simd_codegen/floating_point/multiplies_f16.cu similarity index 50% rename from libcudacxx/test/simd_codegen/multiplies_f16.cu rename to libcudacxx/test/simd_codegen/floating_point/multiplies_f16.cu index bf721009051..bcd982ffe0f 100644 --- a/libcudacxx/test/simd_codegen/multiplies_f16.cu +++ b/libcudacxx/test/simd_codegen/floating_point/multiplies_f16.cu @@ -9,7 +9,6 @@ //===----------------------------------------------------------------------===// #include // IWYU pragma: keep -#include #if _CCCL_HAS_NVFP16() @@ -17,34 +16,22 @@ namespace simd = cuda::std::simd; -using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; +using Vec_f16_x4 = simd::basic_vec<__half, simd::fixed_size<4>>; -extern "C" __global__ void test_operator_multiplies_f16_4(const __half* lhs, const __half* rhs, __half* out) +__device__ Vec_f16_x4 test_operator_multiplies_f16_x4(Vec_f16_x4 lhs, Vec_f16_x4 rhs) { - const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; - const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; - - const Vec_f16_4 lhs_vec(lhs_values); - const Vec_f16_4 rhs_vec(rhs_values); - const Vec_f16_4 result = lhs_vec * rhs_vec; - - out[0] = result[0]; - out[1] = result[1]; - out[2] = result[2]; - out[3] = result[3]; + return lhs * rhs; } /* -; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_multiplies_f16_4 +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_multiplies_f16_x4.*}} ; SM80: {{.*(HMUL2|HFMA2).*}} ; SM80: {{.*(HMUL2|HFMA2).*}} ; SM90: {{.*(HMUL2|HFMA2).*}} ; SM90: {{.*(HMUL2|HFMA2).*}} -; SM100: {{.*(HMUL2|HFMA2).*}} -; SM100: {{.*(HMUL2|HFMA2).*}} -; SM120: {{.*(HMUL2|HFMA2).*}} -; SM120: {{.*(HMUL2|HFMA2).*}} +; SM1XX: {{.*(HMUL2|HFMA2).*}} +; SM1XX: {{.*(HMUL2|HFMA2).*}} */ diff --git a/libcudacxx/test/simd_codegen/floating_point/plus_bf16.cu b/libcudacxx/test/simd_codegen/floating_point/plus_bf16.cu new file mode 100644 index 00000000000..5321f23955d --- /dev/null +++ b/libcudacxx/test/simd_codegen/floating_point/plus_bf16.cu @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep + +#if _CCCL_HAS_NVBF16() + +# include + +namespace simd = cuda::std::simd; + +using Vec_bf16_x4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +__device__ Vec_bf16_x4 test_operator_plus_bf16_x4(Vec_bf16_x4 lhs, Vec_bf16_x4 rhs) +{ + return lhs + rhs; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_plus_bf16_x4.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HADD2.*BF16.*}} +; SM1XX: {{.*HFMA2.*BF16.*}} +; SM1XX: {{.*HADD2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/plus_f16.cu b/libcudacxx/test/simd_codegen/floating_point/plus_f16.cu similarity index 53% rename from libcudacxx/test/simd_codegen/plus_f16.cu rename to libcudacxx/test/simd_codegen/floating_point/plus_f16.cu index 4a044a19d1c..a7a4e9120da 100644 --- a/libcudacxx/test/simd_codegen/plus_f16.cu +++ b/libcudacxx/test/simd_codegen/floating_point/plus_f16.cu @@ -9,7 +9,6 @@ //===----------------------------------------------------------------------===// #include // IWYU pragma: keep -#include #if _CCCL_HAS_NVFP16() @@ -17,26 +16,16 @@ namespace simd = cuda::std::simd; -using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; +using Vec_f16_x4 = simd::basic_vec<__half, simd::fixed_size<4>>; -extern "C" __global__ void test_operator_plus_f16_4(const __half* lhs, const __half* rhs, __half* out) +__device__ Vec_f16_x4 test_operator_plus_f16_x4(Vec_f16_x4 lhs, Vec_f16_x4 rhs) { - const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; - const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; - - const Vec_f16_4 lhs_vec(lhs_values); - const Vec_f16_4 rhs_vec(rhs_values); - const Vec_f16_4 result = lhs_vec + rhs_vec; - - out[0] = result[0]; - out[1] = result[1]; - out[2] = result[2]; - out[3] = result[3]; + return lhs + rhs; } /* -; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_f16_4 +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_plus_f16_x4.*}} ; SMXX: {{.*(HADD2|HFMA2).*}} ; SMXX: {{.*(HADD2|HFMA2).*}} diff --git a/libcudacxx/test/simd_codegen/floating_point/plus_f32x2.cu b/libcudacxx/test/simd_codegen/floating_point/plus_f32x2.cu new file mode 100644 index 00000000000..1979e6bf001 --- /dev/null +++ b/libcudacxx/test/simd_codegen/floating_point/plus_f32x2.cu @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep + +namespace simd = cuda::std::simd; + +using Vec_f32_x4 = simd::basic_vec>; + +__device__ Vec_f32_x4 test_operator_plus_f32_x4(Vec_f32_x4 lhs, Vec_f32_x4 rhs) +{ + return lhs + rhs; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_plus_f32_x4.*}} +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ diff --git a/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu b/libcudacxx/test/simd_codegen/floating_point/unary_minus_f32x2.cu similarity index 56% rename from libcudacxx/test/simd_codegen/unary_minus_f32x2.cu rename to libcudacxx/test/simd_codegen/floating_point/unary_minus_f32x2.cu index 02d9c14f18b..a8878b18f71 100644 --- a/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu +++ b/libcudacxx/test/simd_codegen/floating_point/unary_minus_f32x2.cu @@ -9,28 +9,19 @@ //===----------------------------------------------------------------------===// #include // IWYU pragma: keep -#include namespace simd = cuda::std::simd; -using Vec_f32_4 = simd::basic_vec>; +using Vec_f32_x4 = simd::basic_vec>; -extern "C" __global__ void test_operator_unary_minus_f32_4(const float* in, float* out) +__device__ Vec_f32_x4 test_operator_unary_minus_f32_x4(Vec_f32_x4 in) { - const cuda::std::array values{in[0], in[1], in[2], in[3]}; - - const Vec_f32_4 vec(values); - const Vec_f32_4 result = -vec; - - out[0] = result[0]; - out[1] = result[1]; - out[2] = result[2]; - out[3] = result[3]; + return -in; } /* -; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_unary_minus_f32_4 +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_unary_minus_f32_x4.*}} ; SM100: {{.*FADD2.*}} ; SM100: {{.*FADD2.*}} diff --git a/libcudacxx/test/simd_codegen/fma_bf16.cu b/libcudacxx/test/simd_codegen/fma_bf16.cu deleted file mode 100644 index f608f209e09..00000000000 --- a/libcudacxx/test/simd_codegen/fma_bf16.cu +++ /dev/null @@ -1,54 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++ in the CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#include // IWYU pragma: keep -#include - -#if _CCCL_HAS_NVBF16() - -# include - -namespace simd = cuda::std::simd; - -using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; - -extern "C" __global__ void -test_fma_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, const __nv_bfloat16* add, __nv_bfloat16* out) -{ - const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; - const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; - const cuda::std::array<__nv_bfloat16, 4> add_values{add[0], add[1], add[2], add[3]}; - - const Vec_bf16_4 lhs_vec(lhs_values); - const Vec_bf16_4 rhs_vec(rhs_values); - const Vec_bf16_4 add_vec(add_values); - const Vec_bf16_4 result = lhs_vec * rhs_vec + add_vec; - - out[0] = result[0]; - out[1] = result[1]; - out[2] = result[2]; - out[3] = result[3]; -} - -/* - -; SMXX-LABEL: {{[[:space:]]*}}Function : test_fma_bf16_4 -; SM80: {{.*HFMA2.*BF16.*}} -; SM80: {{.*HFMA2.*BF16.*}} -; SM90: {{.*HFMA2.*BF16.*}} -; SM90: {{.*HFMA2.*BF16.*}} -; SM100: {{.*HFMA2.*BF16.*}} -; SM100: {{.*HFMA2.*BF16.*}} -; SM120: {{.*HFMA2.*BF16.*}} -; SM120: {{.*HFMA2.*BF16.*}} - -*/ - -#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/fma_f16.cu b/libcudacxx/test/simd_codegen/fma_f16.cu deleted file mode 100644 index a0dad310ac9..00000000000 --- a/libcudacxx/test/simd_codegen/fma_f16.cu +++ /dev/null @@ -1,53 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++ in the CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#include // IWYU pragma: keep -#include - -#if _CCCL_HAS_NVFP16() - -# include - -namespace simd = cuda::std::simd; - -using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; - -extern "C" __global__ void test_fma_f16_4(const __half* lhs, const __half* rhs, const __half* add, __half* out) -{ - const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; - const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; - const cuda::std::array<__half, 4> add_values{add[0], add[1], add[2], add[3]}; - - const Vec_f16_4 lhs_vec(lhs_values); - const Vec_f16_4 rhs_vec(rhs_values); - const Vec_f16_4 add_vec(add_values); - const Vec_f16_4 result = lhs_vec * rhs_vec + add_vec; - - out[0] = result[0]; - out[1] = result[1]; - out[2] = result[2]; - out[3] = result[3]; -} - -/* - -; SMXX-LABEL: {{[[:space:]]*}}Function : test_fma_f16_4 -; SM80: {{.*HFMA2.*}} -; SM80: {{.*HFMA2.*}} -; SM90: {{.*HFMA2.*}} -; SM90: {{.*HFMA2.*}} -; SM100: {{.*HFMA2.*}} -; SM100: {{.*HFMA2.*}} -; SM120: {{.*HFMA2.*}} -; SM120: {{.*HFMA2.*}} - -*/ - -#endif // _CCCL_HAS_NVFP16() diff --git a/libcudacxx/test/simd_codegen/integer/arithmetic_u16x2.cu b/libcudacxx/test/simd_codegen/integer/arithmetic_u16x2.cu new file mode 100644 index 00000000000..9d989c7377e --- /dev/null +++ b/libcudacxx/test/simd_codegen/integer/arithmetic_u16x2.cu @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep + +namespace simd = cuda::std::simd; + +using Vec_u16_x2 = simd::basic_vec>; + +__device__ Vec_u16_x2 test_operator_plus_u16_x2(Vec_u16_x2 lhs, Vec_u16_x2 rhs) +{ + return lhs + rhs; +} + +__device__ Vec_u16_x2 test_operator_minus_u16_x2(Vec_u16_x2 lhs, Vec_u16_x2 rhs) +{ + return lhs - rhs; +} + +__device__ Vec_u16_x2 test_operator_post_decrement_u16_x2(Vec_u16_x2 in) +{ + (void) in--; + return in; +} + +__device__ Vec_u16_x2 test_operator_post_increment_u16_x2(Vec_u16_x2 in) +{ + (void) in++; + return in; +} + +__device__ Vec_u16_x2 test_operator_unary_minus_u16_x2(Vec_u16_x2 in) +{ + return -in; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_unary_minus_u16_x2.*}} +; SM90: {{.*VIADD.*}} +; SM1XX: {{.*VIADD.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_post_increment_u16_x2.*}} +; SM90: {{.*VIADD.*}} +; SM1XX: {{.*VIADD.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_post_decrement_u16_x2.*}} +; SM90: {{.*VIADD.*}} +; SM1XX: {{.*VIADD.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_minus_u16_x2.*}} +; SM90: {{.*VIADD.*}} +; SM1XX: {{.*VIADD.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_plus_u16_x2.*}} +; SM90: {{.*VIADD.*}} +; SM1XX: {{.*VIADD.*}} + +*/ diff --git a/libcudacxx/test/simd_codegen/integer/arithmetic_u8x4.cu b/libcudacxx/test/simd_codegen/integer/arithmetic_u8x4.cu new file mode 100644 index 00000000000..c836af3cec8 --- /dev/null +++ b/libcudacxx/test/simd_codegen/integer/arithmetic_u8x4.cu @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep + +namespace simd = cuda::std::simd; + +using Vec_u8_x4 = simd::basic_vec>; + +__device__ Vec_u8_x4 test_operator_plus_u8_x4(Vec_u8_x4 lhs, Vec_u8_x4 rhs) +{ + return lhs + rhs; +} + +__device__ Vec_u8_x4 test_operator_minus_u8_x4(Vec_u8_x4 lhs, Vec_u8_x4 rhs) +{ + return lhs - rhs; +} + +__device__ Vec_u8_x4 test_operator_post_decrement_u8_x4(Vec_u8_x4 in) +{ + (void) in--; + return in; +} + +__device__ Vec_u8_x4 test_operator_post_increment_u8_x4(Vec_u8_x4 in) +{ + (void) in++; + return in; +} + +__device__ Vec_u8_x4 test_operator_unary_minus_u8_x4(Vec_u8_x4 in) +{ + return -in; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_unary_minus_u8_x4.*}} +; SM120f: {{.*VIADD.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_post_increment_u8_x4.*}} +; SM120f: {{.*VIADD.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_post_decrement_u8_x4.*}} +; SM120f: {{.*VIADD.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_minus_u8_x4.*}} +; SM120f: {{.*VIADD.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_operator_plus_u8_x4.*}} +; SM120f: {{.*VIADD.*}} + +*/ diff --git a/libcudacxx/test/simd_codegen/integer/bitwise_u16x2_u8x4.cu b/libcudacxx/test/simd_codegen/integer/bitwise_u16x2_u8x4.cu new file mode 100644 index 00000000000..eaffb5ded08 --- /dev/null +++ b/libcudacxx/test/simd_codegen/integer/bitwise_u16x2_u8x4.cu @@ -0,0 +1,95 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep + +namespace simd = cuda::std::simd; + +using cuda::std::uint16_t; +using cuda::std::uint8_t; + +using Vec_u16_x2 = simd::basic_vec>; +using Vec_u8_x4 = simd::basic_vec>; + +__device__ void test_bitwise_and_u16_x2(Vec_u16_x2& out, Vec_u16_x2& lhs, Vec_u16_x2& rhs) +{ + out = lhs & rhs; +} + +__device__ void test_bitwise_or_u16_x2(Vec_u16_x2& out, Vec_u16_x2& lhs, Vec_u16_x2& rhs) +{ + out = lhs | rhs; +} + +__device__ void test_bitwise_xor_u16_x2(Vec_u16_x2& out, Vec_u16_x2& lhs, Vec_u16_x2& rhs) +{ + out = lhs ^ rhs; +} + +__device__ void test_bitwise_not_u16_x2(Vec_u16_x2& out, Vec_u16_x2& in) +{ + out = ~in; +} + +__device__ void test_bitwise_and_u8_x4(Vec_u8_x4& out, Vec_u8_x4& lhs, Vec_u8_x4& rhs) +{ + out = lhs & rhs; +} + +__device__ void test_bitwise_or_u8_x4(Vec_u8_x4& out, Vec_u8_x4& lhs, Vec_u8_x4& rhs) +{ + out = lhs | rhs; +} + +__device__ void test_bitwise_xor_u8_x4(Vec_u8_x4& out, Vec_u8_x4& lhs, Vec_u8_x4& rhs) +{ + out = lhs ^ rhs; +} + +__device__ void test_bitwise_not_u8_x4(Vec_u8_x4& out, Vec_u8_x4& in) +{ + out = ~in; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_not_u8_x4.*}} +; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}} +; SMXX: {{.*LOP3.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_xor_u8_x4.*}} +; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}} +; SMXX: {{.*LOP3.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_or_u8_x4.*}} +; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}} +; SMXX: {{.*LOP3.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_and_u8_x4.*}} +; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}} +; SMXX: {{.*LOP3.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_not_u16_x2.*}} +; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}} +; SMXX: {{.*LOP3.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_xor_u16_x2.*}} +; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}} +; SMXX: {{.*LOP3.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_or_u16_x2.*}} +; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}} +; SMXX: {{.*LOP3.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_bitwise_and_u16_x2.*}} +; SMXX-NOT: {{.*(LD\.E\.U(8|16)|PRMT|SHF|I2I|IMAD\.SHL).*}} +; SMXX: {{.*LOP3.*}} + +*/ diff --git a/libcudacxx/test/simd_codegen/minus_f32x2.cu b/libcudacxx/test/simd_codegen/minus_f32x2.cu deleted file mode 100644 index 2c00e62b9ee..00000000000 --- a/libcudacxx/test/simd_codegen/minus_f32x2.cu +++ /dev/null @@ -1,39 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++ in the CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#include // IWYU pragma: keep -#include - -namespace simd = cuda::std::simd; - -using Vec_f32_4 = simd::basic_vec>; - -extern "C" __global__ void test_operator_minus_f32_4(const float* lhs, const float* rhs, float* out) -{ - const cuda::std::array lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; - const cuda::std::array rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; - - const Vec_f32_4 lhs_vec(lhs_values); - const Vec_f32_4 rhs_vec(rhs_values); - const Vec_f32_4 result = lhs_vec - rhs_vec; - - out[0] = result[0]; - out[1] = result[1]; - out[2] = result[2]; - out[3] = result[3]; -} - -/* - -; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_minus_f32_4 -; SM100: {{.*FADD2.*}} -; SM100: {{.*FADD2.*}} - -*/ diff --git a/libcudacxx/test/simd_codegen/multiplies_bf16.cu b/libcudacxx/test/simd_codegen/multiplies_bf16.cu deleted file mode 100644 index 3036c4fd526..00000000000 --- a/libcudacxx/test/simd_codegen/multiplies_bf16.cu +++ /dev/null @@ -1,52 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++ in the CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#include // IWYU pragma: keep -#include - -#if _CCCL_HAS_NVBF16() - -# include - -namespace simd = cuda::std::simd; - -using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; - -extern "C" __global__ void -test_operator_multiplies_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, __nv_bfloat16* out) -{ - const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; - const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; - - const Vec_bf16_4 lhs_vec(lhs_values); - const Vec_bf16_4 rhs_vec(rhs_values); - const Vec_bf16_4 result = lhs_vec * rhs_vec; - - out[0] = result[0]; - out[1] = result[1]; - out[2] = result[2]; - out[3] = result[3]; -} - -/* - -; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_multiplies_bf16_4 -; SM80: {{.*HFMA2.*BF16.*}} -; SM80: {{.*HFMA2.*BF16.*}} -; SM90: {{.*HFMA2.*BF16.*}} -; SM90: {{.*HMUL2.*BF16.*}} -; SM100: {{.*HFMA2.*BF16.*}} -; SM100: {{.*HMUL2.*BF16.*}} -; SM120: {{.*HFMA2.*BF16.*}} -; SM120: {{.*HMUL2.*BF16.*}} - -*/ - -#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/plus_bf16.cu b/libcudacxx/test/simd_codegen/plus_bf16.cu deleted file mode 100644 index 3d192fdad83..00000000000 --- a/libcudacxx/test/simd_codegen/plus_bf16.cu +++ /dev/null @@ -1,52 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++ in the CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#include // IWYU pragma: keep -#include - -#if _CCCL_HAS_NVBF16() - -# include - -namespace simd = cuda::std::simd; - -using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; - -extern "C" __global__ void -test_operator_plus_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, __nv_bfloat16* out) -{ - const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; - const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; - - const Vec_bf16_4 lhs_vec(lhs_values); - const Vec_bf16_4 rhs_vec(rhs_values); - const Vec_bf16_4 result = lhs_vec + rhs_vec; - - out[0] = result[0]; - out[1] = result[1]; - out[2] = result[2]; - out[3] = result[3]; -} - -/* - -; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_bf16_4 -; SM80: {{.*HFMA2.*BF16.*}} -; SM80: {{.*HFMA2.*BF16.*}} -; SM90: {{.*HFMA2.*BF16.*}} -; SM90: {{.*HADD2.*BF16.*}} -; SM100: {{.*HFMA2.*BF16.*}} -; SM100: {{.*HADD2.*BF16.*}} -; SM120: {{.*HFMA2.*BF16.*}} -; SM120: {{.*HADD2.*BF16.*}} - -*/ - -#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/plus_f32x2.cu b/libcudacxx/test/simd_codegen/plus_f32x2.cu deleted file mode 100644 index b0caf15341b..00000000000 --- a/libcudacxx/test/simd_codegen/plus_f32x2.cu +++ /dev/null @@ -1,39 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++ in the CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#include // IWYU pragma: keep -#include - -namespace simd = cuda::std::simd; - -using Vec_f32_4 = simd::basic_vec>; - -extern "C" __global__ void test_operator_plus_f32_4(const float* lhs, const float* rhs, float* out) -{ - const cuda::std::array lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; - const cuda::std::array rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; - - const Vec_f32_4 lhs_vec(lhs_values); - const Vec_f32_4 rhs_vec(rhs_values); - const Vec_f32_4 result = lhs_vec + rhs_vec; - - out[0] = result[0]; - out[1] = result[1]; - out[2] = result[2]; - out[3] = result[3]; -} - -/* - -; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_f32_4 -; SM100: {{.*FADD2.*}} -; SM100: {{.*FADD2.*}} - -*/ From cbb03d925245fc24a6eb988a8413e57c6182b503 Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 19 May 2026 15:43:59 -0700 Subject: [PATCH 2/4] `cuda::simd` Add `saturation_add` --- docs/libcudacxx/extended_api.rst | 1 + docs/libcudacxx/extended_api/simd.rst | 24 ++++ .../extended_api/simd/saturating_add.rst | 78 +++++++++++ .../include/cuda/__simd/saturating_add.h | 89 +++++++++++++ .../include/cuda/__simd/simd_intrinsics.h | 104 +++++++++++++++ .../cuda/__simd/simd_intrinsics_array.h | 83 ++++++++++++ libcudacxx/include/cuda/simd | 27 ++++ .../include/cuda/std/__internal/features.h | 12 ++ .../include/cuda/std/__internal/namespaces.h | 2 + .../include/cuda/std/__simd/basic_vec.h | 2 - .../simd/simd.non_std/saturation_add.pass.cpp | 124 ++++++++++++++++++ libcudacxx/test/simd_codegen/CMakeLists.txt | 10 ++ .../saturation_add/saturating_add.cu | 54 ++++++++ 13 files changed, 608 insertions(+), 2 deletions(-) create mode 100644 docs/libcudacxx/extended_api/simd.rst create mode 100644 docs/libcudacxx/extended_api/simd/saturating_add.rst create mode 100644 libcudacxx/include/cuda/__simd/saturating_add.h create mode 100644 libcudacxx/include/cuda/__simd/simd_intrinsics.h create mode 100644 libcudacxx/include/cuda/__simd/simd_intrinsics_array.h create mode 100644 libcudacxx/include/cuda/simd create mode 100644 libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/saturation_add.pass.cpp create mode 100644 libcudacxx/test/simd_codegen/saturation_add/saturating_add.cu diff --git a/docs/libcudacxx/extended_api.rst b/docs/libcudacxx/extended_api.rst index a2416b09f94..dbebf3e60e3 100644 --- a/docs/libcudacxx/extended_api.rst +++ b/docs/libcudacxx/extended_api.rst @@ -20,6 +20,7 @@ Extended API extended_api/type_traits extended_api/vector_tuple_protocol extended_api/numeric + extended_api/simd extended_api/random extended_api/memory extended_api/memory_resource diff --git a/docs/libcudacxx/extended_api/simd.rst b/docs/libcudacxx/extended_api/simd.rst new file mode 100644 index 00000000000..080a3b48331 --- /dev/null +++ b/docs/libcudacxx/extended_api/simd.rst @@ -0,0 +1,24 @@ +.. _libcudacxx-extended-api-simd: + +SIMD +==== + +.. toctree:: + :hidden: + :maxdepth: 1 + + simd/saturating_add + +.. list-table:: + :widths: 25 45 30 30 + :header-rows: 1 + + * - **Header** + - **Content** + - **CCCL Availability** + - **CUDA Toolkit Availability** + + * - :ref:`cuda::simd::saturating_add ` + - Performs element-wise saturating addition of two ``basic_vec`` objects + - CCCL 3.5.0 + - CUDA 13.5 diff --git a/docs/libcudacxx/extended_api/simd/saturating_add.rst b/docs/libcudacxx/extended_api/simd/saturating_add.rst new file mode 100644 index 00000000000..4f34f582256 --- /dev/null +++ b/docs/libcudacxx/extended_api/simd/saturating_add.rst @@ -0,0 +1,78 @@ +.. _libcudacxx-extended-api-simd-saturating-add: + +``cuda::simd::saturating_add`` +============================== + +Defined in the ```` header. + +.. code:: cuda + + namespace cuda::simd { + + template + [[nodiscard]] __host__ __device__ constexpr + cuda::std::simd::basic_vec saturating_add( + const cuda::std::simd::basic_vec& lhs, + const cuda::std::simd::basic_vec& rhs) noexcept; + + } // namespace cuda::simd + +The function ``cuda::simd::saturating_add`` performs element-wise saturating addition of two ``cuda::std::simd::basic_vec`` objects. + +For each element ``i`` in the input vectors, the result is equivalent to: + +.. code:: cuda + + cuda::std::saturating_add(lhs[i], rhs[i]) + +**Parameters** + +- ``lhs``: The left-hand side input vector. +- ``rhs``: The right-hand side input vector. + +**Return value** + +Returns a ``cuda::std::simd::basic_vec`` where each element contains the saturated sum of the corresponding elements in ``lhs`` and ``rhs``. + +**Constraints** + +- ``T`` must be an `integer type `__. + +**Performance considerations** + +- Packed 8-bit and 16-bit integer vectors perform saturating addition using ``VIADD.S8x4``, ``VIADD.U8x4``, ``VIADD.S16x2``, ``VIADD.U16x2`` on ``SM120f``. + +Example +------- + +.. code:: cuda + + #include + #include + #include + #include + + namespace simd = cuda::std::simd; + + __global__ void kernel() + { + using vec_t = simd::basic_vec>; + + cuda::std::array lhs_values{250, 10, 20, 30}; + cuda::std::array rhs_values{10, 20, 30, 40}; + vec_t lhs(lhs_values); + vec_t rhs(rhs_values); + + vec_t result = cuda::simd::saturating_add(lhs, rhs); + + assert(result[0] == 255); + assert(result[1] == 30); + assert(result[2] == 50); + assert(result[3] == 70); + } + + int main() + { + kernel<<<1, 1>>>(); + cudaDeviceSynchronize(); + } diff --git a/libcudacxx/include/cuda/__simd/saturating_add.h b/libcudacxx/include/cuda/__simd/saturating_add.h new file mode 100644 index 00000000000..c5824aeffef --- /dev/null +++ b/libcudacxx/include/cuda/__simd/saturating_add.h @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___SIMD_SATURATING_ADD_H +#define _CUDA___SIMD_SATURATING_ADD_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#if _CCCL_HAS_SIMD_SAT() +# include +# include +# include +#endif // _CCCL_HAS_SIMD_SAT() + +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_SIMD + +_CCCL_TEMPLATE(typename _Tp, typename _Abi) +_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) +[[nodiscard]] _CCCL_API constexpr ::cuda::std::simd::basic_vec<_Tp, _Abi> saturating_add( + const ::cuda::std::simd::basic_vec<_Tp, _Abi>& __lhs, const ::cuda::std::simd::basic_vec<_Tp, _Abi>& __rhs) noexcept +{ + using __basic_vec_t = ::cuda::std::simd::basic_vec<_Tp, _Abi>; + using __simd_storage_t = typename __basic_vec_t::_Storage; + constexpr auto __size = __basic_vec_t::__size; + +#if _CCCL_HAS_SIMD_SAT() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (sizeof(_Tp) == 1 || sizeof(_Tp) == 2) + { + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({ + using __unsigned_storage_t = ::cuda::std::simd::__simd_storage_u32_t<__simd_storage_t>; + const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_); + const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_); + __unsigned_storage_t __result_u{}; + if constexpr (sizeof(_Tp) == 2) + { + __result_u = ::cuda::simd::__vadd_sat_16bit_x2<_Tp>(__lhs_u, __rhs_u); + } + else + { + __result_u = ::cuda::simd::__vadd_sat_8bit_x4<_Tp>(__lhs_u, __rhs_u); + } + const auto __result_s = + ::cuda::std::simd::__copy_from_unsigned_storage<__simd_storage_t>(__result_u); + return __basic_vec_t{__result_s, __basic_vec_t::__storage_tag}; + })); + } + } +#endif // _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION() && _CCCL_HAS_SIMD_SAT() + + __simd_storage_t __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (::cuda::std::simd::__simd_size_type __i = 0; __i < __size; ++__i) + { + __result.__data[__i] = ::cuda::std::saturating_add(__lhs.__s_.__data[__i], __rhs.__s_.__data[__i]); + } + return __basic_vec_t{__result, __basic_vec_t::__storage_tag}; +} + +_CCCL_END_NAMESPACE_CUDA_SIMD + +#include + +#endif // _CUDA___SIMD_SATURATING_ADD_H diff --git a/libcudacxx/include/cuda/__simd/simd_intrinsics.h b/libcudacxx/include/cuda/__simd/simd_intrinsics.h new file mode 100644 index 00000000000..cc241740520 --- /dev/null +++ b/libcudacxx/include/cuda/__simd/simd_intrinsics.h @@ -0,0 +1,104 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___SIMD_SIMD_INTRINSICS_H +#define _CUDA___SIMD_SIMD_INTRINSICS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_HAS_SIMD_SAT() + +# include +# include + +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_SIMD + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_u16x2( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept +{ +# if _CCCL_HAS_SIMD_SAT_INTRINSICS() + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddus2(__lhs, __rhs);)) +# elif _CCCL_HAS_SIMD_SAT_PTX() + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({ + ::cuda::std::uint32_t __result{}; + asm("add.sat.u16x2 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_u16x2: Unsupported architecture"); + return ::cuda::std::uint32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_s16x2( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept +{ +# if _CCCL_HAS_SIMD_SAT_INTRINSICS() + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddss2(__lhs, __rhs);)) +# elif _CCCL_HAS_SIMD_SAT_PTX() + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({ + ::cuda::std::uint32_t __result{}; + asm("add.sat.s16x2 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_s16x2: Unsupported architecture"); + return ::cuda::std::uint32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_u8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept +{ +# if _CCCL_HAS_SIMD_SAT_INTRINSICS() + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddus4(__lhs, __rhs);)) +# elif _CCCL_HAS_SIMD_SAT_PTX() + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({ + ::cuda::std::uint32_t __result{}; + asm("add.sat.u8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_u8x4: Unsupported architecture"); + return ::cuda::std::uint32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_s8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept +{ +# if _CCCL_HAS_SIMD_SAT_INTRINSICS() + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddss4(__lhs, __rhs);)) +# elif _CCCL_HAS_SIMD_SAT_PTX() + NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({ + ::cuda::std::uint32_t __result{}; + asm("add.sat.s8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_s8x4: Unsupported architecture"); + return ::cuda::std::uint32_t{}; +} + +_CCCL_END_NAMESPACE_CUDA_SIMD + +# include + +#endif // _CCCL_HAS_SIMD_SAT() +#endif // _CUDA___SIMD_SIMD_INTRINSICS_H diff --git a/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h b/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h new file mode 100644 index 00000000000..71fd0613702 --- /dev/null +++ b/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h @@ -0,0 +1,83 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___SIMD_SIMD_INTRINSICS_ARRAY_H +#define _CUDA___SIMD_SIMD_INTRINSICS_ARRAY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_HAS_SIMD_SAT() + +# include +# include +# include +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_SIMD + +//---------------------------------------------------------------------------------------------------------------------- +// device-only functions + +template +[[nodiscard]] _CCCL_DEVICE_API constexpr ::cuda::std::simd::__array_u32_t<_Np> __vadd_sat_16bit_x2( + const ::cuda::std::simd::__array_u32_t<_Np>& __lhs_u, const ::cuda::std::simd::__array_u32_t<_Np>& __rhs_u) noexcept +{ + ::cuda::std::simd::__array_u32_t<_Np> __result_u; + _CCCL_PRAGMA_UNROLL_FULL() + for (::cuda::std::size_t __i = 0; __i < _Np; ++__i) + { + if constexpr (::cuda::std::is_unsigned_v<_Tp>) + { + __result_u[__i] = ::cuda::simd::__vadd_sat_u16x2(__lhs_u[__i], __rhs_u[__i]); + } + else + { + __result_u[__i] = ::cuda::simd::__vadd_sat_s16x2(__lhs_u[__i], __rhs_u[__i]); + } + } + return __result_u; +} + +template +[[nodiscard]] _CCCL_DEVICE_API constexpr ::cuda::std::simd::__array_u32_t<_Np> __vadd_sat_8bit_x4( + const ::cuda::std::simd::__array_u32_t<_Np>& __lhs_u, const ::cuda::std::simd::__array_u32_t<_Np>& __rhs_u) noexcept +{ + ::cuda::std::simd::__array_u32_t<_Np> __result_u; + _CCCL_PRAGMA_UNROLL_FULL() + for (::cuda::std::size_t __i = 0; __i < _Np; ++__i) + { + if constexpr (::cuda::std::is_unsigned_v<_Tp>) + { + __result_u[__i] = ::cuda::simd::__vadd_sat_u8x4(__lhs_u[__i], __rhs_u[__i]); + } + else + { + __result_u[__i] = ::cuda::simd::__vadd_sat_s8x4(__lhs_u[__i], __rhs_u[__i]); + } + } + return __result_u; +} + +_CCCL_END_NAMESPACE_CUDA_SIMD + +# include + +#endif // _CCCL_HAS_SIMD_SAT() +#endif // _CUDA___SIMD_SIMD_INTRINSICS_ARRAY_H diff --git a/libcudacxx/include/cuda/simd b/libcudacxx/include/cuda/simd new file mode 100644 index 00000000000..327ba22f527 --- /dev/null +++ b/libcudacxx/include/cuda/simd @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_SIMD +#define _CUDA_SIMD + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#endif // _CUDA_SIMD diff --git a/libcudacxx/include/cuda/std/__internal/features.h b/libcudacxx/include/cuda/std/__internal/features.h index 3b8d43c521f..a9b01dfc274 100644 --- a/libcudacxx/include/cuda/std/__internal/features.h +++ b/libcudacxx/include/cuda/std/__internal/features.h @@ -111,6 +111,18 @@ #define _CCCL_HAS_SIMD_8BIT() \ ((_CCCL_HAS_SIMD_8BIT_PTX() || _CCCL_HAS_SIMD_8BIT_INTRINSICS()) && !_CCCL_TILE_COMPILATION()) +// TODO(fbusato): CTK 13.2 produces non-optimal code for SIMD SAT intrinsics +#define _CCCL_HAS_SIMD_SAT_INTRINSICS() 0 +#define _CCCL_HAS_SIMD_SAT_PTX() (__cccl_ptx_isa >= 920ULL) +#define _CCCL_HAS_SIMD_SAT() \ + (_CCCL_HAS_SIMD_SAT_PTX() || _CCCL_HAS_SIMD_SAT_INTRINSICS()) && _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION() + +// TODO(fbusato): CTK 13.2 produces non-optimal code for SIMD SAT intrinsics +#define _CCCL_HAS_SIMD_SAT_INTRINSICS() 0 +#define _CCCL_HAS_SIMD_SAT_PTX() (__cccl_ptx_isa >= 920ULL) +#define _CCCL_HAS_SIMD_SAT() \ + (_CCCL_HAS_SIMD_SAT_PTX() || _CCCL_HAS_SIMD_SAT_INTRINSICS()) && _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION() + // Third party libraries #if (__has_include() || __has_include()) && \ diff --git a/libcudacxx/include/cuda/std/__internal/namespaces.h b/libcudacxx/include/cuda/std/__internal/namespaces.h index 2cca27fc6be..1e3b8a74004 100644 --- a/libcudacxx/include/cuda/std/__internal/namespaces.h +++ b/libcudacxx/include/cuda/std/__internal/namespaces.h @@ -80,6 +80,8 @@ // Namespaces related to #define _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD _CCCL_BEGIN_NAMESPACE(cuda::std::simd) #define _CCCL_END_NAMESPACE_CUDA_STD_SIMD _CCCL_END_NAMESPACE(cuda::std::simd) +#define _CCCL_BEGIN_NAMESPACE_CUDA_SIMD _CCCL_BEGIN_NAMESPACE(cuda::simd) +#define _CCCL_END_NAMESPACE_CUDA_SIMD _CCCL_END_NAMESPACE(cuda::simd) // Namespaces related to #define _CCCL_BEGIN_NAMESPACE_CUDA_STD_RANGES _CCCL_BEGIN_NAMESPACE(cuda::std::ranges) diff --git a/libcudacxx/include/cuda/std/__simd/basic_vec.h b/libcudacxx/include/cuda/std/__simd/basic_vec.h index 37824dbc48c..7d0796f779d 100644 --- a/libcudacxx/include/cuda/std/__simd/basic_vec.h +++ b/libcudacxx/include/cuda/std/__simd/basic_vec.h @@ -70,7 +70,6 @@ class basic_vec<_Tp, _Abi, enable_if_t<__is_vectorizable_v<_Tp> && __is_enabled_ using value_type = _Tp; using mask_type = basic_mask; -private: template friend class basic_mask; @@ -100,7 +99,6 @@ class basic_vec<_Tp, _Abi, enable_if_t<__is_vectorizable_v<_Tp> && __is_enabled_ __s_.__set(__i, __v); } -public: using abi_type = _Abi; using iterator = __simd_iterator; diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/saturation_add.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/saturation_add.pass.cpp new file mode 100644 index 00000000000..16f77eb8d3b --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/saturation_add.pass.cpp @@ -0,0 +1,124 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// template +// constexpr basic_vec cuda::simd::saturating_add( +// const basic_vec& lhs, const basic_vec& rhs) noexcept; + +#include +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" + +namespace simd = cuda::std::simd; + +template +using fixed_size_vec = simd::basic_vec>; + +template +inline constexpr bool has_saturating_add = false; + +template +inline constexpr bool has_saturating_add< + Vec, + cuda::std::void_t(), cuda::std::declval()))>> = true; + +template +TEST_FUNC constexpr void test_values(cuda::std::array lhs_values, cuda::std::array rhs_values) +{ + using Vec = simd::basic_vec>; + Vec lhs(lhs_values); + Vec rhs(rhs_values); + + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::simd::saturating_add(lhs, rhs))); + + Vec result = cuda::simd::saturating_add(lhs, rhs); + for (int i = 0; i < N; ++i) + { + assert(result[i] == cuda::std::saturating_add(lhs_values[i], rhs_values[i])); + } +} + +template +TEST_FUNC constexpr void test_size() +{ + constexpr auto min_val = cuda::std::numeric_limits::min(); + constexpr auto max_val = cuda::std::numeric_limits::max(); + + if constexpr (cuda::std::is_signed_v) + { + cuda::std::array lhs_values{max_val, min_val, T{10}}; + cuda::std::array rhs_values{T{1}, T{-1}, T{-20}}; + if constexpr (N > 3) + { + lhs_values[3] = T{-20}; + rhs_values[3] = T{10}; + } + test_values(lhs_values, rhs_values); + } + else + { + cuda::std::array lhs_values{max_val, min_val, T{10}}; + cuda::std::array rhs_values{T{1}, T{1}, T{20}}; + if constexpr (N > 3) + { + lhs_values[3] = T{20}; + rhs_values[3] = T{30}; + } + test_values(lhs_values, rhs_values); + } +} + +template +TEST_FUNC constexpr void test() +{ + test_size(); + test_size(); +} + +TEST_FUNC constexpr bool test_all() +{ + static_assert(!has_saturating_add>); + + test(); + test(); + test(); + test(); + test(); +#if _CCCL_HAS_INT128() + test<__int128_t>(); +#endif // _CCCL_HAS_INT128() + + test(); + test(); + test(); + test(); + test(); +#if _CCCL_HAS_INT128() + test<__uint128_t>(); +#endif // _CCCL_HAS_INT128() + + return true; +} + +int main(int, char**) +{ + assert(test_all()); + static_assert(test_all()); + return 0; +} diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt index 4c073d6069f..79b98ac4693 100644 --- a/libcudacxx/test/simd_codegen/CMakeLists.txt +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -32,6 +32,16 @@ find_program(bash "bash" REQUIRED) set(libcudacxx_simd_codegen_tests) if (NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") file(GLOB libcudacxx_simd_codegen_tests "floating_point/*.cu" "integer/*.cu") + if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.2) + file( + GLOB libcudacxx_simd_codegen_saturation_add_tests + "saturation_add/*.cu" + ) + list( + APPEND libcudacxx_simd_codegen_tests + ${libcudacxx_simd_codegen_saturation_add_tests} + ) + endif() endif() set(simd_codegen_cuda_archs 80 90) diff --git a/libcudacxx/test/simd_codegen/saturation_add/saturating_add.cu b/libcudacxx/test/simd_codegen/saturation_add/saturating_add.cu new file mode 100644 index 00000000000..193079e6fac --- /dev/null +++ b/libcudacxx/test/simd_codegen/saturation_add/saturating_add.cu @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep + +namespace simd = cuda::std::simd; + +using Vec_s16_x2 = simd::basic_vec>; +using Vec_s8_x4 = simd::basic_vec>; +using Vec_u16_x2 = simd::basic_vec>; +using Vec_u8_x4 = simd::basic_vec>; + +__device__ Vec_u16_x2 test_saturating_add_u16_x2(Vec_u16_x2 lhs, Vec_u16_x2 rhs) +{ + return cuda::simd::saturating_add(lhs, rhs); +} + +__device__ Vec_s16_x2 test_saturating_add_s16_x2(Vec_s16_x2 lhs, Vec_s16_x2 rhs) +{ + return cuda::simd::saturating_add(lhs, rhs); +} + +__device__ Vec_u8_x4 test_saturating_add_u8_x4(Vec_u8_x4 lhs, Vec_u8_x4 rhs) +{ + return cuda::simd::saturating_add(lhs, rhs); +} + +__device__ Vec_s8_x4 test_saturating_add_s8_x4(Vec_s8_x4 lhs, Vec_s8_x4 rhs) +{ + return cuda::simd::saturating_add(lhs, rhs); +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_saturating_add_s8_x4.*}} +; SM120f: {{.*VIADD\.S8x4\.ISAT.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_saturating_add_u8_x4.*}} +; SM120f: {{.*VIADD\.U8x4\.ISAT.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_saturating_add_s16_x2.*}} +; SM120f: {{.*VIADD\.S16x2\.ISAT.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_saturating_add_u16_x2.*}} +; SM120f: {{.*VIADD\.16x2\.ISAT.*}} + +*/ From f690b2e444464764b1e2ad73dda085ebc02094db Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 19 May 2026 15:44:00 -0700 Subject: [PATCH 3/4] `cuda::simd` Add `abs_diff` --- docs/libcudacxx/extended_api/simd.rst | 6 + .../libcudacxx/extended_api/simd/abs_diff.rst | 84 +++++++++++ .../include/cuda/__simd/simd_intrinsics.h | 64 +++++++-- .../cuda/__simd/simd_intrinsics_array.h | 34 ++++- libcudacxx/include/cuda/__simd/vabsdiff.h | 85 +++++++++++ libcudacxx/include/cuda/simd | 1 + .../include/cuda/std/__internal/features.h | 16 ++- .../__simd/specializations/simd_intrinsics.h | 37 +++-- .../simd/simd.non_std/vabsdiff.pass.cpp | 133 ++++++++++++++++++ libcudacxx/test/simd_codegen/CMakeLists.txt | 7 +- .../test/simd_codegen/vabsdiff/vabsdiff.cu | 44 ++++++ 11 files changed, 473 insertions(+), 38 deletions(-) create mode 100644 docs/libcudacxx/extended_api/simd/abs_diff.rst create mode 100644 libcudacxx/include/cuda/__simd/vabsdiff.h create mode 100644 libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/vabsdiff.pass.cpp create mode 100644 libcudacxx/test/simd_codegen/vabsdiff/vabsdiff.cu diff --git a/docs/libcudacxx/extended_api/simd.rst b/docs/libcudacxx/extended_api/simd.rst index 080a3b48331..524b1bb8cb1 100644 --- a/docs/libcudacxx/extended_api/simd.rst +++ b/docs/libcudacxx/extended_api/simd.rst @@ -8,6 +8,7 @@ SIMD :maxdepth: 1 simd/saturating_add + simd/abs_diff .. list-table:: :widths: 25 45 30 30 @@ -22,3 +23,8 @@ SIMD - Performs element-wise saturating addition of two ``basic_vec`` objects - CCCL 3.5.0 - CUDA 13.5 + + * - :ref:`cuda::simd::abs_diff ` + - Performs element-wise absolute difference of two integer ``basic_vec`` objects + - CCCL 3.5.0 + - CUDA 13.5 diff --git a/docs/libcudacxx/extended_api/simd/abs_diff.rst b/docs/libcudacxx/extended_api/simd/abs_diff.rst new file mode 100644 index 00000000000..2bdc4011fa3 --- /dev/null +++ b/docs/libcudacxx/extended_api/simd/abs_diff.rst @@ -0,0 +1,84 @@ +.. _libcudacxx-extended-api-simd-abs-diff: + +``cuda::simd::abs_diff`` +======================== + +Defined in the ```` header. + +.. code:: cuda + + namespace cuda::simd { + + template + [[nodiscard]] __host__ __device__ constexpr + cuda::std::simd::basic_vec, Abi> abs_diff( + const cuda::std::simd::basic_vec& lhs, + const cuda::std::simd::basic_vec& rhs) noexcept; + + } // namespace cuda::simd + +The function ``cuda::simd::abs_diff`` performs element-wise absolute difference of two integer ``cuda::std::simd::basic_vec`` objects. + +For each element ``i`` in the input vectors, the result is equivalent to: + +.. code:: cuda + + abs(lhs[i] - rhs[i]) + +The return type is always an *unsigned* ``basic_vec`` with the same ABI as the input vectors. + +**Parameters** + +- ``lhs``: The left-hand side input vector. +- ``rhs``: The right-hand side input vector. + +**Return value** + +Returns a ``cuda::std::simd::basic_vec, Abi>`` where each element contains the unsigned absolute difference of the corresponding elements in ``lhs`` and ``rhs``. + +**Constraints** + +- ``T`` must be an integer type. + +**Performance considerations** + +- Packed 8-bit integer vectors perform absolute difference using: + + - ``VABSDIFF4`` on ``SM80``, ``SM86``, ``SM87``, ``SM89``, ``SM90``, ``SM100``, ``SM103``, and ``SM110``. + - ``VIMNMX.S8x4/U8x4`` on ``SM120f``. + +Example +------- + +.. code:: cuda + + #include + #include + #include + #include + + namespace simd = cuda::std::simd; + + __global__ void kernel() + { + using vec_t = simd::basic_vec>; + using result_vec_t = simd::basic_vec>; + + cuda::std::array lhs_values{-128, 10, 20, 30}; + cuda::std::array rhs_values{127, 20, -30, 40}; + + vec_t lhs(lhs_values); + vec_t rhs(rhs_values); + result_vec_t result = cuda::simd::abs_diff(lhs, rhs); + + assert(result[0] == 255); + assert(result[1] == 10); + assert(result[2] == 50); + assert(result[3] == 10); + } + + int main() + { + kernel<<<1, 1>>>(); + cudaDeviceSynchronize(); + } diff --git a/libcudacxx/include/cuda/__simd/simd_intrinsics.h b/libcudacxx/include/cuda/__simd/simd_intrinsics.h index cc241740520..cb1902eaa62 100644 --- a/libcudacxx/include/cuda/__simd/simd_intrinsics.h +++ b/libcudacxx/include/cuda/__simd/simd_intrinsics.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if _CCCL_HAS_SIMD_SAT() +#if _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() # include # include @@ -32,18 +32,20 @@ _CCCL_BEGIN_NAMESPACE_CUDA_SIMD +# if _CCCL_HAS_SIMD_SAT() + [[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_u16x2( [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept { -# if _CCCL_HAS_SIMD_SAT_INTRINSICS() +# if _CCCL_HAS_SIMD_SAT_INTRINSICS() NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddus2(__lhs, __rhs);)) -# elif _CCCL_HAS_SIMD_SAT_PTX() +# elif _CCCL_HAS_SIMD_SAT_PTX() NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({ ::cuda::std::uint32_t __result{}; asm("add.sat.u16x2 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs)); return __result; })) -# endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX() +# endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX() _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_u16x2: Unsupported architecture"); return ::cuda::std::uint32_t{}; } @@ -51,15 +53,15 @@ _CCCL_BEGIN_NAMESPACE_CUDA_SIMD [[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_s16x2( [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept { -# if _CCCL_HAS_SIMD_SAT_INTRINSICS() +# if _CCCL_HAS_SIMD_SAT_INTRINSICS() NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddss2(__lhs, __rhs);)) -# elif _CCCL_HAS_SIMD_SAT_PTX() +# elif _CCCL_HAS_SIMD_SAT_PTX() NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({ ::cuda::std::uint32_t __result{}; asm("add.sat.s16x2 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs)); return __result; })) -# endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX() +# endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX() _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_s16x2: Unsupported architecture"); return ::cuda::std::uint32_t{}; } @@ -67,15 +69,15 @@ _CCCL_BEGIN_NAMESPACE_CUDA_SIMD [[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_u8x4( [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept { -# if _CCCL_HAS_SIMD_SAT_INTRINSICS() +# if _CCCL_HAS_SIMD_SAT_INTRINSICS() NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddus4(__lhs, __rhs);)) -# elif _CCCL_HAS_SIMD_SAT_PTX() +# elif _CCCL_HAS_SIMD_SAT_PTX() NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({ ::cuda::std::uint32_t __result{}; asm("add.sat.u8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs)); return __result; })) -# endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX() +# endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX() _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_u8x4: Unsupported architecture"); return ::cuda::std::uint32_t{}; } @@ -83,22 +85,56 @@ _CCCL_BEGIN_NAMESPACE_CUDA_SIMD [[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vadd_sat_s8x4( [[maybe_unused]] const ::cuda::std::uint32_t __lhs, [[maybe_unused]] const ::cuda::std::uint32_t __rhs) noexcept { -# if _CCCL_HAS_SIMD_SAT_INTRINSICS() +# if _CCCL_HAS_SIMD_SAT_INTRINSICS() NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, (return ::__vaddss4(__lhs, __rhs);)) -# elif _CCCL_HAS_SIMD_SAT_PTX() +# elif _CCCL_HAS_SIMD_SAT_PTX() NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, ({ ::cuda::std::uint32_t __result{}; asm("add.sat.s8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs)); return __result; })) -# endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX() +# endif // _CCCL_HAS_SIMD_SAT_INTRINSICS() || _CCCL_HAS_SIMD_SAT_PTX() _CCCL_VERIFY(false, "cuda::__simd::__vadd_sat_s8x4: Unsupported architecture"); return ::cuda::std::uint32_t{}; } +# endif // _CCCL_HAS_SIMD_SAT() + +# if _CCCL_HAS_SIMD_VABSDIFF() + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vabsdiff_u8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, + [[maybe_unused]] const ::cuda::std::uint32_t __rhs, + [[maybe_unused]] const ::cuda::std::uint32_t __c) noexcept +{ + NV_IF_TARGET(NV_IS_DEVICE, ({ + ::cuda::std::uint32_t __result{}; + asm("vabsdiff4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__c)); + return __result; + })) + _CCCL_VERIFY(false, "cuda::__simd::__vabsdiff_u8x4: Unsupported architecture"); + return ::cuda::std::uint32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __vabsdiff_s8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, + [[maybe_unused]] const ::cuda::std::uint32_t __rhs, + [[maybe_unused]] const ::cuda::std::uint32_t __c) noexcept +{ + NV_IF_TARGET(NV_IS_DEVICE, ({ + ::cuda::std::uint32_t __result{}; + asm("vabsdiff4.u32.s32.s32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__c)); + return __result; + })) + _CCCL_VERIFY(false, "cuda::__simd::__vabsdiff_s8x4: Unsupported architecture"); + return ::cuda::std::uint32_t{}; +} + +# endif // _CCCL_HAS_SIMD_VABSDIFF() + _CCCL_END_NAMESPACE_CUDA_SIMD # include -#endif // _CCCL_HAS_SIMD_SAT() +#endif // _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() #endif // _CUDA___SIMD_SIMD_INTRINSICS_H diff --git a/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h b/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h index 71fd0613702..4a056ba6292 100644 --- a/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h +++ b/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if _CCCL_HAS_SIMD_SAT() +#if _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() # include # include @@ -35,6 +35,8 @@ _CCCL_BEGIN_NAMESPACE_CUDA_SIMD //---------------------------------------------------------------------------------------------------------------------- // device-only functions +# if _CCCL_HAS_SIMD_SAT() + template [[nodiscard]] _CCCL_DEVICE_API constexpr ::cuda::std::simd::__array_u32_t<_Np> __vadd_sat_16bit_x2( const ::cuda::std::simd::__array_u32_t<_Np>& __lhs_u, const ::cuda::std::simd::__array_u32_t<_Np>& __rhs_u) noexcept @@ -75,9 +77,37 @@ template return __result_u; } +# endif // _CCCL_HAS_SIMD_SAT() + +# if _CCCL_HAS_SIMD_VABSDIFF() + +template +[[nodiscard]] _CCCL_DEVICE_API constexpr ::cuda::std::simd::__array_u32_t<_Np> __vabsdiff_8bit_x4( + const ::cuda::std::simd::__array_u32_t<_Np>& __lhs_u, + const ::cuda::std::simd::__array_u32_t<_Np>& __rhs_u, + const ::cuda::std::simd::__array_u32_t<_Np>& __c_u) noexcept +{ + ::cuda::std::simd::__array_u32_t<_Np> __result_u; + _CCCL_PRAGMA_UNROLL_FULL() + for (::cuda::std::size_t __i = 0; __i < _Np; ++__i) + { + if constexpr (::cuda::std::is_unsigned_v<_Tp>) + { + __result_u[__i] = ::cuda::simd::__vabsdiff_u8x4(__lhs_u[__i], __rhs_u[__i], __c_u[__i]); + } + else + { + __result_u[__i] = ::cuda::simd::__vabsdiff_s8x4(__lhs_u[__i], __rhs_u[__i], __c_u[__i]); + } + } + return __result_u; +} + +# endif // _CCCL_HAS_SIMD_VABSDIFF() + _CCCL_END_NAMESPACE_CUDA_SIMD # include -#endif // _CCCL_HAS_SIMD_SAT() +#endif // _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() #endif // _CUDA___SIMD_SIMD_INTRINSICS_ARRAY_H diff --git a/libcudacxx/include/cuda/__simd/vabsdiff.h b/libcudacxx/include/cuda/__simd/vabsdiff.h new file mode 100644 index 00000000000..2e02dd49615 --- /dev/null +++ b/libcudacxx/include/cuda/__simd/vabsdiff.h @@ -0,0 +1,85 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___SIMD_VABSDIFF_H +#define _CUDA___SIMD_VABSDIFF_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#if _CCCL_HAS_SIMD_VABSDIFF() +# include +# include +#endif // _CCCL_HAS_SIMD_VABSDIFF() + +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_SIMD + +_CCCL_TEMPLATE(typename _Tp, typename _Abi) +_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) +[[nodiscard]] _CCCL_API constexpr ::cuda::std::simd::basic_vec<::cuda::std::make_unsigned_t<_Tp>, _Abi> abs_diff( + const ::cuda::std::simd::basic_vec<_Tp, _Abi>& __lhs, const ::cuda::std::simd::basic_vec<_Tp, _Abi>& __rhs) noexcept +{ + using __result_value_t = ::cuda::std::make_unsigned_t<_Tp>; + using __basic_vec_t = ::cuda::std::simd::basic_vec<_Tp, _Abi>; + using __result_vec_t = ::cuda::std::simd::basic_vec<__result_value_t, _Abi>; + using __result_storage_t = typename __result_vec_t::_Storage; + constexpr auto __size = __basic_vec_t::__size; + +#if _CCCL_HAS_SIMD_VABSDIFF() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (sizeof(_Tp) == 1) + { + NV_IF_TARGET(NV_IS_DEVICE, ({ + constexpr ::cuda::std::simd::__simd_storage_u32_t<__result_storage_t> __c_u{}; + const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_); + const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_); + const auto __result_u = ::cuda::simd::__vabsdiff_8bit_x4<_Tp>(__lhs_u, __rhs_u, __c_u); + const auto __result_s = + ::cuda::std::simd::__copy_from_unsigned_storage<__result_storage_t>(__result_u); + return __result_vec_t{__result_s, __result_vec_t::__storage_tag}; + })); + } + } +#endif // _CCCL_HAS_SIMD_VABSDIFF() + + __result_storage_t __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (::cuda::std::simd::__simd_size_type __i = 0; __i < __size; ++__i) + { + const auto __lhs_i = static_cast<__result_value_t>(__lhs.__s_.__data[__i]); + const auto __rhs_i = static_cast<__result_value_t>(__rhs.__s_.__data[__i]); + const auto __is_less = (__lhs.__s_.__data[__i] < __rhs.__s_.__data[__i]); + __result.__data[__i] = __is_less ? (__rhs_i - __lhs_i) : (__lhs_i - __rhs_i); + } + return __result_vec_t{__result, __result_vec_t::__storage_tag}; +} + +_CCCL_END_NAMESPACE_CUDA_SIMD + +#include + +#endif // _CUDA___SIMD_VABSDIFF_H diff --git a/libcudacxx/include/cuda/simd b/libcudacxx/include/cuda/simd index 327ba22f527..c6c4889bc30 100644 --- a/libcudacxx/include/cuda/simd +++ b/libcudacxx/include/cuda/simd @@ -22,6 +22,7 @@ #endif // no system header #include +#include #include #endif // _CUDA_SIMD diff --git a/libcudacxx/include/cuda/std/__internal/features.h b/libcudacxx/include/cuda/std/__internal/features.h index a9b01dfc274..be22b4b9fb9 100644 --- a/libcudacxx/include/cuda/std/__internal/features.h +++ b/libcudacxx/include/cuda/std/__internal/features.h @@ -114,8 +114,20 @@ // TODO(fbusato): CTK 13.2 produces non-optimal code for SIMD SAT intrinsics #define _CCCL_HAS_SIMD_SAT_INTRINSICS() 0 #define _CCCL_HAS_SIMD_SAT_PTX() (__cccl_ptx_isa >= 920ULL) -#define _CCCL_HAS_SIMD_SAT() \ - (_CCCL_HAS_SIMD_SAT_PTX() || _CCCL_HAS_SIMD_SAT_INTRINSICS()) && _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION() +#define _CCCL_HAS_SIMD_SAT() \ + ((_CCCL_HAS_SIMD_SAT_PTX() || _CCCL_HAS_SIMD_SAT_INTRINSICS()) && _CCCL_CUDA_COMPILATION() \ + && !_CCCL_TILE_COMPILATION()) + +#define _CCCL_HAS_SIMD_VABSDIFF() _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION() + +// TODO(fbusato): CTK 13.2 produces non-optimal code for SIMD SAT intrinsics +#define _CCCL_HAS_SIMD_SAT_INTRINSICS() 0 +#define _CCCL_HAS_SIMD_SAT_PTX() (__cccl_ptx_isa >= 920ULL) +#define _CCCL_HAS_SIMD_SAT() \ + ((_CCCL_HAS_SIMD_SAT_PTX() || _CCCL_HAS_SIMD_SAT_INTRINSICS()) && _CCCL_CUDA_COMPILATION() \ + && !_CCCL_TILE_COMPILATION()) + +#define _CCCL_HAS_SIMD_VABSDIFF() _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION() // TODO(fbusato): CTK 13.2 produces non-optimal code for SIMD SAT intrinsics #define _CCCL_HAS_SIMD_SAT_INTRINSICS() 0 diff --git a/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h b/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h index 6e7cd7caffe..3cae299eb76 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/simd_intrinsics.h @@ -42,14 +42,13 @@ __vadd_u16x2([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint3 [[nodiscard]] _CCCL_DEVICE_API inline uint32_t __vadd_s16x2([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint32_t __rhs) noexcept { - // prevent MSVC warning + uint32_t __result{}; NV_IF_TARGET(NV_PROVIDES_SM_90, - ({ - uint32_t __result{}; - asm("add.s16x2 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs)); - return __result; - }), - (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_s16x2: Unsupported architecture"); return uint32_t{};)); + ({ asm("add.s16x2 %0, %1, %2;" + : "=r"(__result) + : "r"(__lhs), "r"(__rhs)); }), + (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_s16x2: Unsupported architecture");)); + return __result; } # if _CCCL_HAS_SIMD_8BIT() @@ -62,26 +61,26 @@ __vadd_u8x4([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint32 (return ::__vadd4(__lhs, __rhs);), // (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_u8x4: Unsupported architecture"); return uint32_t{};)); # else // ^^^ _CCCL_HAS_SIMD_8BIT_INTRINSICS() ^^^ / vvv !_CCCL_HAS_SIMD_8BIT_INTRINSICS() vvv + uint32_t __result{}; NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, - ({ - uint32_t __result{}; - asm("add.u8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs)); - return __result; - }), - (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_u8x4: Unsupported architecture"); return uint32_t{};)); + ({ asm("add.u8x4 %0, %1, %2;" + : "=r"(__result) + : "r"(__lhs), "r"(__rhs)); }), + (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_u8x4: Unsupported architecture");)); + return __result; # endif // _CCCL_HAS_SIMD_8BIT() } [[nodiscard]] _CCCL_DEVICE_API inline uint32_t __vadd_s8x4([[maybe_unused]] const uint32_t __lhs, [[maybe_unused]] const uint32_t __rhs) noexcept { + uint32_t __result{}; NV_IF_TARGET(NV_HAS_FEATURE_SM_120f, - ({ - uint32_t __result{}; - asm("add.s8x4 %0, %1, %2;" : "=r"(__result) : "r"(__lhs), "r"(__rhs)); - return __result; - }), - (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_s8x4: Unsupported architecture"); return uint32_t{};)); + ({ asm("add.s8x4 %0, %1, %2;" + : "=r"(__result) + : "r"(__lhs), "r"(__rhs)); }), + (_CCCL_VERIFY(false, "cuda::std::simd::__vadd_s8x4: Unsupported architecture");)); + return __result; } # endif // _CCCL_HAS_SIMD_8BIT() diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/vabsdiff.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/vabsdiff.pass.cpp new file mode 100644 index 00000000000..628d602e726 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/vabsdiff.pass.cpp @@ -0,0 +1,133 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// template +// constexpr basic_vec, Abi> cuda::simd::abs_diff( +// const basic_vec& lhs, const basic_vec& rhs) noexcept; + +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" + +namespace simd = cuda::std::simd; + +template +using fixed_size_vec = simd::basic_vec>; + +template +inline constexpr bool has_abs_diff = false; + +template +inline constexpr bool has_abs_diff< + Vec, + cuda::std::void_t(), cuda::std::declval()))>> = true; + +template +TEST_FUNC constexpr cuda::std::make_unsigned_t scalar_abs_diff(T lhs, T rhs) +{ + using ResultT = cuda::std::make_unsigned_t; + ResultT lhs_u = static_cast(lhs); + ResultT rhs_u = static_cast(rhs); + return (lhs < rhs) ? (rhs_u - lhs_u) : (lhs_u - rhs_u); +} + +template +TEST_FUNC constexpr void test_values(cuda::std::array lhs_values, cuda::std::array rhs_values) +{ + using Vec = simd::basic_vec>; + using ResultVec = simd::basic_vec, simd::fixed_size>; + Vec lhs(lhs_values); + Vec rhs(rhs_values); + + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::simd::abs_diff(lhs, rhs))); + + ResultVec result = cuda::simd::abs_diff(lhs, rhs); + for (int i = 0; i < N; ++i) + { + assert(result[i] == scalar_abs_diff(lhs_values[i], rhs_values[i])); + } +} + +template +TEST_FUNC constexpr void test_size() +{ + constexpr auto min_val = cuda::std::numeric_limits::min(); + constexpr auto max_val = cuda::std::numeric_limits::max(); + + if constexpr (cuda::std::is_signed_v) + { + cuda::std::array lhs_values{min_val, max_val, T{-10}}; + cuda::std::array rhs_values{max_val, min_val, T{20}}; + if constexpr (N > 3) + { + lhs_values[3] = T{-5}; + rhs_values[3] = T{-5}; + } + test_values(lhs_values, rhs_values); + } + else + { + cuda::std::array lhs_values{min_val, max_val, T{10}}; + cuda::std::array rhs_values{max_val, min_val, T{20}}; + if constexpr (N > 3) + { + lhs_values[3] = T{5}; + rhs_values[3] = T{5}; + } + test_values(lhs_values, rhs_values); + } +} + +template +TEST_FUNC constexpr void test() +{ + test_size(); + test_size(); +} + +TEST_FUNC constexpr bool test_all() +{ + static_assert(!has_abs_diff>); + + test(); + test(); + test(); + test(); + test(); +#if _CCCL_HAS_INT128() + test<__int128_t>(); +#endif // _CCCL_HAS_INT128() + + test(); + test(); + test(); + test(); + test(); +#if _CCCL_HAS_INT128() + test<__uint128_t>(); +#endif // _CCCL_HAS_INT128() + + return true; +} + +int main(int, char**) +{ + assert(test_all()); + static_assert(test_all()); + return 0; +} diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt index 79b98ac4693..ab3c686467e 100644 --- a/libcudacxx/test/simd_codegen/CMakeLists.txt +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -31,7 +31,12 @@ find_program(bash "bash" REQUIRED) set(libcudacxx_simd_codegen_tests) if (NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") - file(GLOB libcudacxx_simd_codegen_tests "floating_point/*.cu" "integer/*.cu") + file( + GLOB libcudacxx_simd_codegen_tests + "floating_point/*.cu" + "integer/*.cu" + "vabsdiff/*.cu" + ) if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.2) file( GLOB libcudacxx_simd_codegen_saturation_add_tests diff --git a/libcudacxx/test/simd_codegen/vabsdiff/vabsdiff.cu b/libcudacxx/test/simd_codegen/vabsdiff/vabsdiff.cu new file mode 100644 index 00000000000..71c0d8ed48d --- /dev/null +++ b/libcudacxx/test/simd_codegen/vabsdiff/vabsdiff.cu @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep + +namespace simd = cuda::std::simd; + +using Vec_s8_x4 = simd::basic_vec>; +using Vec_u8_x4 = simd::basic_vec>; + +__device__ Vec_u8_x4 test_abs_diff_u8_x4(Vec_u8_x4 lhs, Vec_u8_x4 rhs) +{ + return cuda::simd::abs_diff(lhs, rhs); +} + +__device__ Vec_u8_x4 test_abs_diff_s8_x4(Vec_s8_x4 lhs, Vec_s8_x4 rhs) +{ + return cuda::simd::abs_diff(lhs, rhs); +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_abs_diff_s8_x4.*}} +; SM80: {{.*VABSDIFF4 .*}} +; SM90: {{.*VABSDIFF4 .*}} +; SM100: {{.*VABSDIFF4 .*}} +; SM120: {{.*VIMNMX\.S8x4.*}} +; SM120: {{.*VIMNMX\.S8x4.*}} + +; SMXX-LABEL: {{[[:space:]]*}}Function : {{.*test_abs_diff_u8_x4.*}} +; SM80: {{.*VABSDIFF4\.U8.*}} +; SM90: {{.*VABSDIFF4\.U8.*}} +; SM100: {{.*VABSDIFF4\.U8.*}} +; SM120: {{.*VIMNMX\.U8x4.*}} +; SM120: {{.*VIMNMX\.U8x4.*}} + +*/ From 4e548e6ee2ca8cc5f2653a6a50d7b7fd54978262 Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 19 May 2026 15:44:17 -0700 Subject: [PATCH 4/4] `cuda::simd` Integer dot product --- docs/libcudacxx/extended_api/simd.rst | 6 + docs/libcudacxx/extended_api/simd/idot.rst | 79 +++++++ libcudacxx/include/cuda/__simd/idot.h | 108 +++++++++ .../include/cuda/__simd/simd_intrinsics.h | 218 +++++++++++++++++- .../cuda/__simd/simd_intrinsics_array.h | 98 +++++++- libcudacxx/include/cuda/simd | 1 + .../include/cuda/std/__internal/features.h | 17 +- .../numerics/simd/simd.non_std/idot.pass.cpp | 196 ++++++++++++++++ libcudacxx/test/simd_codegen/CMakeLists.txt | 1 + libcudacxx/test/simd_codegen/idot/idp2.cu | 95 ++++++++ libcudacxx/test/simd_codegen/idot/idp4.cu | 53 +++++ 11 files changed, 855 insertions(+), 17 deletions(-) create mode 100644 docs/libcudacxx/extended_api/simd/idot.rst create mode 100644 libcudacxx/include/cuda/__simd/idot.h create mode 100644 libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/idot.pass.cpp create mode 100644 libcudacxx/test/simd_codegen/idot/idp2.cu create mode 100644 libcudacxx/test/simd_codegen/idot/idp4.cu diff --git a/docs/libcudacxx/extended_api/simd.rst b/docs/libcudacxx/extended_api/simd.rst index 524b1bb8cb1..29ec24da4a3 100644 --- a/docs/libcudacxx/extended_api/simd.rst +++ b/docs/libcudacxx/extended_api/simd.rst @@ -9,6 +9,7 @@ SIMD simd/saturating_add simd/abs_diff + simd/idot .. list-table:: :widths: 25 45 30 30 @@ -28,3 +29,8 @@ SIMD - Performs element-wise absolute difference of two integer ``basic_vec`` objects - CCCL 3.5.0 - CUDA 13.5 + + * - :ref:`cuda::simd::idot ` + - Computes the integer dot product of two ``basic_vec`` objects and an accumulator + - CCCL 3.5.0 + - CUDA 13.5 diff --git a/docs/libcudacxx/extended_api/simd/idot.rst b/docs/libcudacxx/extended_api/simd/idot.rst new file mode 100644 index 00000000000..a90730f6b90 --- /dev/null +++ b/docs/libcudacxx/extended_api/simd/idot.rst @@ -0,0 +1,79 @@ +.. _libcudacxx-extended-api-simd-idot: + +``cuda::simd::idot`` +==================== + +Defined in the ```` header. + +.. code:: cuda + + namespace cuda::simd { + + template + [[nodiscard]] __host__ __device__ constexpr + AccT idot( + const cuda::std::simd::basic_vec& lhs, + const cuda::std::simd::basic_vec& rhs, + AccT acc) noexcept; + + } // namespace cuda::simd + +The function ``cuda::simd::idot`` computes the dot product of two integer ``cuda::std::simd::basic_vec`` objects and adds the result to an accumulator. + +For each element ``i`` in the input vectors, the result is equivalent to: + +.. code:: cuda + + acc += static_cast(lhs[i]) * static_cast(rhs[i]) + +**Parameters** + +- ``lhs``: The left-hand side input vector. +- ``rhs``: The right-hand side input vector. +- ``acc``: The initial accumulator value. + +**Return value** + +Returns ``acc`` plus the integer dot product of ``lhs`` and ``rhs``. + +**Constraints** + +- ``T``, ``U``, and ``AccT`` must be integer types. + +**Performance considerations** + +- Packed 8-bit input vectors with 32-bit accumulators (same sign) use ``IDP4A`` on ``SM61`` and newer device targets. +- Packed 16-bit by 8-bit input vectors with 32-bit accumulators (same sign) use ``IDP2A`` on ``SM61`` and newer device targets. +- Other integer input and accumulator combinations use the scalar fallback. + +Example +------- + +.. code:: cuda + + #include + #include + #include + #include + + namespace simd = cuda::std::simd; + + __global__ void kernel() + { + using vec_t = simd::basic_vec>; + + cuda::std::array lhs_values{1, 2, 3, 4}; + cuda::std::array rhs_values{5, 6, 7, 8}; + vec_t lhs(lhs_values); + vec_t rhs(rhs_values); + + int32_t result = cuda::simd::idot(lhs, rhs, int32_t{10}); + + assert(result == 80); + } + + int main() + { + kernel<<<1, 1>>>(); + cudaDeviceSynchronize(); + } diff --git a/libcudacxx/include/cuda/__simd/idot.h b/libcudacxx/include/cuda/__simd/idot.h new file mode 100644 index 00000000000..c2d9a99102c --- /dev/null +++ b/libcudacxx/include/cuda/__simd/idot.h @@ -0,0 +1,108 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___SIMD_IDOT_H +#define _CUDA___SIMD_IDOT_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#if _CCCL_HAS_SIMD_IDOT() +# include +# include +#endif // _CCCL_HAS_SIMD_IDOT() + +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_SIMD + +_CCCL_TEMPLATE(typename _Tp, typename _Up, typename _Abi, typename _AccumT) +_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND ::cuda::std::__cccl_is_integer_v<_Up> + _CCCL_AND ::cuda::std::__cccl_is_integer_v<_AccumT>) +[[nodiscard]] _CCCL_API constexpr _AccumT +idot(const ::cuda::std::simd::basic_vec<_Tp, _Abi>& __lhs, + const ::cuda::std::simd::basic_vec<_Up, _Abi>& __rhs, + const _AccumT __acc) noexcept +{ +#if _CCCL_HAS_SIMD_IDOT() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + using ::cuda::std::is_signed_v; + using ::cuda::std::is_unsigned_v; + constexpr bool __is_unsigned_dot = is_unsigned_v<_Tp> && is_unsigned_v<_Up> && is_unsigned_v<_AccumT>; + constexpr bool __is_signed_dot = (is_signed_v<_Tp> || is_signed_v<_Up>) && is_signed_v<_AccumT>; + constexpr bool __has_matching_sign = __is_unsigned_dot || __is_signed_dot; + + constexpr bool __is_dp4 = sizeof(_Tp) == 1 && sizeof(_Up) == 1 && sizeof(_AccumT) == 4 && __has_matching_sign; + + constexpr bool __is_dp2_16bitx2_8bitx4 = + (sizeof(_Tp) == 2 && sizeof(_Up) == 1) && sizeof(_AccumT) == 4 && __has_matching_sign; + constexpr bool __is_dp2_8bitx4_16bitx2 = + (sizeof(_Tp) == 1 && sizeof(_Up) == 2) && sizeof(_AccumT) == 4 && __has_matching_sign; + + if constexpr (__is_dp4) + { + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_); + const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_); + return ::cuda::simd::__dp4a_8bit_x4<_Tp, _Up>(__lhs_u, __rhs_u, __acc); + })) + } + else if constexpr (__is_dp2_16bitx2_8bitx4) + { + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_); + const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_); + return ::cuda::simd::__dp2a_16bit_x2_8bit_x4<_Tp, _Up>(__lhs_u, __rhs_u, __acc); + })) + } + else if constexpr (__is_dp2_8bitx4_16bitx2) + { + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + const auto __lhs_u = ::cuda::std::simd::__to_unsigned_storage(__lhs.__s_); + const auto __rhs_u = ::cuda::std::simd::__to_unsigned_storage(__rhs.__s_); + return ::cuda::simd::__dp2a_16bit_x2_8bit_x4<_Up, _Tp>(__rhs_u, __lhs_u, __acc); + })) + } + } +#endif // _CCCL_HAS_SIMD_IDOT() + + _AccumT __result = __acc; + _CCCL_PRAGMA_UNROLL_FULL() + for (::cuda::std::simd::__simd_size_type __i = 0; __i < __lhs.__size; ++__i) + { + const auto __lhs_value = static_cast<_AccumT>(__lhs.__s_.__data[__i]); + const auto __rhs_value = static_cast<_AccumT>(__rhs.__s_.__data[__i]); + const auto __product = static_cast<_AccumT>(__lhs_value * __rhs_value); + __result = static_cast<_AccumT>(__result + __product); + } + return __result; +} + +_CCCL_END_NAMESPACE_CUDA_SIMD + +#include + +#endif // _CUDA___SIMD_IDOT_H diff --git a/libcudacxx/include/cuda/__simd/simd_intrinsics.h b/libcudacxx/include/cuda/__simd/simd_intrinsics.h index cb1902eaa62..82623d16479 100644 --- a/libcudacxx/include/cuda/__simd/simd_intrinsics.h +++ b/libcudacxx/include/cuda/__simd/simd_intrinsics.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() +#if _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() || _CCCL_HAS_SIMD_IDOT() # include # include @@ -132,9 +132,223 @@ _CCCL_BEGIN_NAMESPACE_CUDA_SIMD # endif // _CCCL_HAS_SIMD_VABSDIFF() +# if _CCCL_HAS_SIMD_IDOT() + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __dp4a_u8x4_u8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, + [[maybe_unused]] const ::cuda::std::uint32_t __rhs, + [[maybe_unused]] const ::cuda::std::uint32_t __acc) noexcept +{ +# if _CCCL_HAS_SIMD_IDOT_INTRINSICS() + NV_IF_TARGET(NV_PROVIDES_SM_61, (return ::__dp4a(__lhs, __rhs, __acc);)) +# elif _CCCL_HAS_SIMD_IDOT_PTX() + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + ::cuda::std::uint32_t __result{}; + asm("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_IDOT_INTRINSICS() || _CCCL_HAS_SIMD_IDOT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__dp4a_u8x4_u8x4: Unsupported architecture"); + return ::cuda::std::uint32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp4a_s8x4_s8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, + [[maybe_unused]] const ::cuda::std::uint32_t __rhs, + [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept +{ +# if _CCCL_HAS_SIMD_IDOT_INTRINSICS() + NV_IF_TARGET( + NV_PROVIDES_SM_61, + (return ::__dp4a(static_cast<::cuda::std::int32_t>(__lhs), static_cast<::cuda::std::int32_t>(__rhs), __acc);)) +# elif _CCCL_HAS_SIMD_IDOT_PTX() + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + ::cuda::std::int32_t __result{}; + asm("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_IDOT_INTRINSICS() || _CCCL_HAS_SIMD_IDOT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__dp4a_s8x4_s8x4: Unsupported architecture"); + return ::cuda::std::int32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp4a_u8x4_s8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, + [[maybe_unused]] const ::cuda::std::uint32_t __rhs, + [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept +{ +# if _CCCL_HAS_SIMD_IDOT_PTX() + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + ::cuda::std::int32_t __result{}; + asm("dp4a.u32.s32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_IDOT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__dp4a_u8x4_s8x4: Unsupported architecture"); + return ::cuda::std::int32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp4a_s8x4_u8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, + [[maybe_unused]] const ::cuda::std::uint32_t __rhs, + [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept +{ +# if _CCCL_HAS_SIMD_IDOT_PTX() + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + ::cuda::std::int32_t __result{}; + asm("dp4a.s32.u32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_IDOT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__dp4a_s8x4_u8x4: Unsupported architecture"); + return ::cuda::std::int32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __dp2a_lo_u16x2_u8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, + [[maybe_unused]] const ::cuda::std::uint32_t __rhs, + [[maybe_unused]] const ::cuda::std::uint32_t __acc) noexcept +{ +# if _CCCL_HAS_SIMD_IDOT_INTRINSICS() + NV_IF_TARGET(NV_PROVIDES_SM_61, (return ::__dp2a_lo(__lhs, __rhs, __acc);)) +# elif _CCCL_HAS_SIMD_IDOT_PTX() + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + ::cuda::std::uint32_t __result{}; + asm("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_IDOT_INTRINSICS() || _CCCL_HAS_SIMD_IDOT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__dp2a_lo_u16x2_u8x4: Unsupported architecture"); + return ::cuda::std::uint32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::uint32_t __dp2a_hi_u16x2_u8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, + [[maybe_unused]] const ::cuda::std::uint32_t __rhs, + [[maybe_unused]] const ::cuda::std::uint32_t __acc) noexcept +{ +# if _CCCL_HAS_SIMD_IDOT_INTRINSICS() + NV_IF_TARGET(NV_PROVIDES_SM_61, (return ::__dp2a_hi(__lhs, __rhs, __acc);)) +# elif _CCCL_HAS_SIMD_IDOT_PTX() + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + ::cuda::std::uint32_t __result{}; + asm("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_IDOT_INTRINSICS() || _CCCL_HAS_SIMD_IDOT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__dp2a_hi_u16x2_u8x4: Unsupported architecture"); + return ::cuda::std::uint32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp2a_lo_s16x2_s8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, + [[maybe_unused]] const ::cuda::std::uint32_t __rhs, + [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept +{ +# if _CCCL_HAS_SIMD_IDOT_INTRINSICS() + NV_IF_TARGET( + NV_PROVIDES_SM_61, + (return ::__dp2a_lo(static_cast<::cuda::std::int32_t>(__lhs), static_cast<::cuda::std::int32_t>(__rhs), __acc);)) +# elif _CCCL_HAS_SIMD_IDOT_PTX() + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + ::cuda::std::int32_t __result{}; + asm("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_IDOT_INTRINSICS() || _CCCL_HAS_SIMD_IDOT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__dp2a_lo_s16x2_s8x4: Unsupported architecture"); + return ::cuda::std::int32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp2a_lo_u16x2_s8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, + [[maybe_unused]] const ::cuda::std::uint32_t __rhs, + [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept +{ +# if _CCCL_HAS_SIMD_IDOT_PTX() + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + ::cuda::std::int32_t __result{}; + asm("dp2a.lo.u32.s32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_IDOT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__dp2a_lo_u16x2_s8x4: Unsupported architecture"); + return ::cuda::std::int32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp2a_lo_s16x2_u8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, + [[maybe_unused]] const ::cuda::std::uint32_t __rhs, + [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept +{ +# if _CCCL_HAS_SIMD_IDOT_PTX() + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + ::cuda::std::int32_t __result{}; + asm("dp2a.lo.s32.u32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_IDOT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__dp2a_lo_s16x2_u8x4: Unsupported architecture"); + return ::cuda::std::int32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp2a_hi_s16x2_s8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, + [[maybe_unused]] const ::cuda::std::uint32_t __rhs, + [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept +{ +# if _CCCL_HAS_SIMD_IDOT_INTRINSICS() + NV_IF_TARGET( + NV_PROVIDES_SM_61, + (return ::__dp2a_hi(static_cast<::cuda::std::int32_t>(__lhs), static_cast<::cuda::std::int32_t>(__rhs), __acc);)) +# elif _CCCL_HAS_SIMD_IDOT_PTX() + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + ::cuda::std::int32_t __result{}; + asm("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_IDOT_INTRINSICS() || _CCCL_HAS_SIMD_IDOT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__dp2a_hi_s16x2_s8x4: Unsupported architecture"); + return ::cuda::std::int32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp2a_hi_u16x2_s8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, + [[maybe_unused]] const ::cuda::std::uint32_t __rhs, + [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept +{ +# if _CCCL_HAS_SIMD_IDOT_PTX() + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + ::cuda::std::int32_t __result{}; + asm("dp2a.hi.u32.s32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_IDOT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__dp2a_hi_u16x2_s8x4: Unsupported architecture"); + return ::cuda::std::int32_t{}; +} + +[[nodiscard]] _CCCL_DEVICE_API inline ::cuda::std::int32_t __dp2a_hi_s16x2_u8x4( + [[maybe_unused]] const ::cuda::std::uint32_t __lhs, + [[maybe_unused]] const ::cuda::std::uint32_t __rhs, + [[maybe_unused]] const ::cuda::std::int32_t __acc) noexcept +{ +# if _CCCL_HAS_SIMD_IDOT_PTX() + NV_IF_TARGET(NV_PROVIDES_SM_61, ({ + ::cuda::std::int32_t __result{}; + asm("dp2a.hi.s32.u32 %0, %1, %2, %3;" : "=r"(__result) : "r"(__lhs), "r"(__rhs), "r"(__acc)); + return __result; + })) +# endif // _CCCL_HAS_SIMD_IDOT_PTX() + _CCCL_VERIFY(false, "cuda::__simd::__dp2a_hi_s16x2_u8x4: Unsupported architecture"); + return ::cuda::std::int32_t{}; +} + +# endif // _CCCL_HAS_SIMD_IDOT() + _CCCL_END_NAMESPACE_CUDA_SIMD # include -#endif // _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() +#endif // _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() || _CCCL_HAS_SIMD_IDOT() #endif // _CUDA___SIMD_SIMD_INTRINSICS_H diff --git a/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h b/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h index 4a056ba6292..f8765b85480 100644 --- a/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h +++ b/libcudacxx/include/cuda/__simd/simd_intrinsics_array.h @@ -21,12 +21,13 @@ # pragma system_header #endif // no system header -#if _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() +#if _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() || _CCCL_HAS_SIMD_IDOT() # include # include # include # include +# include # include @@ -105,9 +106,102 @@ template # endif // _CCCL_HAS_SIMD_VABSDIFF() +# if _CCCL_HAS_SIMD_IDOT() + +template +[[nodiscard]] _CCCL_DEVICE_API _AccumT __dp4a_8bit_x4( + const ::cuda::std::simd::__array_u32_t<_Np>& __lhs_u, + const ::cuda::std::simd::__array_u32_t<_Np>& __rhs_u, + const _AccumT __acc) noexcept +{ + _AccumT __result = __acc; + _CCCL_PRAGMA_UNROLL_FULL() + for (::cuda::std::size_t __i = 0; __i < _Np; ++__i) + { + if constexpr (::cuda::std::is_unsigned_v<_Tp> && ::cuda::std::is_unsigned_v<_Up>) + { + __result = ::cuda::simd::__dp4a_u8x4_u8x4(__lhs_u[__i], __rhs_u[__i], __result); + } + else if constexpr (::cuda::std::is_unsigned_v<_Tp>) + { + __result = ::cuda::simd::__dp4a_u8x4_s8x4(__lhs_u[__i], __rhs_u[__i], __result); + } + else if constexpr (::cuda::std::is_unsigned_v<_Up>) + { + __result = ::cuda::simd::__dp4a_s8x4_u8x4(__lhs_u[__i], __rhs_u[__i], __result); + } + else + { + __result = ::cuda::simd::__dp4a_s8x4_s8x4(__lhs_u[__i], __rhs_u[__i], __result); + } + } + return __result; +} + +template +[[nodiscard]] _CCCL_DEVICE_API _AccumT __dp2a_16bit_x2_8bit_x4( + const ::cuda::std::simd::__array_u32_t<_N_16Bit>& __lhs_u16, + const ::cuda::std::simd::__array_u32_t<_N_8Bit>& __rhs_u8, + const _AccumT __acc) noexcept +{ + _AccumT __result = __acc; + _CCCL_PRAGMA_UNROLL_FULL() + for (::cuda::std::size_t __i = 0; __i < _N_16Bit; ++__i) + { + const auto __rhs_u = __rhs_u8[__i / 2]; + if constexpr (::cuda::std::is_unsigned_v<_Tp> && ::cuda::std::is_unsigned_v<_Up>) + { + if (__i % 2 == 0) + { + __result = ::cuda::simd::__dp2a_lo_u16x2_u8x4(__lhs_u16[__i], __rhs_u, __result); + } + else + { + __result = ::cuda::simd::__dp2a_hi_u16x2_u8x4(__lhs_u16[__i], __rhs_u, __result); + } + } + else if constexpr (::cuda::std::is_unsigned_v<_Tp>) + { + if (__i % 2 == 0) + { + __result = ::cuda::simd::__dp2a_lo_u16x2_s8x4(__lhs_u16[__i], __rhs_u, __result); + } + else + { + __result = ::cuda::simd::__dp2a_hi_u16x2_s8x4(__lhs_u16[__i], __rhs_u, __result); + } + } + else if constexpr (::cuda::std::is_unsigned_v<_Up>) + { + if (__i % 2 == 0) + { + __result = ::cuda::simd::__dp2a_lo_s16x2_u8x4(__lhs_u16[__i], __rhs_u, __result); + } + else + { + __result = ::cuda::simd::__dp2a_hi_s16x2_u8x4(__lhs_u16[__i], __rhs_u, __result); + } + } + else + { + if (__i % 2 == 0) + { + __result = ::cuda::simd::__dp2a_lo_s16x2_s8x4(__lhs_u16[__i], __rhs_u, __result); + } + else + { + __result = ::cuda::simd::__dp2a_hi_s16x2_s8x4(__lhs_u16[__i], __rhs_u, __result); + } + } + } + return __result; +} + +# endif // _CCCL_HAS_SIMD_IDOT() + _CCCL_END_NAMESPACE_CUDA_SIMD # include -#endif // _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() +#endif // _CCCL_HAS_SIMD_SAT() || _CCCL_HAS_SIMD_VABSDIFF() || _CCCL_HAS_SIMD_IDOT() #endif // _CUDA___SIMD_SIMD_INTRINSICS_ARRAY_H diff --git a/libcudacxx/include/cuda/simd b/libcudacxx/include/cuda/simd index c6c4889bc30..9ede1dabd80 100644 --- a/libcudacxx/include/cuda/simd +++ b/libcudacxx/include/cuda/simd @@ -21,6 +21,7 @@ # pragma system_header #endif // no system header +#include #include #include #include diff --git a/libcudacxx/include/cuda/std/__internal/features.h b/libcudacxx/include/cuda/std/__internal/features.h index be22b4b9fb9..f2660f646e0 100644 --- a/libcudacxx/include/cuda/std/__internal/features.h +++ b/libcudacxx/include/cuda/std/__internal/features.h @@ -120,21 +120,12 @@ #define _CCCL_HAS_SIMD_VABSDIFF() _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION() -// TODO(fbusato): CTK 13.2 produces non-optimal code for SIMD SAT intrinsics -#define _CCCL_HAS_SIMD_SAT_INTRINSICS() 0 -#define _CCCL_HAS_SIMD_SAT_PTX() (__cccl_ptx_isa >= 920ULL) -#define _CCCL_HAS_SIMD_SAT() \ - ((_CCCL_HAS_SIMD_SAT_PTX() || _CCCL_HAS_SIMD_SAT_INTRINSICS()) && _CCCL_CUDA_COMPILATION() \ +#define _CCCL_HAS_SIMD_IDOT_INTRINSICS() (_CCCL_CUDACC_AT_LEAST(12, 2) && _CCCL_HAS_CTK()) +#define _CCCL_HAS_SIMD_IDOT_PTX() (__cccl_ptx_isa >= 500ULL) +#define _CCCL_HAS_SIMD_IDOT() \ + ((_CCCL_HAS_SIMD_IDOT_INTRINSICS() || _CCCL_HAS_SIMD_IDOT_PTX()) && _CCCL_CUDA_COMPILATION() \ && !_CCCL_TILE_COMPILATION()) -#define _CCCL_HAS_SIMD_VABSDIFF() _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION() - -// TODO(fbusato): CTK 13.2 produces non-optimal code for SIMD SAT intrinsics -#define _CCCL_HAS_SIMD_SAT_INTRINSICS() 0 -#define _CCCL_HAS_SIMD_SAT_PTX() (__cccl_ptx_isa >= 920ULL) -#define _CCCL_HAS_SIMD_SAT() \ - (_CCCL_HAS_SIMD_SAT_PTX() || _CCCL_HAS_SIMD_SAT_INTRINSICS()) && _CCCL_CUDA_COMPILATION() && !_CCCL_TILE_COMPILATION() - // Third party libraries #if (__has_include() || __has_include()) && \ diff --git a/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/idot.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/idot.pass.cpp new file mode 100644 index 00000000000..2f3a7f84ed4 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/simd/simd.non_std/idot.pass.cpp @@ -0,0 +1,196 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// template +// constexpr AccT cuda::simd::idot( +// const basic_vec& lhs, const basic_vec& rhs, AccT acc) noexcept; + +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" + +namespace simd = cuda::std::simd; + +template +using fixed_size_vec = simd::basic_vec>; + +template +inline constexpr bool has_idot = false; + +template +inline constexpr bool + has_idot(), cuda::std::declval(), cuda::std::declval()))>> = true; + +template +TEST_FUNC constexpr AccT +scalar_idot(const cuda::std::array& lhs_values, const cuda::std::array& rhs_values, AccT acc) +{ + AccT result = acc; + for (int i = 0; i < N; ++i) + { + AccT lhs_value = static_cast(lhs_values[i]); + AccT rhs_value = static_cast(rhs_values[i]); + AccT product = static_cast(lhs_value * rhs_value); + result = static_cast(result + product); + } + return result; +} + +template +TEST_FUNC constexpr void test_values(cuda::std::array lhs_values, cuda::std::array rhs_values, AccT acc) +{ + using LhsVec = simd::basic_vec>; + using RhsVec = simd::basic_vec>; + LhsVec lhs(lhs_values); + RhsVec rhs(rhs_values); + + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::simd::idot(lhs, rhs, acc))); + + AccT result = cuda::simd::idot(lhs, rhs, acc); + AccT expected = scalar_idot(lhs_values, rhs_values, acc); + assert(result == expected); +} + +template +TEST_FUNC constexpr void test_generated(AccT acc) +{ + cuda::std::array lhs_values{}; + cuda::std::array rhs_values{}; + for (int i = 0; i < N; ++i) + { + if constexpr (cuda::std::is_signed_v) + { + lhs_values[i] = static_cast((i % 5) - 2); + } + else + { + lhs_values[i] = static_cast((i % 5) + 1); + } + + if constexpr (cuda::std::is_signed_v) + { + rhs_values[i] = static_cast((i % 7) - 3); + } + else + { + rhs_values[i] = static_cast((i % 7) + 2); + } + } + test_values(lhs_values, rhs_values, acc); +} + +TEST_FUNC constexpr void test_8bit_dp4a() +{ + { + cuda::std::array lhs_values{-8, -3, 2, 7}; + cuda::std::array rhs_values{4, -5, 6, -7}; + test_values(lhs_values, rhs_values, int32_t{11}); + } + { + cuda::std::array lhs_values{1, 2, 3, 4, 5, 6, 7}; + cuda::std::array rhs_values{8, 7, 6, 5, 4, 3, 2}; + test_values(lhs_values, rhs_values, uint32_t{13}); + } + { + cuda::std::array lhs_values{1, 2, 3, 4, 5}; + cuda::std::array rhs_values{-1, 2, -3, 4, -5}; + test_values(lhs_values, rhs_values, int32_t{-17}); + } + { + cuda::std::array lhs_values{-4, 5, -6}; + cuda::std::array rhs_values{7, 8, 9}; + test_values(lhs_values, rhs_values, int32_t{19}); + } +} + +TEST_FUNC constexpr void test_16bit_8bit_dp2a() +{ + { + cuda::std::array lhs_values{-300, 20, 45, -12, 17}; + cuda::std::array rhs_values{3, -4, 5, -6, 7}; + test_values(lhs_values, rhs_values, int32_t{23}); + } + { + cuda::std::array lhs_values{3, -4, 5, -6, 7}; + cuda::std::array rhs_values{-300, 20, 45, -12, 17}; + test_values(lhs_values, rhs_values, int32_t{29}); + } + { + cuda::std::array lhs_values{300, 20, 45, 12, 17}; + cuda::std::array rhs_values{3, 4, 5, 6, 7}; + test_values(lhs_values, rhs_values, uint32_t{31}); + } + { + cuda::std::array lhs_values{3, 4, 5, 6, 7}; + cuda::std::array rhs_values{300, 20, 45, 12, 17}; + test_values(lhs_values, rhs_values, uint32_t{37}); + } + { + cuda::std::array lhs_values{-300, 20, 45, -12, 17}; + cuda::std::array rhs_values{3, 200, 5, 255, 7}; + test_values(lhs_values, rhs_values, int32_t{41}); + } + { + cuda::std::array lhs_values{3, 200, 5, 255, 7}; + cuda::std::array rhs_values{-300, 20, 45, -12, 17}; + test_values(lhs_values, rhs_values, int32_t{-47}); + } + { + cuda::std::array lhs_values{300, 40000, 45, 65535, 17}; + cuda::std::array rhs_values{3, -4, 5, -6, 7}; + test_values(lhs_values, rhs_values, int32_t{43}); + } + { + cuda::std::array lhs_values{3, -4, 5, -6, 7}; + cuda::std::array rhs_values{300, 40000, 45, 65535, 17}; + test_values(lhs_values, rhs_values, int32_t{-53}); + } +} + +TEST_FUNC constexpr bool test_all() +{ + static_assert(!has_idot, fixed_size_vec, int>); + static_assert(!has_idot, fixed_size_vec, int>); + static_assert(!has_idot, fixed_size_vec, float>); + + test_8bit_dp4a(); + test_16bit_8bit_dp2a(); + + test_generated(5); + test_generated(-7); + test_generated(9); + test_generated(11); + test_generated(13); +#if _CCCL_HAS_INT128() + test_generated<__int128_t, __uint128_t, __int128_t, 3>(__int128_t{17}); + test_generated<__uint128_t, __uint128_t, __uint128_t, 5>(__uint128_t{19}); +#endif // _CCCL_HAS_INT128() + + return true; +} + +int main(int, char**) +{ + assert(test_all()); + static_assert(test_all()); + return 0; +} diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt index ab3c686467e..68b884b04a9 100644 --- a/libcudacxx/test/simd_codegen/CMakeLists.txt +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -34,6 +34,7 @@ if (NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") file( GLOB libcudacxx_simd_codegen_tests "floating_point/*.cu" + "idot/*.cu" "integer/*.cu" "vabsdiff/*.cu" ) diff --git a/libcudacxx/test/simd_codegen/idot/idp2.cu b/libcudacxx/test/simd_codegen/idot/idp2.cu new file mode 100644 index 00000000000..0073adeacea --- /dev/null +++ b/libcudacxx/test/simd_codegen/idot/idp2.cu @@ -0,0 +1,95 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +namespace simd = cuda::std::simd; + +using Vec_s8_x4 = simd::basic_vec>; +using Vec_u8_x4 = simd::basic_vec>; +using Vec_s16_x4 = simd::basic_vec>; +using Vec_u16_x4 = simd::basic_vec>; + +__device__ cuda::std::int32_t test_idot_s16_s8(Vec_s16_x4 lhs, Vec_s8_x4 rhs, cuda::std::int32_t acc) +{ + return cuda::simd::idot(lhs, rhs, acc); +} + +__device__ cuda::std::int32_t test_idot_s8_s16(Vec_s8_x4 lhs, Vec_s16_x4 rhs, cuda::std::int32_t acc) +{ + return cuda::simd::idot(lhs, rhs, acc); +} + +__device__ cuda::std::int32_t test_idot_s16_u8(Vec_s16_x4 lhs, Vec_u8_x4 rhs, cuda::std::int32_t acc) +{ + return cuda::simd::idot(lhs, rhs, acc); +} + +__device__ cuda::std::int32_t test_idot_u8_s16(Vec_u8_x4 lhs, Vec_s16_x4 rhs, cuda::std::int32_t acc) +{ + return cuda::simd::idot(lhs, rhs, acc); +} + +__device__ cuda::std::int32_t test_idot_u16_s8(Vec_u16_x4 lhs, Vec_s8_x4 rhs, cuda::std::int32_t acc) +{ + return cuda::simd::idot(lhs, rhs, acc); +} + +__device__ cuda::std::int32_t test_idot_s8_u16(Vec_s8_x4 lhs, Vec_u16_x4 rhs, cuda::std::int32_t acc) +{ + return cuda::simd::idot(lhs, rhs, acc); +} + +__device__ cuda::std::uint32_t test_idot_u16_u8(Vec_u16_x4 lhs, Vec_u8_x4 rhs, cuda::std::uint32_t acc) +{ + return cuda::simd::idot(lhs, rhs, acc); +} + +__device__ cuda::std::uint32_t test_idot_u8_u16(Vec_u8_x4 lhs, Vec_u16_x4 rhs, cuda::std::uint32_t acc) +{ + return cuda::simd::idot(lhs, rhs, acc); +} + +/* + +; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_s16_s8.*}} +; SMXX-DAG: {{.*IDP\.2A.*LO.*}} +; SMXX-DAG: {{.*IDP\.2A.*HI.*}} + +; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_s8_s16.*}} +; SMXX-DAG: {{.*IDP\.2A.*LO.*}} +; SMXX-DAG: {{.*IDP\.2A.*HI.*}} + +; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_s16_u8.*}} +; SMXX-DAG: {{.*IDP\.2A.*LO.*}} +; SMXX-DAG: {{.*IDP\.2A.*HI.*}} + +; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_u8_s16.*}} +; SMXX-DAG: {{.*IDP\.2A.*LO.*}} +; SMXX-DAG: {{.*IDP\.2A.*HI.*}} + +; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_u16_s8.*}} +; SMXX-DAG: {{.*IDP\.2A.*LO.*}} +; SMXX-DAG: {{.*IDP\.2A.*HI.*}} + +; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_s8_u16.*}} +; SMXX-DAG: {{.*IDP\.2A.*LO.*}} +; SMXX-DAG: {{.*IDP\.2A.*HI.*}} + +; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_u16_u8.*}} +; SMXX-DAG: {{.*IDP\.2A.*LO.*}} +; SMXX-DAG: {{.*IDP\.2A.*HI.*}} + +; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_u8_u16.*}} +; SMXX-DAG: {{.*IDP\.2A.*LO.*}} +; SMXX-DAG: {{.*IDP\.2A.*HI.*}} + +*/ diff --git a/libcudacxx/test/simd_codegen/idot/idp4.cu b/libcudacxx/test/simd_codegen/idot/idp4.cu new file mode 100644 index 00000000000..836dfc376ba --- /dev/null +++ b/libcudacxx/test/simd_codegen/idot/idp4.cu @@ -0,0 +1,53 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +namespace simd = cuda::std::simd; + +using Vec_s8_x4 = simd::basic_vec>; +using Vec_u8_x4 = simd::basic_vec>; + +__device__ cuda::std::int32_t test_idot_s8_s8(Vec_s8_x4 lhs, Vec_s8_x4 rhs, cuda::std::int32_t acc) +{ + return cuda::simd::idot(lhs, rhs, acc); +} + +__device__ cuda::std::uint32_t test_idot_u8_u8(Vec_u8_x4 lhs, Vec_u8_x4 rhs, cuda::std::uint32_t acc) +{ + return cuda::simd::idot(lhs, rhs, acc); +} + +__device__ cuda::std::int32_t test_idot_u8_s8(Vec_u8_x4 lhs, Vec_s8_x4 rhs, cuda::std::int32_t acc) +{ + return cuda::simd::idot(lhs, rhs, acc); +} + +__device__ cuda::std::int32_t test_idot_s8_u8(Vec_s8_x4 lhs, Vec_u8_x4 rhs, cuda::std::int32_t acc) +{ + return cuda::simd::idot(lhs, rhs, acc); +} + +/* + +; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_s8_s8.*}} +; SMXX-DAG: {{.*IDP\.4A\.S8\.S8.*}} + +; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_u8_u8.*}} +; SMXX-DAG: {{.*IDP\.4A\.U8\.U8.*}} + +; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_u8_s8.*}} +; SMXX-DAG: {{.*IDP\.4A\.U8\.S8.*}} + +; SMXX-DAG: {{[[:space:]]*}}Function : {{.*test_idot_s8_u8.*}} +; SMXX-DAG: {{.*IDP\.4A\.S8\.U8.*}} + +*/