From 849077c9cf9f9da66eaec39da71907e775e2afca Mon Sep 17 00:00:00 2001
From: Sudhakar Singh
Date: Fri, 20 Feb 2026 15:14:18 -0800
Subject: [PATCH] remove deprecated qkv/kv_packed apis

Signed-off-by: Sudhakar Singh
---
 .../common/fused_attn/fused_attn.cpp          | 650 ------------------
 .../include/transformer_engine/fused_attn.h   | 284 --------
 2 files changed, 934 deletions(-)

diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp
index 4f8367aac7..496eba7421 100644
--- a/transformer_engine/common/fused_attn/fused_attn.cpp
+++ b/transformer_engine/common/fused_attn/fused_attn.cpp
@@ -15,74 +15,6 @@
 #include "fused_attn_fp8.h"
 #include "utils.h"
 
-namespace {
-// Helper function to create a tensor view with modified shape and optional pointer offset
-transformer_engine::Tensor make_tensor_view(const transformer_engine::Tensor *source,
-                                            const std::vector<size_t> &shape,
-                                            size_t offset_bytes = 0) {
-  transformer_engine::Tensor view = *source;
-  if (offset_bytes > 0) {
-    view.data.dptr = static_cast<void *>(static_cast<char *>(source->data.dptr) + offset_bytes);
-  }
-  view.data.shape = shape;
-  view.nvte_tensor = 0;  // Mark as unmanaged/local tensor view
-  return view;
-}
-
-// Helper function to calculate stride in bytes for packed QKV tensor unpacking
-size_t calculate_qkv_stride(NVTE_QKV_Layout_Group layout_group, transformer_engine::DType dtype,
-                            size_t h, size_t d) {
-  size_t stride = 0;
-  if (layout_group == NVTE_QKV_Layout_Group::NVTE_3HD) {
-    stride = (transformer_engine::typeToNumBits(dtype) * h * d) / 8;
-  } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_H3D) {
-    stride = (transformer_engine::typeToNumBits(dtype) * d) / 8;
-  }
-  return stride;
-}
-
-// Helper function to determine unpacked shape for QKV packed tensor
-std::vector<size_t> calculate_qkv_unpacked_shape(const transformer_engine::Tensor *qkv_tensor,
-                                                 size_t h, size_t d) {
-  std::vector<size_t> unpacked_shape;
-  if (qkv_tensor->data.shape.size() == 4) {
-    // T3HD or TH3D (4D) -> THD (3D): remove dimension "3" at position 1
-    unpacked_shape = {qkv_tensor->data.shape[0], h, d};
-  } else {
-    // BS3HD/SB3HD or BSH3D/SBH3D (5D) -> BSHD/SBHD (4D): remove dimension "3" at position 2
-    unpacked_shape = {qkv_tensor->data.shape[0], qkv_tensor->data.shape[1], h, d};
-  }
-  return unpacked_shape;
-}
-
-// Helper function to calculate stride for packed KV tensor unpacking
-size_t calculate_kv_stride(NVTE_QKV_Layout_Group layout_group, transformer_engine::DType dtype,
-                           size_t h_kv, size_t d) {
-  size_t stride = 0;
-  if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_2HD) {
-    stride = (transformer_engine::typeToNumBits(dtype) * h_kv * d) / 8;
-  } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_H2D) {
-    stride = (transformer_engine::typeToNumBits(dtype) * d) / 8;
-  }
-  return stride;
-}
-
-// Helper function to determine unpacked shape for KV packed tensor
-std::vector<size_t> calculate_kv_unpacked_shape(const transformer_engine::Tensor *kv_tensor,
-                                                NVTE_QKV_Layout_Group layout_group,
-                                                NVTE_QKV_Format kv_format, size_t t_kv, size_t h_kv,
-                                                size_t d) {
-  std::vector<size_t> unpacked_kv_shape;
-  if (kv_format == NVTE_QKV_Format::NVTE_THD) {
-    unpacked_kv_shape = {t_kv, h_kv, d};
-  } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_2HD ||
-             layout_group == NVTE_QKV_Layout_Group::NVTE_HD_H2D) {
-    unpacked_kv_shape = {kv_tensor->data.shape[0], kv_tensor->data.shape[1], h_kv, d};
-  }
-  return unpacked_kv_shape;
-}
-}  // namespace
-
 // map NVTE_QKV_Layout to NVTE_QKV_Layout_Group
 NVTE_QKV_Layout_Group nvte_get_qkv_layout_group(NVTE_QKV_Layout qkv_layout) {
   switch (qkv_layout) {
@@ -516,588 +448,6 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend(
   return backend;
 }
 
-// NVTE fused attention FWD with packed QKV
-// DEPRECATED: This API is deprecated.
-// Please use nvte_fused_attn_fwd with separate Q, K, V tensors instead.
-void nvte_fused_attn_fwd_qkvpacked(
-    const NVTETensor QKV, const NVTETensor Bias, const NVTETensor SoftmaxOffset, NVTETensor S,
-    NVTETensor O, NVTETensorPack *Aux_CTX_Tensors, const NVTETensor cu_seqlens,
-    const NVTETensor cu_seqlens_padded, const NVTETensor rng_state, size_t max_seqlen,
-    bool is_training, bool return_max_logit, bool cuda_graph, float attn_scale, float dropout,
-    NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type,
-    NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right,
-    bool bottom_right_diagonal, NVTETensor workspace, cudaStream_t stream) {
-  NVTE_API_CALL(nvte_flash_attn_fwd_qkvpacked);
-  using namespace transformer_engine;
-
-  const Tensor *input_cu_seqlens = convertNVTETensorCheck(cu_seqlens);
-  const Tensor *input_cu_seqlens_padded = convertNVTETensorCheck(cu_seqlens_padded);
-  const Tensor *input_rng_state = convertNVTETensorCheck(rng_state);
-  const Tensor *input_QKV = convertNVTETensorCheck(QKV);
-  const Tensor *input_Bias = convertNVTETensorCheck(Bias);
-  const Tensor *input_SoftmaxOffset = convertNVTETensorCheck(SoftmaxOffset);
-  Tensor *input_output_S = convertNVTETensorCheck(S);
-  Tensor *output_O = convertNVTETensorCheck(O);
-  Tensor *wkspace = convertNVTETensor(workspace);
-
-  auto ndim = input_QKV->data.shape.size();
-  size_t b = input_cu_seqlens->data.shape[0] - 1;
-  size_t h = 0;
-  NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout);
-  if (layout_group == NVTE_QKV_Layout_Group::NVTE_3HD) {
-    h = input_QKV->data.shape[ndim - 2];
-  } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_H3D) {
-    h = input_QKV->data.shape[ndim - 3];
-  } else {
-    NVTE_ERROR("nvte_fused_attn_fwd_qkvpacked only supports H3D and 3HD layouts!");
-  }
-  size_t d = input_QKV->data.shape[ndim - 1];
-  size_t t = 0;
-  NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout);
-  if (qkv_format == NVTE_QKV_Format::NVTE_THD) {
-    t = input_QKV->data.shape[0];
-  }
-
-  auto handle = cudnnExecutionPlanManager::Instance().GetHandle();
-  const NVTEDType QKV_type = static_cast<NVTEDType>(input_QKV->data.dtype);
-
-  NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend(
-      is_training, QKV_type, QKV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout,
-      h, h, max_seqlen, max_seqlen, d, d, window_size_left, window_size_right, return_max_logit,
-      cuda_graph, false);
-
-  if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) {
-#if (CUDNN_VERSION >= 8901)
-    // Unpack QKV and call the non-packed function
-    const auto QKV_type = input_QKV->data.dtype;
-    size_t stride = calculate_qkv_stride(layout_group, QKV_type, h, d);
-    std::vector<size_t> unpacked_shape = calculate_qkv_unpacked_shape(input_QKV, h, d);
-
-    // Create tensor views for Q, K, V
-    Tensor Q_view = make_tensor_view(input_QKV, unpacked_shape);
-    Tensor K_view = make_tensor_view(input_QKV, unpacked_shape, stride);
-    Tensor V_view = make_tensor_view(input_QKV, unpacked_shape, 2 * stride);
-
-    fused_attn_max_512_fwd(b, h, max_seqlen, max_seqlen, d, is_training, attn_scale, dropout,
-                           qkv_layout, bias_type, attn_mask_type, &Q_view, &K_view, &V_view,
-                           input_Bias, output_O,
Aux_CTX_Tensors, input_cu_seqlens, - input_cu_seqlens, input_rng_state, wkspace, stream, handle); -#else - NVTE_ERROR("cuDNN 8.9.1 is required for BF16/FP16 fused attention with max_seqlen<=512. \n"); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { -#if (CUDNN_VERSION >= 8900) - // Unpack QKV and call the non-packed function - const auto QKV_type = input_QKV->data.dtype; - size_t stride = calculate_qkv_stride(layout_group, QKV_type, h, d); - std::vector unpacked_shape = calculate_qkv_unpacked_shape(input_QKV, h, d); - - // Create tensor views for Q, K, V - Tensor Q_view = make_tensor_view(input_QKV, unpacked_shape); - Tensor K_view = make_tensor_view(input_QKV, unpacked_shape, stride); - Tensor V_view = make_tensor_view(input_QKV, unpacked_shape, 2 * stride); - - fused_attn_arbitrary_seqlen_fwd( - b, h, h, max_seqlen, max_seqlen, d, d, t, t, 0, 0, 0, 0, 0, 0, is_training, - return_max_logit, attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, - window_size_left, window_size_right, bottom_right_diagonal, &Q_view, &K_view, &V_view, - input_Bias, input_SoftmaxOffset, output_O, Aux_CTX_Tensors, input_cu_seqlens, - input_cu_seqlens, input_cu_seqlens_padded, input_cu_seqlens_padded, nullptr, nullptr, - input_rng_state, wkspace, stream, handle); -#else - NVTE_ERROR( - "cuDNN 8.9.0 is required for BF16/FP16 fused attention with arbitrary sequence length. " - "\n"); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { -#if (CUDNN_VERSION >= 8900) - // Unpack QKV and call the non-packed function - const auto QKV_type = input_QKV->data.dtype; - size_t stride = calculate_qkv_stride(layout_group, QKV_type, h, d); - std::vector unpacked_shape = calculate_qkv_unpacked_shape(input_QKV, h, d); - - // Create tensor views for Q, K, V - Tensor Q_view = make_tensor_view(input_QKV, unpacked_shape); - Tensor K_view = make_tensor_view(input_QKV, unpacked_shape, stride); - Tensor V_view = make_tensor_view(input_QKV, unpacked_shape, 2 * stride); - - fused_attn_fp8_fwd(b, h, h, max_seqlen, max_seqlen, d, is_training, attn_scale, dropout, - qkv_layout, bias_type, attn_mask_type, &Q_view, &K_view, &V_view, - input_output_S, output_O, Aux_CTX_Tensors, input_cu_seqlens, - input_cu_seqlens, input_rng_state, wkspace, stream, handle); -#else - NVTE_ERROR("cuDNN 8.9.0 is required for FP8 fused attention. \n"); -#endif - } else { - NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. \n"); - } -} -// NVTE fused attention BWD with packed QKV -// DEPRECATED: This API is deprecated. -// Please use nvte_fused_attn_bwd with separate Q, K, V tensors instead. 
-void nvte_fused_attn_bwd_qkvpacked( - const NVTETensor QKV, const NVTETensor O, const NVTETensor dO, const NVTETensor S, - NVTETensor dP, const NVTETensorPack *Aux_CTX_Tensors, NVTETensor dQKV, NVTETensor dBias, - NVTETensor dSoftmaxOffset, const NVTETensor cu_seqlens, const NVTETensor cu_seqlens_padded, - size_t max_seqlen, float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout, - NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, - int64_t window_size_left, int64_t window_size_right, bool bottom_right_diagonal, - bool deterministic, bool cuda_graph, NVTETensor workspace, cudaStream_t stream) { - NVTE_API_CALL(nvte_flash_attn_bwd_qkvpacked); - using namespace transformer_engine; - - const Tensor *input_cu_seqlens = convertNVTETensorCheck(cu_seqlens); - const Tensor *input_cu_seqlens_padded = convertNVTETensorCheck(cu_seqlens_padded); - const Tensor *input_QKV = convertNVTETensorCheck(QKV); - const Tensor *input_O = convertNVTETensorCheck(O); - const Tensor *input_dO = convertNVTETensorCheck(dO); - const Tensor *input_S = convertNVTETensorCheck(S); - Tensor *input_output_dP = convertNVTETensorCheck(dP); - Tensor *output_dQKV = convertNVTETensorCheck(dQKV); - Tensor *output_dBias = convertNVTETensorCheck(dBias); - Tensor *output_dSoftmaxOffset = convertNVTETensorCheck(dSoftmaxOffset); - Tensor *wkspace = convertNVTETensor(workspace); - - auto ndim = input_QKV->data.shape.size(); - size_t b = input_cu_seqlens->data.shape[0] - 1; - size_t h = 0; - NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); - if (layout_group == NVTE_QKV_Layout_Group::NVTE_3HD) { - h = input_QKV->data.shape[ndim - 2]; - } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_H3D) { - h = input_QKV->data.shape[ndim - 3]; - } else { - NVTE_ERROR("nvte_fused_attn_fwd_qkvpacked only supports H3D and 3HD layouts!"); - } - size_t d = input_QKV->data.shape[ndim - 1]; - size_t t = 0; - NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); - if (qkv_format == NVTE_QKV_Format::NVTE_THD) { - t = input_QKV->data.shape[0]; - } - - auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); - const NVTEDType QKV_type = static_cast(input_QKV->data.dtype); - - NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( - true, QKV_type, QKV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h, h, - max_seqlen, max_seqlen, d, d, window_size_left, window_size_right, false, cuda_graph, - deterministic); - - if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { -#if (CUDNN_VERSION >= 8901) - Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[0]); - - // Unpack QKV and dQKV and call the non-packed function - const auto QKV_type = input_QKV->data.dtype; - size_t stride = calculate_qkv_stride(layout_group, QKV_type, h, d); - std::vector unpacked_shape = calculate_qkv_unpacked_shape(input_QKV, h, d); - - // Create tensor views for Q, K, V and dQ, dK, dV - Tensor Q_view = make_tensor_view(input_QKV, unpacked_shape); - Tensor K_view = make_tensor_view(input_QKV, unpacked_shape, stride); - Tensor V_view = make_tensor_view(input_QKV, unpacked_shape, 2 * stride); - - Tensor dQ_view = make_tensor_view(output_dQKV, unpacked_shape); - Tensor dK_view = make_tensor_view(output_dQKV, unpacked_shape, stride); - Tensor dV_view = make_tensor_view(output_dQKV, unpacked_shape, 2 * stride); - - fused_attn_max_512_bwd(b, h, max_seqlen, max_seqlen, d, attn_scale, dropout, qkv_layout, - bias_type, 
attn_mask_type, &Q_view, &K_view, &V_view, input_dO, output_S, - &dQ_view, &dK_view, &dV_view, output_dBias, input_cu_seqlens, - input_cu_seqlens, wkspace, stream, handle); -#else - NVTE_ERROR("cuDNN 8.9.1 is required for BF16/FP16 fused attention with max_seqlen<=512. \n"); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { -#if (CUDNN_VERSION >= 8900) - size_t i = 0; - Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - Tensor *input_rng_state = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - Tensor *input_Bias, *input_SoftmaxOffset; - if ((bias_type != NVTE_NO_BIAS) && (bias_type != NVTE_ALIBI)) { - input_Bias = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - } - if (softmax_type != NVTE_VANILLA_SOFTMAX) { - input_SoftmaxOffset = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - } - - // Unpack QKV and dQKV and call the non-packed function - const auto QKV_type = input_QKV->data.dtype; - size_t stride = calculate_qkv_stride(layout_group, QKV_type, h, d); - std::vector unpacked_shape = calculate_qkv_unpacked_shape(input_QKV, h, d); - - // Create tensor views for Q, K, V and dQ, dK, dV - Tensor Q_view = make_tensor_view(input_QKV, unpacked_shape); - Tensor K_view = make_tensor_view(input_QKV, unpacked_shape, stride); - Tensor V_view = make_tensor_view(input_QKV, unpacked_shape, 2 * stride); - - Tensor dQ_view = make_tensor_view(output_dQKV, unpacked_shape); - Tensor dK_view = make_tensor_view(output_dQKV, unpacked_shape, stride); - Tensor dV_view = make_tensor_view(output_dQKV, unpacked_shape, 2 * stride); - - fused_attn_arbitrary_seqlen_bwd( - b, h, h, max_seqlen, max_seqlen, d, d, t, t, attn_scale, dropout, qkv_layout, bias_type, - attn_mask_type, softmax_type, window_size_left, window_size_right, bottom_right_diagonal, - deterministic, &Q_view, &K_view, &V_view, input_O, input_dO, input_Bias, - input_SoftmaxOffset, output_S, &dQ_view, &dK_view, &dV_view, output_dBias, - output_dSoftmaxOffset, input_cu_seqlens, input_cu_seqlens, input_cu_seqlens_padded, - input_cu_seqlens_padded, input_rng_state, wkspace, stream, handle); -#else - const char *err_msg = - "cuDNN 8.9.0 is required for BF16/FP16 fused attention " - "with arbitrary sequence length. 
\n"; - NVTE_ERROR(err_msg); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { -#if (CUDNN_VERSION >= 8900) - const Tensor *input_M = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[0]); - const Tensor *input_ZInv = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[1]); - const Tensor *input_rng_state = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[2]); - - // Unpack QKV and dQKV and call the non-packed function - const auto QKV_type = input_QKV->data.dtype; - size_t stride = calculate_qkv_stride(layout_group, QKV_type, h, d); - std::vector unpacked_shape = calculate_qkv_unpacked_shape(input_QKV, h, d); - - // Create tensor views for Q, K, V and dQ, dK, dV - Tensor Q_view = make_tensor_view(input_QKV, unpacked_shape); - Tensor K_view = make_tensor_view(input_QKV, unpacked_shape, stride); - Tensor V_view = make_tensor_view(input_QKV, unpacked_shape, 2 * stride); - - Tensor dQ_view = make_tensor_view(output_dQKV, unpacked_shape); - Tensor dK_view = make_tensor_view(output_dQKV, unpacked_shape, stride); - Tensor dV_view = make_tensor_view(output_dQKV, unpacked_shape, 2 * stride); - - fused_attn_fp8_bwd(b, h, h, max_seqlen, max_seqlen, d, attn_scale, dropout, qkv_layout, - bias_type, attn_mask_type, &Q_view, &K_view, &V_view, input_O, input_dO, - input_M, input_ZInv, input_S, input_output_dP, &dQ_view, &dK_view, &dV_view, - input_cu_seqlens, input_cu_seqlens, input_rng_state, wkspace, stream, - handle); -#else - NVTE_ERROR("cuDNN 8.9.0 is required for FP8 fused attention. \n"); -#endif - } else { - NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. \n"); - } -} -// NVTE fused attention FWD with packed KV -// DEPRECATED: This API is deprecated. -// Please use nvte_fused_attn_fwd with separate Q, K, V tensors instead. 
-void nvte_fused_attn_fwd_kvpacked( - const NVTETensor Q, const NVTETensor KV, const NVTETensor Bias, const NVTETensor SoftmaxOffset, - NVTETensor S, NVTETensor O, NVTETensorPack *Aux_CTX_Tensors, const NVTETensor cu_seqlens_q, - const NVTETensor cu_seqlens_kv, const NVTETensor cu_seqlens_q_padded, - const NVTETensor cu_seqlens_kv_padded, const NVTETensor page_table_k, - const NVTETensor page_table_v, const NVTETensor rng_state, size_t max_seqlen_q, - size_t max_seqlen_kv, bool is_training, bool return_max_logit, bool cuda_graph, - float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, - NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, - int64_t window_size_right, bool bottom_right_diagonal, NVTETensor workspace, - cudaStream_t stream) { - NVTE_API_CALL(nvte_flash_attn_fwd_kvpacked); - using namespace transformer_engine; - const Tensor *input_cu_seqlens_q = convertNVTETensorCheck(cu_seqlens_q); - const Tensor *input_cu_seqlens_kv = convertNVTETensorCheck(cu_seqlens_kv); - const Tensor *input_cu_seqlens_q_padded = convertNVTETensorCheck(cu_seqlens_q_padded); - const Tensor *input_cu_seqlens_kv_padded = convertNVTETensorCheck(cu_seqlens_kv_padded); - const Tensor *input_page_table_k = convertNVTETensorCheck(page_table_k); - const Tensor *input_page_table_v = convertNVTETensorCheck(page_table_v); - const Tensor *input_rng_state = convertNVTETensorCheck(rng_state); - const Tensor *input_Q = convertNVTETensorCheck(Q); - const Tensor *input_KV = convertNVTETensorCheck(KV); - const Tensor *input_Bias = convertNVTETensorCheck(Bias); - const Tensor *input_SoftmaxOffset = convertNVTETensorCheck(SoftmaxOffset); - Tensor *input_output_S = convertNVTETensorCheck(S); - Tensor *output_O = convertNVTETensorCheck(O); - Tensor *wkspace = convertNVTETensor(workspace); - - size_t b = input_cu_seqlens_q->data.shape[0] - 1; - auto ndim = input_Q->data.shape.size(); - size_t h_q = input_Q->data.shape[ndim - 2]; - size_t d = input_Q->data.shape[ndim - 1]; - auto ndim_kv = input_KV->data.shape.size(); - size_t h_kv = 0; - NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); - if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_2HD) { - h_kv = input_KV->data.shape[ndim_kv - 2]; - } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_H2D) { - h_kv = input_KV->data.shape[ndim_kv - 3]; - } else { - NVTE_ERROR("nvte_fused_attn_fwd_kvpacked only supports HD_H2D and HD_2HD layouts!"); - } - size_t t_q = 0; - size_t t_kv = 0; - NVTE_QKV_Format q_format = nvte_get_q_format(qkv_layout); - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - if (q_format == NVTE_QKV_Format::NVTE_THD) { - t_q = input_Q->data.shape[0]; - } - if (kv_format == NVTE_QKV_Format::NVTE_THD) { - t_kv = input_KV->data.shape[0]; - } - int64_t num_pages_k = 0; - int64_t num_pages_v = 0; - int64_t page_size_k = 0; - int64_t page_size_v = 0; - int64_t max_pages_per_seq_k = 0; - int64_t max_pages_per_seq_v = 0; - if (input_page_table_k->data.dptr != nullptr) { - max_pages_per_seq_k = input_page_table_k->data.shape[1]; - } - if (input_page_table_v->data.dptr != nullptr) { - max_pages_per_seq_v = input_page_table_v->data.shape[1]; - } - if (layout_group == NVTE_QKV_Layout_Group::NVTE_Paged_KV_HD_HD_HD) { - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - if (kv_format == NVTE_QKV_Format::NVTE_BSHD) { - num_pages_k = input_KV->data.shape[0]; - page_size_k = input_KV->data.shape[1]; - num_pages_v = num_pages_v; - page_size_v = page_size_v; - 
} else if (kv_format == NVTE_QKV_Format::NVTE_SBHD) { - num_pages_k = input_KV->data.shape[1]; - page_size_k = input_KV->data.shape[0]; - num_pages_v = num_pages_v; - page_size_v = page_size_v; - } - } - - auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); - const NVTEDType Q_type = static_cast(input_Q->data.dtype); - const NVTEDType KV_type = static_cast(input_KV->data.dtype); - - NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( - is_training, Q_type, KV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, - h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, d, window_size_left, window_size_right, - return_max_logit, cuda_graph, false); - - if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { -#if (CUDNN_VERSION >= 8901) - // Unpack KV and call the non-packed function - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - size_t stride = calculate_kv_stride(layout_group, input_Q->data.dtype, h_kv, d); - std::vector unpacked_kv_shape = - calculate_kv_unpacked_shape(input_KV, layout_group, kv_format, t_kv, h_kv, d); - - Tensor K_view = make_tensor_view(input_KV, unpacked_kv_shape); - Tensor V_view = make_tensor_view(input_KV, unpacked_kv_shape, stride); - - fused_attn_max_512_fwd(b, h_q, max_seqlen_q, max_seqlen_kv, d, is_training, attn_scale, dropout, - qkv_layout, bias_type, attn_mask_type, input_Q, &K_view, &V_view, - input_Bias, output_O, Aux_CTX_Tensors, input_cu_seqlens_q, - input_cu_seqlens_kv, input_rng_state, wkspace, stream, handle); -#else - NVTE_ERROR("cuDNN 8.9.1 is required for BF16/FP16 fused attention with max_seqlen<=512. \n"); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { -#if (CUDNN_VERSION >= 8903) - // Unpack KV and call the non-packed function - const auto Q_type = input_Q->data.dtype; - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - size_t stride = calculate_kv_stride(layout_group, Q_type, h_kv, d); - std::vector unpacked_kv_shape = - calculate_kv_unpacked_shape(input_KV, layout_group, kv_format, t_kv, h_kv, d); - - Tensor K_view = make_tensor_view(input_KV, unpacked_kv_shape); - Tensor V_view = make_tensor_view(input_KV, unpacked_kv_shape, stride); - - fused_attn_arbitrary_seqlen_fwd( - b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, d, t_q, t_kv, num_pages_k, num_pages_v, - page_size_k, page_size_v, max_pages_per_seq_k, max_pages_per_seq_v, is_training, - return_max_logit, attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, - window_size_left, window_size_right, bottom_right_diagonal, input_Q, &K_view, &V_view, - input_Bias, input_SoftmaxOffset, output_O, Aux_CTX_Tensors, input_cu_seqlens_q, - input_cu_seqlens_kv, input_cu_seqlens_q_padded, input_cu_seqlens_kv_padded, - input_page_table_k, input_page_table_v, input_rng_state, wkspace, stream, handle); -#else - NVTE_ERROR( - "cuDNN 8.9.3 is required for BF16/FP16 fused attention with arbitrary sequence length. 
" - "\n"); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { -#if (CUDNN_VERSION >= 8900) - // Unpack KV and call the non-packed function - const auto Q_type = input_Q->data.dtype; - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - size_t stride = calculate_kv_stride(layout_group, Q_type, h_kv, d); - std::vector unpacked_kv_shape = - calculate_kv_unpacked_shape(input_KV, layout_group, kv_format, t_kv, h_kv, d); - - Tensor K_view = make_tensor_view(input_KV, unpacked_kv_shape); - Tensor V_view = make_tensor_view(input_KV, unpacked_kv_shape, stride); - - fused_attn_fp8_fwd(b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, is_training, attn_scale, - dropout, qkv_layout, bias_type, attn_mask_type, input_Q, &K_view, &V_view, - input_output_S, output_O, Aux_CTX_Tensors, input_cu_seqlens_q, - input_cu_seqlens_kv, input_rng_state, wkspace, stream, handle); -#else - NVTE_ERROR("cuDNN 8.9.0 is required for FP8 fused attention. \n"); -#endif - } else { - NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. \n"); - } -} -// NVTE fused attention BWD with packed KV -// DEPRECATED: This API is deprecated. -// Please use nvte_fused_attn_bwd with separate Q, K, V tensors instead. -void nvte_fused_attn_bwd_kvpacked( - const NVTETensor Q, const NVTETensor KV, const NVTETensor O, const NVTETensor dO, - const NVTETensor S, NVTETensor dP, const NVTETensorPack *Aux_CTX_Tensors, NVTETensor dQ, - NVTETensor dKV, NVTETensor dBias, NVTETensor dSoftmaxOffset, const NVTETensor cu_seqlens_q, - const NVTETensor cu_seqlens_kv, const NVTETensor cu_seqlens_q_padded, - const NVTETensor cu_seqlens_kv_padded, size_t max_seqlen_q, size_t max_seqlen_kv, - float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, - NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, - int64_t window_size_right, bool bottom_right_diagonal, bool deterministic, bool cuda_graph, - NVTETensor workspace, cudaStream_t stream) { - NVTE_API_CALL(nvte_flash_attn_bwd_kvpacked); - using namespace transformer_engine; - const Tensor *input_cu_seqlens_q = convertNVTETensorCheck(cu_seqlens_q); - const Tensor *input_cu_seqlens_kv = convertNVTETensorCheck(cu_seqlens_kv); - const Tensor *input_cu_seqlens_q_padded = convertNVTETensorCheck(cu_seqlens_q_padded); - const Tensor *input_cu_seqlens_kv_padded = convertNVTETensorCheck(cu_seqlens_kv_padded); - const Tensor *input_Q = convertNVTETensorCheck(Q); - const Tensor *input_KV = convertNVTETensorCheck(KV); - const Tensor *input_O = convertNVTETensorCheck(O); - const Tensor *input_dO = convertNVTETensorCheck(dO); - const Tensor *input_S = convertNVTETensorCheck(S); - Tensor *input_output_dP = convertNVTETensorCheck(dP); - Tensor *output_dQ = convertNVTETensorCheck(dQ); - Tensor *output_dKV = convertNVTETensorCheck(dKV); - Tensor *output_dBias = convertNVTETensorCheck(dBias); - Tensor *output_dSoftmaxOffset = convertNVTETensorCheck(dSoftmaxOffset); - Tensor *wkspace = convertNVTETensor(workspace); - - size_t b = input_cu_seqlens_q->data.shape[0] - 1; - auto ndim = input_Q->data.shape.size(); - size_t h_q = input_Q->data.shape[ndim - 2]; - size_t d = input_Q->data.shape[ndim - 1]; - auto ndim_kv = input_KV->data.shape.size(); - size_t h_kv = 0; - NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); - if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_2HD) { - h_kv = input_KV->data.shape[ndim_kv - 2]; - } else if (layout_group == 
NVTE_QKV_Layout_Group::NVTE_HD_H2D) { - h_kv = input_KV->data.shape[ndim_kv - 3]; - } else { - NVTE_ERROR("nvte_fused_attn_fwd_kvpacked only supports HD_H2D and HD_2HD layouts!"); - } - size_t t_q = 0; - size_t t_kv = 0; - NVTE_QKV_Format q_format = nvte_get_q_format(qkv_layout); - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - if (q_format == NVTE_QKV_Format::NVTE_THD) { - t_q = input_Q->data.shape[0]; - } - if (kv_format == NVTE_QKV_Format::NVTE_THD) { - t_kv = input_KV->data.shape[0]; - } - - auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); - const NVTEDType Q_type = static_cast(input_Q->data.dtype); - const NVTEDType KV_type = static_cast(input_KV->data.dtype); - - NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( - true, Q_type, KV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h_q, - h_kv, max_seqlen_q, max_seqlen_kv, d, d, window_size_left, window_size_right, false, - cuda_graph, deterministic); - - if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { -#if (CUDNN_VERSION >= 8901) - Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[0]); - - // Unpack KV and dKV and call the non-packed function - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - size_t stride = calculate_kv_stride(layout_group, input_Q->data.dtype, h_kv, d); - std::vector unpacked_kv_shape = - calculate_kv_unpacked_shape(input_KV, layout_group, kv_format, t_kv, h_kv, d); - - Tensor K_view = make_tensor_view(input_KV, unpacked_kv_shape); - Tensor V_view = make_tensor_view(input_KV, unpacked_kv_shape, stride); - - Tensor dK_view = make_tensor_view(output_dKV, unpacked_kv_shape); - Tensor dV_view = make_tensor_view(output_dKV, unpacked_kv_shape, stride); - - fused_attn_max_512_bwd(b, h_q, max_seqlen_q, max_seqlen_kv, d, attn_scale, dropout, qkv_layout, - bias_type, attn_mask_type, input_Q, &K_view, &V_view, input_dO, output_S, - output_dQ, &dK_view, &dV_view, output_dBias, input_cu_seqlens_q, - input_cu_seqlens_kv, wkspace, stream, handle); -#else - NVTE_ERROR("cuDNN 8.9.1 is required for BF16/FP16 fused attention with max_seqlen<=512. 
\n"); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { -#if (CUDNN_VERSION >= 8903) - size_t i = 0; - Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - Tensor *input_rng_state = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - Tensor *input_Bias, *input_SoftmaxOffset; - if ((bias_type != NVTE_NO_BIAS) && (bias_type != NVTE_ALIBI)) { - input_Bias = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - } - if (softmax_type != NVTE_VANILLA_SOFTMAX) { - input_SoftmaxOffset = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - } - - // Unpack KV and dKV and call the non-packed function - const auto Q_type = input_Q->data.dtype; - NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - size_t stride = calculate_kv_stride(layout_group, Q_type, h_kv, d); - std::vector unpacked_kv_shape = - calculate_kv_unpacked_shape(input_KV, layout_group, kv_format, t_kv, h_kv, d); - - Tensor K_view = make_tensor_view(input_KV, unpacked_kv_shape); - Tensor V_view = make_tensor_view(input_KV, unpacked_kv_shape, stride); - - // Create tensor views for dK, dV - Tensor dK_view = make_tensor_view(output_dKV, unpacked_kv_shape); - Tensor dV_view = make_tensor_view(output_dKV, unpacked_kv_shape, stride); - - fused_attn_arbitrary_seqlen_bwd( - b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, d, t_q, t_kv, attn_scale, dropout, qkv_layout, - bias_type, attn_mask_type, softmax_type, window_size_left, window_size_right, - bottom_right_diagonal, deterministic, input_Q, &K_view, &V_view, input_O, input_dO, - input_Bias, input_SoftmaxOffset, output_S, output_dQ, &dK_view, &dV_view, output_dBias, - output_dSoftmaxOffset, input_cu_seqlens_q, input_cu_seqlens_kv, input_cu_seqlens_q_padded, - input_cu_seqlens_kv_padded, input_rng_state, wkspace, stream, handle); -#else - const char *err_msg = - "cuDNN 8.9.3 is required for BF16/FP16 fused attention " - "with arbitrary sequence length. \n"; - NVTE_ERROR(err_msg); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { -#if (CUDNN_VERSION >= 8900) - const Tensor *input_M = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[0]); - const Tensor *input_ZInv = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[1]); - const Tensor *input_rng_state = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[2]); - - // Unpack KV and dKV and call the non-packed function - const auto Q_type = input_Q->data.dtype; - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - size_t stride = calculate_kv_stride(layout_group, Q_type, h_kv, d); - std::vector unpacked_kv_shape = - calculate_kv_unpacked_shape(input_KV, layout_group, kv_format, t_kv, h_kv, d); - - Tensor K_view = make_tensor_view(input_KV, unpacked_kv_shape); - Tensor V_view = make_tensor_view(input_KV, unpacked_kv_shape, stride); - - Tensor dK_view = make_tensor_view(output_dKV, unpacked_kv_shape); - Tensor dV_view = make_tensor_view(output_dKV, unpacked_kv_shape, stride); - - fused_attn_fp8_bwd(b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, attn_scale, dropout, - qkv_layout, bias_type, attn_mask_type, input_Q, &K_view, &V_view, input_O, - input_dO, input_M, input_ZInv, input_S, input_output_dP, output_dQ, &dK_view, - &dV_view, input_cu_seqlens_q, input_cu_seqlens_kv, input_rng_state, wkspace, - stream, handle); -#else - NVTE_ERROR("cuDNN 8.9.0 is required for FP8 fused attention. 
\n"); -#endif - } else { - NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. \n"); - } -} // NVTE fused attention FWD with separate Q, K and V void nvte_fused_attn_fwd(const NVTETensor Q, const NVTETensor K, const NVTETensor V, const NVTETensor Bias, const NVTETensor SoftmaxOffset, NVTETensor S, diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h index cddd3d7506..8169bf22e2 100644 --- a/transformer_engine/common/include/transformer_engine/fused_attn.h +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -217,290 +217,6 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic); -/*! \brief Compute dot product attention with packed QKV input. - * - * \deprecated Please use `nvte_fused_attn_fwd` with separate Q, K, V tensors instead. - * - * Computes: - * - P = Q * Transpose(K) + Bias - * - S = ScaleMaskSoftmax(P) - * - D = Dropout(S) - * - O = D * Transpose(V) - * - * Support Matrix: - \verbatim - | backend | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | - | 0 | FP16/BF16 | BS3HD,SB3HD | NO/POST_SCALE_BIAS | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | <= 512, % 64 == 0 | 64 | - | 1 | FP16/BF16 | BS3HD,SB3HD,BSH3D,SBH3D | NO/POST_SCALE_BIAS/ALIBI | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | > 512, % 64 == 0 | <= 128, % 8 == 0 | - | 2 | FP8 | T3HD | NO_BIAS | PADDING_MASK | Yes | <= 512, % 64 == 0 | 64 | - \endverbatim - * - * Notes: - * - * Tensor `cu_seqlens_padded` helps identify the correct offsets of different sequences - * in tensors Q, K, V and O. - * When the QKV format (`nvte_get_qkv_format(qkv_layout)`) is `bshd` or `sbhd`, - * the offset tensor is not used in the attention calculation and can be set to empty `NVTETensor`. - * When the QKV format is `thd`, this tensor should follow the following rules. - * When there is no padding between sequences, the offset tensor should be equal to `cu_seqlens`, - * When there is padding between sequences, users are responsible to adjust the offsets as needed. - * For example, a tensor of 4 sequences `[a, PAD, b, b, c, PAD, PAD, d, d]` should have - * `cu_seqlens = [0, 1, 3, 4, 6]` and `cu_seqlens_padded= [0, 2, 4, 7, 9]`. - * - * \param[in] QKV The QKV tensor in packed format, H3D or 3HD. - * \param[in] Bias The Bias tensor. - * \param[in] SoftmaxOffset The SoftmaxOffset tensor. - * \param[in,out] S The S tensor. - * \param[out] O The output O tensor. - * \param[out] Aux_CTX_Tensors Auxiliary output tensors when training, - * e.g. M, ZInv, rng_state. - * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. - * \param[in] cu_seqlens_padded Cumulative sequence offsets for QKV, [batch_size + 1]. - * \param[in] rng_state Seed and offset of CUDA random number generator. - * \param[in] max_seqlen Max sequence length used for computing, - * it may be >= max(seqlen_i) for i=0,...batch_size-1. - * \param[in] is_training Whether this is in training mode or inference. - * \param[in] return_max_logit Whether to produce Max and Sum_Exp, or Stats. - * \param[in] cuda_graph Whether cuda graph capture is enabled or not. - * \param[in] attn_scale Scaling factor for Q * K.T. - * \param[in] dropout Dropout probability. - * \param[in] qkv_layout QKV tensor's layout. 
- * \param[in] bias_type Bias type. - * \param[in] attn_mask_type Attention mask type. - * \param[in] softmax_type Attention softmax type. - * \param[in] window_size_left Sliding window size (the left half). - * \param[in] window_size_right Sliding window size (the right half). - * \param[in] bottom_right_diagonal Whether to align sliding window and ALiBi diagonal to the bottom right corner of the softmax matrix. - * \param[in] workspace Workspace tensor. - * \param[in] stream CUDA stream used for this operation. - */ -[[deprecated( - "nvte_fused_attn_fwd_qkvpacked() is deprecated. Please use nvte_fused_attn_fwd() with separate " - "Q, K, V tensors instead.")]] -void nvte_fused_attn_fwd_qkvpacked( - const NVTETensor QKV, const NVTETensor Bias, const NVTETensor SoftmaxOffset, NVTETensor S, - NVTETensor O, NVTETensorPack *Aux_CTX_Tensors, const NVTETensor cu_seqlens, - const NVTETensor cu_seqlens_padded, const NVTETensor rng_state, size_t max_seqlen, - bool is_training, bool return_max_logit, bool cuda_graph, float attn_scale, float dropout, - NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, - NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, - bool bottom_right_diagonal, NVTETensor workspace, cudaStream_t stream); - -/*! \brief Compute the backward of the dot product attention with packed QKV input. - * - * \deprecated Please use `nvte_fused_attn_bwd` with separate Q, K, V tensors instead. - * - * Support Matrix: - \verbatim - | backend | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | - | 0 | FP16/BF16 | BS3HD,SB3HD | NO/POST_SCALE_BIAS | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | <= 512, % 64 == 0 | 64 | - | 1 | FP16/BF16 | BS3HD,SB3HD,BSH3D,SBH3D | NO/POST_SCALE_BIAS/ALIBI | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | > 512, % 64 == 0 | <= 128, % 8 == 0 | - | 2 | FP8 | T3HD | NO_BIAS | PADDING_MASK | Yes | <= 512, % 64 == 0 | 64 | - \endverbatim - * - * Notes: - * - * Tensor `cu_seqlens_padded` helps identify the correct offsets of different sequences - * in tensors Q, K, V and O. - * When the QKV format (`nvte_get_qkv_format(qkv_layout)`) is `bshd` or `sbhd`, - * the offset tensor is not used in the attention calculation and can be set to empty `NVTETensor`. - * When the QKV format is `thd`, this tensor should follow the following rules. - * When there is no padding between sequences, the offset tensor should be equal to `cu_seqlens`, - * When there is padding between sequences, users are responsible to adjust the offsets as needed. - * For example, a tensor of 4 sequences `[a, PAD, b, b, c, PAD, PAD, d, d]` should have - * `cu_seqlens = [0, 1, 3, 4, 6]` and `cu_seqlens_padded= [0, 2, 4, 7, 9]`. - * - * \param[in] QKV The QKV tensor in packed format, H3D or 3HD. - * \param[in] O The O tensor from forward. - * \param[in] dO The gradient of the O tensor. - * \param[in] S The S tensor. - * \param[in,out] dP The gradient of the P tensor. - * \param[in] Aux_CTX_Tensors Auxiliary tensors from context when in training mode, - * e.g. M, ZInv, rng_state. - * \param[out] dQKV The gradient of the QKV tensor. - * \param[out] dBias The gradient of the Bias tensor. - * \param[out] dSoftmaxOffset The gradient of the SoftmaxOffset tensor. - * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. - * \param[in] cu_seqlens_padded Cumulative sequence offsets for QKV, [batch_size + 1]. 
- * \param[in] max_seqlen Max sequence length used for computing, - * it may be >= max(seqlen_i) for i=0,...batch_size-1. - * \param[in] attn_scale Scaling factor for Q * K.T. - * \param[in] dropout Dropout probability. - * \param[in] qkv_layout QKV tensor's layout. - * \param[in] bias_type Bias type. - * \param[in] attn_mask_type Attention mask type. - * \param[in] softmax_type Attention softmax type. - * \param[in] window_size_left Sliding window size (the left half). - * \param[in] window_size_right Sliding window size (the right half). - * \param[in] bottom_right_diagonal Whether to align sliding window and ALiBi diagonal to the bottom right corner of the softmax matrix. - * \param[in] deterministic Whether to execute with deterministic behaviours. - * \param[in] cuda_graph Whether cuda graph capture is enabled or not. - * \param[in] workspace Workspace tensor. - * \param[in] stream CUDA stream used for this operation. - */ -[[deprecated( - "nvte_fused_attn_bwd_qkvpacked() is deprecated. Please use nvte_fused_attn_bwd() with separate " - "Q, K, V tensors instead.")]] -void nvte_fused_attn_bwd_qkvpacked( - const NVTETensor QKV, const NVTETensor O, const NVTETensor dO, const NVTETensor S, - NVTETensor dP, const NVTETensorPack *Aux_CTX_Tensors, NVTETensor dQKV, NVTETensor dBias, - NVTETensor dSoftmaxOffset, const NVTETensor cu_seqlens, const NVTETensor cu_seqlens_padded, - size_t max_seqlen, float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout, - NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, - int64_t window_size_left, int64_t window_size_right, bool bottom_right_diagonal, - bool deterministic, bool cuda_graph, NVTETensor workspace, cudaStream_t stream); - -/*! \brief Compute dot product attention with packed KV input. - * - * \deprecated Please use `nvte_fused_attn_fwd` with separate Q, K, V tensors instead. - * - * Computes: - * - P = Q * Transpose(K) + Bias - * - S = ScaleMaskSoftmax(P) - * - D = Dropout(S) - * - O = D * Transpose(V) - * - * Support Matrix: - \verbatim - | backend | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | - | 0 | FP16/BF16 | BSHD_BS2HD,SBHD_SB2HD | NO/POST_SCALE_BIAS | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | <= 512, % 64 == 0 | 64 | - | 1 | FP16/BF16 | BSHD_BS2HD,BSHD_BSH2D,SBHD_SB2HD,SBHD_SBH2D | NO/POST_SCALE_BIAS/ALIBI | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | > 512, % 64 == 0 | <= 128, % 8 == 0 | - \endverbatim - * - * Notes: - * - * Tensors `cu_seqlens_q_padded` and `cu_seqlens_kv_padded` - * help identify the correct offsets of different sequences in tensors Q, K, V and O. - * When the QKV format (`nvte_get_qkv_format(qkv_layout)`) is `bshd` or `sbhd`, - * offset tensors are not used in the attention calculation and can be set to empty `NVTETensor`s. - * When the QKV format is `thd`, these tensors should follow the following rules. - * When there is no padding between sequences, the offset tensors should be equal to - * `cu_seqlens_q` and `cu_seqlens_kv` respectively. - * When there is padding between sequences, users are responsible to adjust the offsets as needed. - * For example, a tensor of 4 sequences `[a, PAD, b, b, c, PAD, PAD, d, d]` should have - * `cu_seqlens = [0, 1, 3, 4, 6]` and `cu_seqlens_padded= [0, 2, 4, 7, 9]`. - * - * \param[in] Q The Q tensor, in HD layouts. - * \param[in] KV The KV tensor, in 2HD or H2D layouts. - * \param[in] Bias The Bias tensor. - * \param[in] SoftmaxOffset The SoftmaxOffset tensor. - * \param[in,out] S The S tensor. 
- * \param[out] O The output O tensor. - * \param[out] Aux_CTX_Tensors Auxiliary output tensors when training, - * e.g. M, ZInv, rng_state. - * \param[in] cu_seqlens_q Cumulative sequence lengths for Q, [batch_size + 1]. - * \param[in] cu_seqlens_kv Cumulative sequence lengths for KV, [batch_size + 1]. - * \param[in] cu_seqlens_q_padded Cumulative sequence offsets for Q, [batch_size + 1]. - * \param[in] cu_seqlens_kv_padded Cumulative sequence offsets for KV, [batch_size + 1]. - * \param[in] page_table_k Page table for K cache, [batch_size, max_pages_per_seq_k]. - * \param[in] page_table_v Page table for V cache, [batch_size, max_pages_per_seq_v]. - * \param[in] rng_state Seed and offset of CUDA random number generator. - * \param[in] max_seqlen_q Max sequence length used for computing for Q. - * it may be >= max(seqlen_q_i) for i=0,...batch_size-1. - * \param[in] max_seqlen_kv Max sequence length used for computing for KV. - * it may be >= max(seqlen_kv_i) for i=0,...batch_size-1. - * \param[in] is_training Whether this is in training mode or inference. - * \param[in] return_max_logit Whether to produce Max and Sum_Exp, or Stats. - * \param[in] cuda_graph Whether cuda graph capture is enabled or not. - * \param[in] attn_scale Scaling factor for Q * K.T. - * \param[in] dropout Dropout probability. - * \param[in] qkv_layout QKV tensor's layout. - * \param[in] bias_type Bias type. - * \param[in] attn_mask_type Attention mask type. - * \param[in] softmax_type Attention softmax type. - * \param[in] window_size_left Sliding window size (the left half). - * \param[in] window_size_right Sliding window size (the right half). - * \param[in] bottom_right_diagonal Whether to align sliding window and ALiBi diagonal to the bottom right corner of the softmax matrix. - * \param[in] workspace Workspace tensor. - * \param[in] stream CUDA stream used for this operation. - */ -[[deprecated( - "nvte_fused_attn_fwd_kvpacked() is deprecated. Please use nvte_fused_attn_fwd() with separate " - "Q, K, V tensors instead.")]] -void nvte_fused_attn_fwd_kvpacked( - const NVTETensor Q, const NVTETensor KV, const NVTETensor Bias, const NVTETensor SoftmaxOffset, - NVTETensor S, NVTETensor O, NVTETensorPack *Aux_CTX_Tensors, const NVTETensor cu_seqlens_q, - const NVTETensor cu_seqlens_kv, const NVTETensor cu_seqlens_q_padded, - const NVTETensor cu_seqlens_kv_padded, const NVTETensor page_table_k, - const NVTETensor page_table_v, const NVTETensor rng_state, size_t max_seqlen_q, - size_t max_seqlen_kv, bool is_training, bool return_max_logit, bool cuda_graph, - float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, - NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, - int64_t window_size_right, bool bottom_right_diagonal, NVTETensor workspace, - cudaStream_t stream); - -/*! \brief Compute the backward of the dot product attention with packed KV input. - * - * \deprecated Please use `nvte_fused_attn_bwd` with separate Q, K, V tensors instead. 
- * - * Support Matrix: - \verbatim - | backend | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | - | 0 | FP16/BF16 | BSHD_BS2HD,SBHD_SB2HD | NO/POST_SCALE_BIAS | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | <= 512, % 64 == 0 | 64 | - | 1 | FP16/BF16 | BSHD_BS2HD,BSHD_BSH2D,SBHD_SB2HD,SBHD_SBH2D | NO/POST_SCALE_BIAS/ALIBI | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | > 512, % 64 == 0 | <= 128, % 8 == 0 | - \endverbatim - * - * Notes: - * - * Tensors `cu_seqlens_q_padded` and `cu_seqlens_kv_padded` - * help identify the correct offsets of different sequences in tensors Q, K, V and O. - * When the QKV format (`nvte_get_qkv_format(qkv_layout)`) is `bshd` or `sbhd`, - * offset tensors are not used in the attention calculation and can be set to empty `NVTETensor`s. - * When the QKV format is `thd`, these tensors should follow the following rules. - * When there is no padding between sequences, the offset tensors should be equal to - * `cu_seqlens_q` and `cu_seqlens_kv` respectively. - * When there is padding between sequences, users are responsible to adjust the offsets as needed. - * For example, a tensor of 4 sequences `[a, PAD, b, b, c, PAD, PAD, d, d]` should have - * `cu_seqlens = [0, 1, 3, 4, 6]` and `cu_seqlens_padded= [0, 2, 4, 7, 9]`. - * - * \param[in] Q The Q tensor, in HD layouts. - * \param[in] KV The KV tensor, in H2D or 2HD layouts. - * \param[in] O The O tensor from forward. - * \param[in] dO The gradient of the O tensor. - * \param[in] S The S tensor. - * \param[in,out] dP The gradient of the P tensor. - * \param[in] Aux_CTX_Tensors Auxiliary tensors from context when in training mode, - * e.g. M, ZInv, rng_state. - * \param[out] dQ The gradient of the Q tensor. - * \param[out] dKV The gradient of the KV tensor. - * \param[out] dBias The gradient of the Bias tensor. - * \param[out] dSoftmaxOffset The gradient of the SoftmaxOffset tensor. - * \param[in] cu_seqlens_q Cumulative sequence lengths for Q, [batch_size + 1]. - * \param[in] cu_seqlens_kv Cumulative sequence lengths for KV, [batch_size + 1]. - * \param[in] cu_seqlens_q_padded Cumulative sequence offsets for Q, [batch_size + 1]. - * \param[in] cu_seqlens_kv_padded Cumulative sequence offsets for KV, [batch_size + 1]. - * \param[in] max_seqlen_q Max sequence length used for computing for Q. - * it may be >= max(seqlen_q_i) for i=0,...batch_size-1. - * \param[in] max_seqlen_kv Max sequence length used for computing for KV. - * it may be >= max(seqlen_kv_i) for i=0,...batch_size-1. - * \param[in] attn_scale Scaling factor for Q * K.T. - * \param[in] dropout Dropout probability. - * \param[in] qkv_layout QKV tensor's layout. - * \param[in] bias_type Bias type. - * \param[in] attn_mask_type Attention mask type. - * \param[in] softmax_type Attention softmax type. - * \param[in] window_size_left Sliding window size (the left half). - * \param[in] window_size_right Sliding window size (the right half). - * \param[in] bottom_right_diagonal Whether to align sliding window and ALiBi diagonal to the bottom right corner of the softmax matrix. - * \param[in] deterministic Whether to execute with deterministic behaviours. - * \param[in] cuda_graph Whether cuda graph capture is enabled or not. - * \param[in] workspace Workspace tensor. - * \param[in] stream CUDA stream used for this operation. - */ -[[deprecated( - "nvte_fused_attn_bwd_kvpacked() is deprecated. 
Please use nvte_fused_attn_bwd() with separate "
-    "Q, K, V tensors instead.")]]
-void nvte_fused_attn_bwd_kvpacked(
-    const NVTETensor Q, const NVTETensor KV, const NVTETensor O, const NVTETensor dO,
-    const NVTETensor S, NVTETensor dP, const NVTETensorPack *Aux_CTX_Tensors, NVTETensor dQ,
-    NVTETensor dKV, NVTETensor dBias, NVTETensor dSoftmaxOffset, const NVTETensor cu_seqlens_q,
-    const NVTETensor cu_seqlens_kv, const NVTETensor cu_seqlens_q_padded,
-    const NVTETensor cu_seqlens_kv_padded, size_t max_seqlen_q, size_t max_seqlen_kv,
-    float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type,
-    NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left,
-    int64_t window_size_right, bool bottom_right_diagonal, bool deterministic, bool cuda_graph,
-    NVTETensor workspace, cudaStream_t stream);
-
 /*! \brief Compute dot product attention with separate Q, K and V.
  *
  * Computes:
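
Migration note: callers of the removed packed entry points can reproduce on their side the unpacking that the deleted helpers (make_tensor_view, calculate_qkv_stride, calculate_qkv_unpacked_shape) performed, and then call nvte_fused_attn_fwd / nvte_fused_attn_bwd with separate Q, K, V tensors. The sketch below is illustrative only, not part of the library: the helper names (View, qkv_stride_bytes, make_view, unpack_bs3hd) and the dtype_bits parameter are hypothetical, and the remaining arguments of nvte_fused_attn_fwd are unchanged from the non-packed API, so they are elided here.

// Illustrative caller-side sketch: split a packed QKV buffer into Q/K/V views
// before calling nvte_fused_attn_fwd. Names below are examples, not library API.
#include <cstddef>
#include <utility>
#include <vector>

struct View {
  void *dptr;                 // start of this tensor inside the packed buffer
  std::vector<size_t> shape;  // unpacked shape, e.g. {b, s, h, d} or {t, h, d}
};

// Byte stride between consecutive Q, K, V tensors inside a packed buffer:
//   3HD layouts (BS3HD, SB3HD, T3HD): tensors are h * d elements apart.
//   H3D layouts (BSH3D, SBH3D, TH3D): tensors are d elements apart.
inline size_t qkv_stride_bytes(bool is_3hd, size_t dtype_bits, size_t h, size_t d) {
  return (dtype_bits * (is_3hd ? h * d : d)) / 8;
}

inline View make_view(void *base, size_t offset_bytes, std::vector<size_t> shape) {
  return {static_cast<char *>(base) + offset_bytes, std::move(shape)};
}

// Example: a BS3HD buffer of shape {b, s, 3, h, d} becomes three {b, s, h, d} views.
inline void unpack_bs3hd(void *qkv, size_t b, size_t s, size_t h, size_t d,
                         size_t dtype_bits, View &q, View &k, View &v) {
  const size_t stride = qkv_stride_bytes(/*is_3hd=*/true, dtype_bits, h, d);
  const std::vector<size_t> shape = {b, s, h, d};
  q = make_view(qkv, 0 * stride, shape);
  k = make_view(qkv, 1 * stride, shape);
  v = make_view(qkv, 2 * stride, shape);
  // q, k, v are then wrapped as NVTETensors and passed to nvte_fused_attn_fwd
  // in place of the old packed QKV argument; gradients (dQ, dK, dV) are views
  // over the dQKV buffer constructed the same way for nvte_fused_attn_bwd.
}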