From 849077c9cf9f9da66eaec39da71907e775e2afca Mon Sep 17 00:00:00 2001
From: Sudhakar Singh
Date: Fri, 20 Feb 2026 15:14:18 -0800
Subject: [PATCH] remove deprecated qkv/kv_packed apis

Signed-off-by: Sudhakar Singh
---
 .../common/fused_attn/fused_attn.cpp          | 650 ------------------
 .../include/transformer_engine/fused_attn.h   | 284 --------
 2 files changed, 934 deletions(-)

diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp
index 4f8367aac7..496eba7421 100644
--- a/transformer_engine/common/fused_attn/fused_attn.cpp
+++ b/transformer_engine/common/fused_attn/fused_attn.cpp
@@ -15,74 +15,6 @@
 #include "fused_attn_fp8.h"
 #include "utils.h"
 
-namespace {
-// Helper function to create a tensor view with modified shape and optional pointer offset
-transformer_engine::Tensor make_tensor_view(const transformer_engine::Tensor *source,
-                                            const std::vector<size_t> &shape,
-                                            size_t offset_bytes = 0) {
-  transformer_engine::Tensor view = *source;
-  if (offset_bytes > 0) {
-    view.data.dptr = static_cast<void *>(static_cast<char *>(source->data.dptr) + offset_bytes);
-  }
-  view.data.shape = shape;
-  view.nvte_tensor = 0;  // Mark as unmanaged/local tensor view
-  return view;
-}
-
-// Helper function to calculate stride in bytes for packed QKV tensor unpacking
-size_t calculate_qkv_stride(NVTE_QKV_Layout_Group layout_group, transformer_engine::DType dtype,
-                            size_t h, size_t d) {
-  size_t stride = 0;
-  if (layout_group == NVTE_QKV_Layout_Group::NVTE_3HD) {
-    stride = (transformer_engine::typeToNumBits(dtype) * h * d) / 8;
-  } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_H3D) {
-    stride = (transformer_engine::typeToNumBits(dtype) * d) / 8;
-  }
-  return stride;
-}
-
-// Helper function to determine unpacked shape for QKV packed tensor
-std::vector<size_t> calculate_qkv_unpacked_shape(const transformer_engine::Tensor *qkv_tensor,
-                                                 size_t h, size_t d) {
-  std::vector<size_t> unpacked_shape;
-  if (qkv_tensor->data.shape.size() == 4) {
-    // T3HD or TH3D (4D) -> THD (3D): remove dimension "3" at position 1
-    unpacked_shape = {qkv_tensor->data.shape[0], h, d};
-  } else {
-    // BS3HD/SB3HD or BSH3D/SBH3D (5D) -> BSHD/SBHD (4D): remove dimension "3" at position 2
-    unpacked_shape = {qkv_tensor->data.shape[0], qkv_tensor->data.shape[1], h, d};
-  }
-  return unpacked_shape;
-}
-
-// Helper function to calculate stride for packed KV tensor unpacking
-size_t calculate_kv_stride(NVTE_QKV_Layout_Group layout_group, transformer_engine::DType dtype,
-                           size_t h_kv, size_t d) {
-  size_t stride = 0;
-  if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_2HD) {
-    stride = (transformer_engine::typeToNumBits(dtype) * h_kv * d) / 8;
-  } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_H2D) {
-    stride = (transformer_engine::typeToNumBits(dtype) * d) / 8;
-  }
-  return stride;
-}
-
-// Helper function to determine unpacked shape for KV packed tensor
-std::vector<size_t> calculate_kv_unpacked_shape(const transformer_engine::Tensor *kv_tensor,
-                                                NVTE_QKV_Layout_Group layout_group,
-                                                NVTE_QKV_Format kv_format, size_t t_kv, size_t h_kv,
-                                                size_t d) {
-  std::vector<size_t> unpacked_kv_shape;
-  if (kv_format == NVTE_QKV_Format::NVTE_THD) {
-    unpacked_kv_shape = {t_kv, h_kv, d};
-  } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_2HD ||
-             layout_group == NVTE_QKV_Layout_Group::NVTE_HD_H2D) {
-    unpacked_kv_shape = {kv_tensor->data.shape[0], kv_tensor->data.shape[1], h_kv, d};
-  }
-  return unpacked_kv_shape;
-}
-}  // namespace
-
 // map NVTE_QKV_Layout to NVTE_QKV_Layout_Group
 NVTE_QKV_Layout_Group nvte_get_qkv_layout_group(NVTE_QKV_Layout qkv_layout) {
   switch (qkv_layout) {
@@ -516,588 +448,6 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend(
   return backend;
 }
 
-// NVTE fused attention FWD with packed QKV
-// DEPRECATED: This API is deprecated.
-// Please use nvte_fused_attn_fwd with separate Q, K, V tensors instead.
-void nvte_fused_attn_fwd_qkvpacked(
-    const NVTETensor QKV, const NVTETensor Bias, const NVTETensor SoftmaxOffset, NVTETensor S,
-    NVTETensor O, NVTETensorPack *Aux_CTX_Tensors, const NVTETensor cu_seqlens,
-    const NVTETensor cu_seqlens_padded, const NVTETensor rng_state, size_t max_seqlen,
-    bool is_training, bool return_max_logit, bool cuda_graph, float attn_scale, float dropout,
-    NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type,
-    NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right,
-    bool bottom_right_diagonal, NVTETensor workspace, cudaStream_t stream) {
-  NVTE_API_CALL(nvte_flash_attn_fwd_qkvpacked);
-  using namespace transformer_engine;
-
-  const Tensor *input_cu_seqlens = convertNVTETensorCheck(cu_seqlens);
-  const Tensor *input_cu_seqlens_padded = convertNVTETensorCheck(cu_seqlens_padded);
-  const Tensor *input_rng_state = convertNVTETensorCheck(rng_state);
-  const Tensor *input_QKV = convertNVTETensorCheck(QKV);
-  const Tensor *input_Bias = convertNVTETensorCheck(Bias);
-  const Tensor *input_SoftmaxOffset = convertNVTETensorCheck(SoftmaxOffset);
-  Tensor *input_output_S = convertNVTETensorCheck(S);
-  Tensor *output_O = convertNVTETensorCheck(O);
-  Tensor *wkspace = convertNVTETensor(workspace);
-
-  auto ndim = input_QKV->data.shape.size();
-  size_t b = input_cu_seqlens->data.shape[0] - 1;
-  size_t h = 0;
-  NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout);
-  if (layout_group == NVTE_QKV_Layout_Group::NVTE_3HD) {
-    h = input_QKV->data.shape[ndim - 2];
-  } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_H3D) {
-    h = input_QKV->data.shape[ndim - 3];
-  } else {
-    NVTE_ERROR("nvte_fused_attn_fwd_qkvpacked only supports H3D and 3HD layouts!");
-  }
-  size_t d = input_QKV->data.shape[ndim - 1];
-  size_t t = 0;
-  NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout);
-  if (qkv_format == NVTE_QKV_Format::NVTE_THD) {
-    t = input_QKV->data.shape[0];
-  }
-
-  auto handle = cudnnExecutionPlanManager::Instance().GetHandle();
-  const NVTEDType QKV_type = static_cast<NVTEDType>(input_QKV->data.dtype);
-
-  NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend(
-      is_training, QKV_type, QKV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout,
-      h, h, max_seqlen, max_seqlen, d, d, window_size_left, window_size_right, return_max_logit,
-      cuda_graph, false);
-
-  if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) {
-#if (CUDNN_VERSION >= 8901)
-    // Unpack QKV and call the non-packed function
-    const auto QKV_type = input_QKV->data.dtype;
-    size_t stride = calculate_qkv_stride(layout_group, QKV_type, h, d);
-    std::vector<size_t> unpacked_shape = calculate_qkv_unpacked_shape(input_QKV, h, d);
-
-    // Create tensor views for Q, K, V
-    Tensor Q_view = make_tensor_view(input_QKV, unpacked_shape);
-    Tensor K_view = make_tensor_view(input_QKV, unpacked_shape, stride);
-    Tensor V_view = make_tensor_view(input_QKV, unpacked_shape, 2 * stride);
-
-    fused_attn_max_512_fwd(b, h, max_seqlen, max_seqlen, d, is_training, attn_scale, dropout,
-                           qkv_layout, bias_type, attn_mask_type, &Q_view, &K_view, &V_view,
-                           input_Bias, output_O,
Aux_CTX_Tensors, input_cu_seqlens, - input_cu_seqlens, input_rng_state, wkspace, stream, handle); -#else - NVTE_ERROR("cuDNN 8.9.1 is required for BF16/FP16 fused attention with max_seqlen<=512. \n"); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { -#if (CUDNN_VERSION >= 8900) - // Unpack QKV and call the non-packed function - const auto QKV_type = input_QKV->data.dtype; - size_t stride = calculate_qkv_stride(layout_group, QKV_type, h, d); - std::vector unpacked_shape = calculate_qkv_unpacked_shape(input_QKV, h, d); - - // Create tensor views for Q, K, V - Tensor Q_view = make_tensor_view(input_QKV, unpacked_shape); - Tensor K_view = make_tensor_view(input_QKV, unpacked_shape, stride); - Tensor V_view = make_tensor_view(input_QKV, unpacked_shape, 2 * stride); - - fused_attn_arbitrary_seqlen_fwd( - b, h, h, max_seqlen, max_seqlen, d, d, t, t, 0, 0, 0, 0, 0, 0, is_training, - return_max_logit, attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, - window_size_left, window_size_right, bottom_right_diagonal, &Q_view, &K_view, &V_view, - input_Bias, input_SoftmaxOffset, output_O, Aux_CTX_Tensors, input_cu_seqlens, - input_cu_seqlens, input_cu_seqlens_padded, input_cu_seqlens_padded, nullptr, nullptr, - input_rng_state, wkspace, stream, handle); -#else - NVTE_ERROR( - "cuDNN 8.9.0 is required for BF16/FP16 fused attention with arbitrary sequence length. " - "\n"); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { -#if (CUDNN_VERSION >= 8900) - // Unpack QKV and call the non-packed function - const auto QKV_type = input_QKV->data.dtype; - size_t stride = calculate_qkv_stride(layout_group, QKV_type, h, d); - std::vector unpacked_shape = calculate_qkv_unpacked_shape(input_QKV, h, d); - - // Create tensor views for Q, K, V - Tensor Q_view = make_tensor_view(input_QKV, unpacked_shape); - Tensor K_view = make_tensor_view(input_QKV, unpacked_shape, stride); - Tensor V_view = make_tensor_view(input_QKV, unpacked_shape, 2 * stride); - - fused_attn_fp8_fwd(b, h, h, max_seqlen, max_seqlen, d, is_training, attn_scale, dropout, - qkv_layout, bias_type, attn_mask_type, &Q_view, &K_view, &V_view, - input_output_S, output_O, Aux_CTX_Tensors, input_cu_seqlens, - input_cu_seqlens, input_rng_state, wkspace, stream, handle); -#else - NVTE_ERROR("cuDNN 8.9.0 is required for FP8 fused attention. \n"); -#endif - } else { - NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. \n"); - } -} -// NVTE fused attention BWD with packed QKV -// DEPRECATED: This API is deprecated. -// Please use nvte_fused_attn_bwd with separate Q, K, V tensors instead. 
-void nvte_fused_attn_bwd_qkvpacked( - const NVTETensor QKV, const NVTETensor O, const NVTETensor dO, const NVTETensor S, - NVTETensor dP, const NVTETensorPack *Aux_CTX_Tensors, NVTETensor dQKV, NVTETensor dBias, - NVTETensor dSoftmaxOffset, const NVTETensor cu_seqlens, const NVTETensor cu_seqlens_padded, - size_t max_seqlen, float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout, - NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, - int64_t window_size_left, int64_t window_size_right, bool bottom_right_diagonal, - bool deterministic, bool cuda_graph, NVTETensor workspace, cudaStream_t stream) { - NVTE_API_CALL(nvte_flash_attn_bwd_qkvpacked); - using namespace transformer_engine; - - const Tensor *input_cu_seqlens = convertNVTETensorCheck(cu_seqlens); - const Tensor *input_cu_seqlens_padded = convertNVTETensorCheck(cu_seqlens_padded); - const Tensor *input_QKV = convertNVTETensorCheck(QKV); - const Tensor *input_O = convertNVTETensorCheck(O); - const Tensor *input_dO = convertNVTETensorCheck(dO); - const Tensor *input_S = convertNVTETensorCheck(S); - Tensor *input_output_dP = convertNVTETensorCheck(dP); - Tensor *output_dQKV = convertNVTETensorCheck(dQKV); - Tensor *output_dBias = convertNVTETensorCheck(dBias); - Tensor *output_dSoftmaxOffset = convertNVTETensorCheck(dSoftmaxOffset); - Tensor *wkspace = convertNVTETensor(workspace); - - auto ndim = input_QKV->data.shape.size(); - size_t b = input_cu_seqlens->data.shape[0] - 1; - size_t h = 0; - NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); - if (layout_group == NVTE_QKV_Layout_Group::NVTE_3HD) { - h = input_QKV->data.shape[ndim - 2]; - } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_H3D) { - h = input_QKV->data.shape[ndim - 3]; - } else { - NVTE_ERROR("nvte_fused_attn_fwd_qkvpacked only supports H3D and 3HD layouts!"); - } - size_t d = input_QKV->data.shape[ndim - 1]; - size_t t = 0; - NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); - if (qkv_format == NVTE_QKV_Format::NVTE_THD) { - t = input_QKV->data.shape[0]; - } - - auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); - const NVTEDType QKV_type = static_cast(input_QKV->data.dtype); - - NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( - true, QKV_type, QKV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h, h, - max_seqlen, max_seqlen, d, d, window_size_left, window_size_right, false, cuda_graph, - deterministic); - - if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { -#if (CUDNN_VERSION >= 8901) - Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[0]); - - // Unpack QKV and dQKV and call the non-packed function - const auto QKV_type = input_QKV->data.dtype; - size_t stride = calculate_qkv_stride(layout_group, QKV_type, h, d); - std::vector unpacked_shape = calculate_qkv_unpacked_shape(input_QKV, h, d); - - // Create tensor views for Q, K, V and dQ, dK, dV - Tensor Q_view = make_tensor_view(input_QKV, unpacked_shape); - Tensor K_view = make_tensor_view(input_QKV, unpacked_shape, stride); - Tensor V_view = make_tensor_view(input_QKV, unpacked_shape, 2 * stride); - - Tensor dQ_view = make_tensor_view(output_dQKV, unpacked_shape); - Tensor dK_view = make_tensor_view(output_dQKV, unpacked_shape, stride); - Tensor dV_view = make_tensor_view(output_dQKV, unpacked_shape, 2 * stride); - - fused_attn_max_512_bwd(b, h, max_seqlen, max_seqlen, d, attn_scale, dropout, qkv_layout, - bias_type, 
attn_mask_type, &Q_view, &K_view, &V_view, input_dO, output_S, - &dQ_view, &dK_view, &dV_view, output_dBias, input_cu_seqlens, - input_cu_seqlens, wkspace, stream, handle); -#else - NVTE_ERROR("cuDNN 8.9.1 is required for BF16/FP16 fused attention with max_seqlen<=512. \n"); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { -#if (CUDNN_VERSION >= 8900) - size_t i = 0; - Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - Tensor *input_rng_state = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - Tensor *input_Bias, *input_SoftmaxOffset; - if ((bias_type != NVTE_NO_BIAS) && (bias_type != NVTE_ALIBI)) { - input_Bias = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - } - if (softmax_type != NVTE_VANILLA_SOFTMAX) { - input_SoftmaxOffset = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - } - - // Unpack QKV and dQKV and call the non-packed function - const auto QKV_type = input_QKV->data.dtype; - size_t stride = calculate_qkv_stride(layout_group, QKV_type, h, d); - std::vector unpacked_shape = calculate_qkv_unpacked_shape(input_QKV, h, d); - - // Create tensor views for Q, K, V and dQ, dK, dV - Tensor Q_view = make_tensor_view(input_QKV, unpacked_shape); - Tensor K_view = make_tensor_view(input_QKV, unpacked_shape, stride); - Tensor V_view = make_tensor_view(input_QKV, unpacked_shape, 2 * stride); - - Tensor dQ_view = make_tensor_view(output_dQKV, unpacked_shape); - Tensor dK_view = make_tensor_view(output_dQKV, unpacked_shape, stride); - Tensor dV_view = make_tensor_view(output_dQKV, unpacked_shape, 2 * stride); - - fused_attn_arbitrary_seqlen_bwd( - b, h, h, max_seqlen, max_seqlen, d, d, t, t, attn_scale, dropout, qkv_layout, bias_type, - attn_mask_type, softmax_type, window_size_left, window_size_right, bottom_right_diagonal, - deterministic, &Q_view, &K_view, &V_view, input_O, input_dO, input_Bias, - input_SoftmaxOffset, output_S, &dQ_view, &dK_view, &dV_view, output_dBias, - output_dSoftmaxOffset, input_cu_seqlens, input_cu_seqlens, input_cu_seqlens_padded, - input_cu_seqlens_padded, input_rng_state, wkspace, stream, handle); -#else - const char *err_msg = - "cuDNN 8.9.0 is required for BF16/FP16 fused attention " - "with arbitrary sequence length. 
\n"; - NVTE_ERROR(err_msg); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { -#if (CUDNN_VERSION >= 8900) - const Tensor *input_M = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[0]); - const Tensor *input_ZInv = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[1]); - const Tensor *input_rng_state = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[2]); - - // Unpack QKV and dQKV and call the non-packed function - const auto QKV_type = input_QKV->data.dtype; - size_t stride = calculate_qkv_stride(layout_group, QKV_type, h, d); - std::vector unpacked_shape = calculate_qkv_unpacked_shape(input_QKV, h, d); - - // Create tensor views for Q, K, V and dQ, dK, dV - Tensor Q_view = make_tensor_view(input_QKV, unpacked_shape); - Tensor K_view = make_tensor_view(input_QKV, unpacked_shape, stride); - Tensor V_view = make_tensor_view(input_QKV, unpacked_shape, 2 * stride); - - Tensor dQ_view = make_tensor_view(output_dQKV, unpacked_shape); - Tensor dK_view = make_tensor_view(output_dQKV, unpacked_shape, stride); - Tensor dV_view = make_tensor_view(output_dQKV, unpacked_shape, 2 * stride); - - fused_attn_fp8_bwd(b, h, h, max_seqlen, max_seqlen, d, attn_scale, dropout, qkv_layout, - bias_type, attn_mask_type, &Q_view, &K_view, &V_view, input_O, input_dO, - input_M, input_ZInv, input_S, input_output_dP, &dQ_view, &dK_view, &dV_view, - input_cu_seqlens, input_cu_seqlens, input_rng_state, wkspace, stream, - handle); -#else - NVTE_ERROR("cuDNN 8.9.0 is required for FP8 fused attention. \n"); -#endif - } else { - NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. \n"); - } -} -// NVTE fused attention FWD with packed KV -// DEPRECATED: This API is deprecated. -// Please use nvte_fused_attn_fwd with separate Q, K, V tensors instead. 
-void nvte_fused_attn_fwd_kvpacked( - const NVTETensor Q, const NVTETensor KV, const NVTETensor Bias, const NVTETensor SoftmaxOffset, - NVTETensor S, NVTETensor O, NVTETensorPack *Aux_CTX_Tensors, const NVTETensor cu_seqlens_q, - const NVTETensor cu_seqlens_kv, const NVTETensor cu_seqlens_q_padded, - const NVTETensor cu_seqlens_kv_padded, const NVTETensor page_table_k, - const NVTETensor page_table_v, const NVTETensor rng_state, size_t max_seqlen_q, - size_t max_seqlen_kv, bool is_training, bool return_max_logit, bool cuda_graph, - float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, - NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, - int64_t window_size_right, bool bottom_right_diagonal, NVTETensor workspace, - cudaStream_t stream) { - NVTE_API_CALL(nvte_flash_attn_fwd_kvpacked); - using namespace transformer_engine; - const Tensor *input_cu_seqlens_q = convertNVTETensorCheck(cu_seqlens_q); - const Tensor *input_cu_seqlens_kv = convertNVTETensorCheck(cu_seqlens_kv); - const Tensor *input_cu_seqlens_q_padded = convertNVTETensorCheck(cu_seqlens_q_padded); - const Tensor *input_cu_seqlens_kv_padded = convertNVTETensorCheck(cu_seqlens_kv_padded); - const Tensor *input_page_table_k = convertNVTETensorCheck(page_table_k); - const Tensor *input_page_table_v = convertNVTETensorCheck(page_table_v); - const Tensor *input_rng_state = convertNVTETensorCheck(rng_state); - const Tensor *input_Q = convertNVTETensorCheck(Q); - const Tensor *input_KV = convertNVTETensorCheck(KV); - const Tensor *input_Bias = convertNVTETensorCheck(Bias); - const Tensor *input_SoftmaxOffset = convertNVTETensorCheck(SoftmaxOffset); - Tensor *input_output_S = convertNVTETensorCheck(S); - Tensor *output_O = convertNVTETensorCheck(O); - Tensor *wkspace = convertNVTETensor(workspace); - - size_t b = input_cu_seqlens_q->data.shape[0] - 1; - auto ndim = input_Q->data.shape.size(); - size_t h_q = input_Q->data.shape[ndim - 2]; - size_t d = input_Q->data.shape[ndim - 1]; - auto ndim_kv = input_KV->data.shape.size(); - size_t h_kv = 0; - NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); - if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_2HD) { - h_kv = input_KV->data.shape[ndim_kv - 2]; - } else if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_H2D) { - h_kv = input_KV->data.shape[ndim_kv - 3]; - } else { - NVTE_ERROR("nvte_fused_attn_fwd_kvpacked only supports HD_H2D and HD_2HD layouts!"); - } - size_t t_q = 0; - size_t t_kv = 0; - NVTE_QKV_Format q_format = nvte_get_q_format(qkv_layout); - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - if (q_format == NVTE_QKV_Format::NVTE_THD) { - t_q = input_Q->data.shape[0]; - } - if (kv_format == NVTE_QKV_Format::NVTE_THD) { - t_kv = input_KV->data.shape[0]; - } - int64_t num_pages_k = 0; - int64_t num_pages_v = 0; - int64_t page_size_k = 0; - int64_t page_size_v = 0; - int64_t max_pages_per_seq_k = 0; - int64_t max_pages_per_seq_v = 0; - if (input_page_table_k->data.dptr != nullptr) { - max_pages_per_seq_k = input_page_table_k->data.shape[1]; - } - if (input_page_table_v->data.dptr != nullptr) { - max_pages_per_seq_v = input_page_table_v->data.shape[1]; - } - if (layout_group == NVTE_QKV_Layout_Group::NVTE_Paged_KV_HD_HD_HD) { - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - if (kv_format == NVTE_QKV_Format::NVTE_BSHD) { - num_pages_k = input_KV->data.shape[0]; - page_size_k = input_KV->data.shape[1]; - num_pages_v = num_pages_v; - page_size_v = page_size_v; - 
} else if (kv_format == NVTE_QKV_Format::NVTE_SBHD) { - num_pages_k = input_KV->data.shape[1]; - page_size_k = input_KV->data.shape[0]; - num_pages_v = num_pages_v; - page_size_v = page_size_v; - } - } - - auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); - const NVTEDType Q_type = static_cast(input_Q->data.dtype); - const NVTEDType KV_type = static_cast(input_KV->data.dtype); - - NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( - is_training, Q_type, KV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, - h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, d, window_size_left, window_size_right, - return_max_logit, cuda_graph, false); - - if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { -#if (CUDNN_VERSION >= 8901) - // Unpack KV and call the non-packed function - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - size_t stride = calculate_kv_stride(layout_group, input_Q->data.dtype, h_kv, d); - std::vector unpacked_kv_shape = - calculate_kv_unpacked_shape(input_KV, layout_group, kv_format, t_kv, h_kv, d); - - Tensor K_view = make_tensor_view(input_KV, unpacked_kv_shape); - Tensor V_view = make_tensor_view(input_KV, unpacked_kv_shape, stride); - - fused_attn_max_512_fwd(b, h_q, max_seqlen_q, max_seqlen_kv, d, is_training, attn_scale, dropout, - qkv_layout, bias_type, attn_mask_type, input_Q, &K_view, &V_view, - input_Bias, output_O, Aux_CTX_Tensors, input_cu_seqlens_q, - input_cu_seqlens_kv, input_rng_state, wkspace, stream, handle); -#else - NVTE_ERROR("cuDNN 8.9.1 is required for BF16/FP16 fused attention with max_seqlen<=512. \n"); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { -#if (CUDNN_VERSION >= 8903) - // Unpack KV and call the non-packed function - const auto Q_type = input_Q->data.dtype; - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - size_t stride = calculate_kv_stride(layout_group, Q_type, h_kv, d); - std::vector unpacked_kv_shape = - calculate_kv_unpacked_shape(input_KV, layout_group, kv_format, t_kv, h_kv, d); - - Tensor K_view = make_tensor_view(input_KV, unpacked_kv_shape); - Tensor V_view = make_tensor_view(input_KV, unpacked_kv_shape, stride); - - fused_attn_arbitrary_seqlen_fwd( - b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, d, t_q, t_kv, num_pages_k, num_pages_v, - page_size_k, page_size_v, max_pages_per_seq_k, max_pages_per_seq_v, is_training, - return_max_logit, attn_scale, dropout, qkv_layout, bias_type, attn_mask_type, softmax_type, - window_size_left, window_size_right, bottom_right_diagonal, input_Q, &K_view, &V_view, - input_Bias, input_SoftmaxOffset, output_O, Aux_CTX_Tensors, input_cu_seqlens_q, - input_cu_seqlens_kv, input_cu_seqlens_q_padded, input_cu_seqlens_kv_padded, - input_page_table_k, input_page_table_v, input_rng_state, wkspace, stream, handle); -#else - NVTE_ERROR( - "cuDNN 8.9.3 is required for BF16/FP16 fused attention with arbitrary sequence length. 
" - "\n"); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { -#if (CUDNN_VERSION >= 8900) - // Unpack KV and call the non-packed function - const auto Q_type = input_Q->data.dtype; - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - size_t stride = calculate_kv_stride(layout_group, Q_type, h_kv, d); - std::vector unpacked_kv_shape = - calculate_kv_unpacked_shape(input_KV, layout_group, kv_format, t_kv, h_kv, d); - - Tensor K_view = make_tensor_view(input_KV, unpacked_kv_shape); - Tensor V_view = make_tensor_view(input_KV, unpacked_kv_shape, stride); - - fused_attn_fp8_fwd(b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, is_training, attn_scale, - dropout, qkv_layout, bias_type, attn_mask_type, input_Q, &K_view, &V_view, - input_output_S, output_O, Aux_CTX_Tensors, input_cu_seqlens_q, - input_cu_seqlens_kv, input_rng_state, wkspace, stream, handle); -#else - NVTE_ERROR("cuDNN 8.9.0 is required for FP8 fused attention. \n"); -#endif - } else { - NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. \n"); - } -} -// NVTE fused attention BWD with packed KV -// DEPRECATED: This API is deprecated. -// Please use nvte_fused_attn_bwd with separate Q, K, V tensors instead. -void nvte_fused_attn_bwd_kvpacked( - const NVTETensor Q, const NVTETensor KV, const NVTETensor O, const NVTETensor dO, - const NVTETensor S, NVTETensor dP, const NVTETensorPack *Aux_CTX_Tensors, NVTETensor dQ, - NVTETensor dKV, NVTETensor dBias, NVTETensor dSoftmaxOffset, const NVTETensor cu_seqlens_q, - const NVTETensor cu_seqlens_kv, const NVTETensor cu_seqlens_q_padded, - const NVTETensor cu_seqlens_kv_padded, size_t max_seqlen_q, size_t max_seqlen_kv, - float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, - NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, - int64_t window_size_right, bool bottom_right_diagonal, bool deterministic, bool cuda_graph, - NVTETensor workspace, cudaStream_t stream) { - NVTE_API_CALL(nvte_flash_attn_bwd_kvpacked); - using namespace transformer_engine; - const Tensor *input_cu_seqlens_q = convertNVTETensorCheck(cu_seqlens_q); - const Tensor *input_cu_seqlens_kv = convertNVTETensorCheck(cu_seqlens_kv); - const Tensor *input_cu_seqlens_q_padded = convertNVTETensorCheck(cu_seqlens_q_padded); - const Tensor *input_cu_seqlens_kv_padded = convertNVTETensorCheck(cu_seqlens_kv_padded); - const Tensor *input_Q = convertNVTETensorCheck(Q); - const Tensor *input_KV = convertNVTETensorCheck(KV); - const Tensor *input_O = convertNVTETensorCheck(O); - const Tensor *input_dO = convertNVTETensorCheck(dO); - const Tensor *input_S = convertNVTETensorCheck(S); - Tensor *input_output_dP = convertNVTETensorCheck(dP); - Tensor *output_dQ = convertNVTETensorCheck(dQ); - Tensor *output_dKV = convertNVTETensorCheck(dKV); - Tensor *output_dBias = convertNVTETensorCheck(dBias); - Tensor *output_dSoftmaxOffset = convertNVTETensorCheck(dSoftmaxOffset); - Tensor *wkspace = convertNVTETensor(workspace); - - size_t b = input_cu_seqlens_q->data.shape[0] - 1; - auto ndim = input_Q->data.shape.size(); - size_t h_q = input_Q->data.shape[ndim - 2]; - size_t d = input_Q->data.shape[ndim - 1]; - auto ndim_kv = input_KV->data.shape.size(); - size_t h_kv = 0; - NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); - if (layout_group == NVTE_QKV_Layout_Group::NVTE_HD_2HD) { - h_kv = input_KV->data.shape[ndim_kv - 2]; - } else if (layout_group == 
NVTE_QKV_Layout_Group::NVTE_HD_H2D) { - h_kv = input_KV->data.shape[ndim_kv - 3]; - } else { - NVTE_ERROR("nvte_fused_attn_fwd_kvpacked only supports HD_H2D and HD_2HD layouts!"); - } - size_t t_q = 0; - size_t t_kv = 0; - NVTE_QKV_Format q_format = nvte_get_q_format(qkv_layout); - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - if (q_format == NVTE_QKV_Format::NVTE_THD) { - t_q = input_Q->data.shape[0]; - } - if (kv_format == NVTE_QKV_Format::NVTE_THD) { - t_kv = input_KV->data.shape[0]; - } - - auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); - const NVTEDType Q_type = static_cast(input_Q->data.dtype); - const NVTEDType KV_type = static_cast(input_KV->data.dtype); - - NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( - true, Q_type, KV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h_q, - h_kv, max_seqlen_q, max_seqlen_kv, d, d, window_size_left, window_size_right, false, - cuda_graph, deterministic); - - if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_max512_seqlen) { -#if (CUDNN_VERSION >= 8901) - Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[0]); - - // Unpack KV and dKV and call the non-packed function - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - size_t stride = calculate_kv_stride(layout_group, input_Q->data.dtype, h_kv, d); - std::vector unpacked_kv_shape = - calculate_kv_unpacked_shape(input_KV, layout_group, kv_format, t_kv, h_kv, d); - - Tensor K_view = make_tensor_view(input_KV, unpacked_kv_shape); - Tensor V_view = make_tensor_view(input_KV, unpacked_kv_shape, stride); - - Tensor dK_view = make_tensor_view(output_dKV, unpacked_kv_shape); - Tensor dV_view = make_tensor_view(output_dKV, unpacked_kv_shape, stride); - - fused_attn_max_512_bwd(b, h_q, max_seqlen_q, max_seqlen_kv, d, attn_scale, dropout, qkv_layout, - bias_type, attn_mask_type, input_Q, &K_view, &V_view, input_dO, output_S, - output_dQ, &dK_view, &dV_view, output_dBias, input_cu_seqlens_q, - input_cu_seqlens_kv, wkspace, stream, handle); -#else - NVTE_ERROR("cuDNN 8.9.1 is required for BF16/FP16 fused attention with max_seqlen<=512. 
\n"); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { -#if (CUDNN_VERSION >= 8903) - size_t i = 0; - Tensor *output_S = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - Tensor *input_rng_state = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - Tensor *input_Bias, *input_SoftmaxOffset; - if ((bias_type != NVTE_NO_BIAS) && (bias_type != NVTE_ALIBI)) { - input_Bias = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - } - if (softmax_type != NVTE_VANILLA_SOFTMAX) { - input_SoftmaxOffset = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[i++]); - } - - // Unpack KV and dKV and call the non-packed function - const auto Q_type = input_Q->data.dtype; - NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - size_t stride = calculate_kv_stride(layout_group, Q_type, h_kv, d); - std::vector unpacked_kv_shape = - calculate_kv_unpacked_shape(input_KV, layout_group, kv_format, t_kv, h_kv, d); - - Tensor K_view = make_tensor_view(input_KV, unpacked_kv_shape); - Tensor V_view = make_tensor_view(input_KV, unpacked_kv_shape, stride); - - // Create tensor views for dK, dV - Tensor dK_view = make_tensor_view(output_dKV, unpacked_kv_shape); - Tensor dV_view = make_tensor_view(output_dKV, unpacked_kv_shape, stride); - - fused_attn_arbitrary_seqlen_bwd( - b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, d, t_q, t_kv, attn_scale, dropout, qkv_layout, - bias_type, attn_mask_type, softmax_type, window_size_left, window_size_right, - bottom_right_diagonal, deterministic, input_Q, &K_view, &V_view, input_O, input_dO, - input_Bias, input_SoftmaxOffset, output_S, output_dQ, &dK_view, &dV_view, output_dBias, - output_dSoftmaxOffset, input_cu_seqlens_q, input_cu_seqlens_kv, input_cu_seqlens_q_padded, - input_cu_seqlens_kv_padded, input_rng_state, wkspace, stream, handle); -#else - const char *err_msg = - "cuDNN 8.9.3 is required for BF16/FP16 fused attention " - "with arbitrary sequence length. \n"; - NVTE_ERROR(err_msg); -#endif - } else if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_FP8) { -#if (CUDNN_VERSION >= 8900) - const Tensor *input_M = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[0]); - const Tensor *input_ZInv = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[1]); - const Tensor *input_rng_state = convertNVTETensorCheck(Aux_CTX_Tensors->tensors[2]); - - // Unpack KV and dKV and call the non-packed function - const auto Q_type = input_Q->data.dtype; - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - size_t stride = calculate_kv_stride(layout_group, Q_type, h_kv, d); - std::vector unpacked_kv_shape = - calculate_kv_unpacked_shape(input_KV, layout_group, kv_format, t_kv, h_kv, d); - - Tensor K_view = make_tensor_view(input_KV, unpacked_kv_shape); - Tensor V_view = make_tensor_view(input_KV, unpacked_kv_shape, stride); - - Tensor dK_view = make_tensor_view(output_dKV, unpacked_kv_shape); - Tensor dV_view = make_tensor_view(output_dKV, unpacked_kv_shape, stride); - - fused_attn_fp8_bwd(b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d, attn_scale, dropout, - qkv_layout, bias_type, attn_mask_type, input_Q, &K_view, &V_view, input_O, - input_dO, input_M, input_ZInv, input_S, input_output_dP, output_dQ, &dK_view, - &dV_view, input_cu_seqlens_q, input_cu_seqlens_kv, input_rng_state, wkspace, - stream, handle); -#else - NVTE_ERROR("cuDNN 8.9.0 is required for FP8 fused attention. 
\n"); -#endif - } else { - NVTE_ERROR("Invalid combination of data type and sequence length for fused attention. \n"); - } -} // NVTE fused attention FWD with separate Q, K and V void nvte_fused_attn_fwd(const NVTETensor Q, const NVTETensor K, const NVTETensor V, const NVTETensor Bias, const NVTETensor SoftmaxOffset, NVTETensor S, diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h index cddd3d7506..8169bf22e2 100644 --- a/transformer_engine/common/include/transformer_engine/fused_attn.h +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -217,290 +217,6 @@ NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic); -/*! \brief Compute dot product attention with packed QKV input. - * - * \deprecated Please use `nvte_fused_attn_fwd` with separate Q, K, V tensors instead. - * - * Computes: - * - P = Q * Transpose(K) + Bias - * - S = ScaleMaskSoftmax(P) - * - D = Dropout(S) - * - O = D * Transpose(V) - * - * Support Matrix: - \verbatim - | backend | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | - | 0 | FP16/BF16 | BS3HD,SB3HD | NO/POST_SCALE_BIAS | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | <= 512, % 64 == 0 | 64 | - | 1 | FP16/BF16 | BS3HD,SB3HD,BSH3D,SBH3D | NO/POST_SCALE_BIAS/ALIBI | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | > 512, % 64 == 0 | <= 128, % 8 == 0 | - | 2 | FP8 | T3HD | NO_BIAS | PADDING_MASK | Yes | <= 512, % 64 == 0 | 64 | - \endverbatim - * - * Notes: - * - * Tensor `cu_seqlens_padded` helps identify the correct offsets of different sequences - * in tensors Q, K, V and O. - * When the QKV format (`nvte_get_qkv_format(qkv_layout)`) is `bshd` or `sbhd`, - * the offset tensor is not used in the attention calculation and can be set to empty `NVTETensor`. - * When the QKV format is `thd`, this tensor should follow the following rules. - * When there is no padding between sequences, the offset tensor should be equal to `cu_seqlens`, - * When there is padding between sequences, users are responsible to adjust the offsets as needed. - * For example, a tensor of 4 sequences `[a, PAD, b, b, c, PAD, PAD, d, d]` should have - * `cu_seqlens = [0, 1, 3, 4, 6]` and `cu_seqlens_padded= [0, 2, 4, 7, 9]`. - * - * \param[in] QKV The QKV tensor in packed format, H3D or 3HD. - * \param[in] Bias The Bias tensor. - * \param[in] SoftmaxOffset The SoftmaxOffset tensor. - * \param[in,out] S The S tensor. - * \param[out] O The output O tensor. - * \param[out] Aux_CTX_Tensors Auxiliary output tensors when training, - * e.g. M, ZInv, rng_state. - * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. - * \param[in] cu_seqlens_padded Cumulative sequence offsets for QKV, [batch_size + 1]. - * \param[in] rng_state Seed and offset of CUDA random number generator. - * \param[in] max_seqlen Max sequence length used for computing, - * it may be >= max(seqlen_i) for i=0,...batch_size-1. - * \param[in] is_training Whether this is in training mode or inference. - * \param[in] return_max_logit Whether to produce Max and Sum_Exp, or Stats. - * \param[in] cuda_graph Whether cuda graph capture is enabled or not. - * \param[in] attn_scale Scaling factor for Q * K.T. - * \param[in] dropout Dropout probability. - * \param[in] qkv_layout QKV tensor's layout. 
- * \param[in] bias_type Bias type. - * \param[in] attn_mask_type Attention mask type. - * \param[in] softmax_type Attention softmax type. - * \param[in] window_size_left Sliding window size (the left half). - * \param[in] window_size_right Sliding window size (the right half). - * \param[in] bottom_right_diagonal Whether to align sliding window and ALiBi diagonal to the bottom right corner of the softmax matrix. - * \param[in] workspace Workspace tensor. - * \param[in] stream CUDA stream used for this operation. - */ -[[deprecated( - "nvte_fused_attn_fwd_qkvpacked() is deprecated. Please use nvte_fused_attn_fwd() with separate " - "Q, K, V tensors instead.")]] -void nvte_fused_attn_fwd_qkvpacked( - const NVTETensor QKV, const NVTETensor Bias, const NVTETensor SoftmaxOffset, NVTETensor S, - NVTETensor O, NVTETensorPack *Aux_CTX_Tensors, const NVTETensor cu_seqlens, - const NVTETensor cu_seqlens_padded, const NVTETensor rng_state, size_t max_seqlen, - bool is_training, bool return_max_logit, bool cuda_graph, float attn_scale, float dropout, - NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, - NVTE_Softmax_Type softmax_type, int64_t window_size_left, int64_t window_size_right, - bool bottom_right_diagonal, NVTETensor workspace, cudaStream_t stream); - -/*! \brief Compute the backward of the dot product attention with packed QKV input. - * - * \deprecated Please use `nvte_fused_attn_bwd` with separate Q, K, V tensors instead. - * - * Support Matrix: - \verbatim - | backend | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | - | 0 | FP16/BF16 | BS3HD,SB3HD | NO/POST_SCALE_BIAS | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | <= 512, % 64 == 0 | 64 | - | 1 | FP16/BF16 | BS3HD,SB3HD,BSH3D,SBH3D | NO/POST_SCALE_BIAS/ALIBI | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | > 512, % 64 == 0 | <= 128, % 8 == 0 | - | 2 | FP8 | T3HD | NO_BIAS | PADDING_MASK | Yes | <= 512, % 64 == 0 | 64 | - \endverbatim - * - * Notes: - * - * Tensor `cu_seqlens_padded` helps identify the correct offsets of different sequences - * in tensors Q, K, V and O. - * When the QKV format (`nvte_get_qkv_format(qkv_layout)`) is `bshd` or `sbhd`, - * the offset tensor is not used in the attention calculation and can be set to empty `NVTETensor`. - * When the QKV format is `thd`, this tensor should follow the following rules. - * When there is no padding between sequences, the offset tensor should be equal to `cu_seqlens`, - * When there is padding between sequences, users are responsible to adjust the offsets as needed. - * For example, a tensor of 4 sequences `[a, PAD, b, b, c, PAD, PAD, d, d]` should have - * `cu_seqlens = [0, 1, 3, 4, 6]` and `cu_seqlens_padded= [0, 2, 4, 7, 9]`. - * - * \param[in] QKV The QKV tensor in packed format, H3D or 3HD. - * \param[in] O The O tensor from forward. - * \param[in] dO The gradient of the O tensor. - * \param[in] S The S tensor. - * \param[in,out] dP The gradient of the P tensor. - * \param[in] Aux_CTX_Tensors Auxiliary tensors from context when in training mode, - * e.g. M, ZInv, rng_state. - * \param[out] dQKV The gradient of the QKV tensor. - * \param[out] dBias The gradient of the Bias tensor. - * \param[out] dSoftmaxOffset The gradient of the SoftmaxOffset tensor. - * \param[in] cu_seqlens Cumulative sequence lengths, [batch_size + 1]. - * \param[in] cu_seqlens_padded Cumulative sequence offsets for QKV, [batch_size + 1]. 
- * \param[in] max_seqlen Max sequence length used for computing, - * it may be >= max(seqlen_i) for i=0,...batch_size-1. - * \param[in] attn_scale Scaling factor for Q * K.T. - * \param[in] dropout Dropout probability. - * \param[in] qkv_layout QKV tensor's layout. - * \param[in] bias_type Bias type. - * \param[in] attn_mask_type Attention mask type. - * \param[in] softmax_type Attention softmax type. - * \param[in] window_size_left Sliding window size (the left half). - * \param[in] window_size_right Sliding window size (the right half). - * \param[in] bottom_right_diagonal Whether to align sliding window and ALiBi diagonal to the bottom right corner of the softmax matrix. - * \param[in] deterministic Whether to execute with deterministic behaviours. - * \param[in] cuda_graph Whether cuda graph capture is enabled or not. - * \param[in] workspace Workspace tensor. - * \param[in] stream CUDA stream used for this operation. - */ -[[deprecated( - "nvte_fused_attn_bwd_qkvpacked() is deprecated. Please use nvte_fused_attn_bwd() with separate " - "Q, K, V tensors instead.")]] -void nvte_fused_attn_bwd_qkvpacked( - const NVTETensor QKV, const NVTETensor O, const NVTETensor dO, const NVTETensor S, - NVTETensor dP, const NVTETensorPack *Aux_CTX_Tensors, NVTETensor dQKV, NVTETensor dBias, - NVTETensor dSoftmaxOffset, const NVTETensor cu_seqlens, const NVTETensor cu_seqlens_padded, - size_t max_seqlen, float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout, - NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, - int64_t window_size_left, int64_t window_size_right, bool bottom_right_diagonal, - bool deterministic, bool cuda_graph, NVTETensor workspace, cudaStream_t stream); - -/*! \brief Compute dot product attention with packed KV input. - * - * \deprecated Please use `nvte_fused_attn_fwd` with separate Q, K, V tensors instead. - * - * Computes: - * - P = Q * Transpose(K) + Bias - * - S = ScaleMaskSoftmax(P) - * - D = Dropout(S) - * - O = D * Transpose(V) - * - * Support Matrix: - \verbatim - | backend | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | - | 0 | FP16/BF16 | BSHD_BS2HD,SBHD_SB2HD | NO/POST_SCALE_BIAS | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | <= 512, % 64 == 0 | 64 | - | 1 | FP16/BF16 | BSHD_BS2HD,BSHD_BSH2D,SBHD_SB2HD,SBHD_SBH2D | NO/POST_SCALE_BIAS/ALIBI | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | > 512, % 64 == 0 | <= 128, % 8 == 0 | - \endverbatim - * - * Notes: - * - * Tensors `cu_seqlens_q_padded` and `cu_seqlens_kv_padded` - * help identify the correct offsets of different sequences in tensors Q, K, V and O. - * When the QKV format (`nvte_get_qkv_format(qkv_layout)`) is `bshd` or `sbhd`, - * offset tensors are not used in the attention calculation and can be set to empty `NVTETensor`s. - * When the QKV format is `thd`, these tensors should follow the following rules. - * When there is no padding between sequences, the offset tensors should be equal to - * `cu_seqlens_q` and `cu_seqlens_kv` respectively. - * When there is padding between sequences, users are responsible to adjust the offsets as needed. - * For example, a tensor of 4 sequences `[a, PAD, b, b, c, PAD, PAD, d, d]` should have - * `cu_seqlens = [0, 1, 3, 4, 6]` and `cu_seqlens_padded= [0, 2, 4, 7, 9]`. - * - * \param[in] Q The Q tensor, in HD layouts. - * \param[in] KV The KV tensor, in 2HD or H2D layouts. - * \param[in] Bias The Bias tensor. - * \param[in] SoftmaxOffset The SoftmaxOffset tensor. - * \param[in,out] S The S tensor. 
- * \param[out] O The output O tensor. - * \param[out] Aux_CTX_Tensors Auxiliary output tensors when training, - * e.g. M, ZInv, rng_state. - * \param[in] cu_seqlens_q Cumulative sequence lengths for Q, [batch_size + 1]. - * \param[in] cu_seqlens_kv Cumulative sequence lengths for KV, [batch_size + 1]. - * \param[in] cu_seqlens_q_padded Cumulative sequence offsets for Q, [batch_size + 1]. - * \param[in] cu_seqlens_kv_padded Cumulative sequence offsets for KV, [batch_size + 1]. - * \param[in] page_table_k Page table for K cache, [batch_size, max_pages_per_seq_k]. - * \param[in] page_table_v Page table for V cache, [batch_size, max_pages_per_seq_v]. - * \param[in] rng_state Seed and offset of CUDA random number generator. - * \param[in] max_seqlen_q Max sequence length used for computing for Q. - * it may be >= max(seqlen_q_i) for i=0,...batch_size-1. - * \param[in] max_seqlen_kv Max sequence length used for computing for KV. - * it may be >= max(seqlen_kv_i) for i=0,...batch_size-1. - * \param[in] is_training Whether this is in training mode or inference. - * \param[in] return_max_logit Whether to produce Max and Sum_Exp, or Stats. - * \param[in] cuda_graph Whether cuda graph capture is enabled or not. - * \param[in] attn_scale Scaling factor for Q * K.T. - * \param[in] dropout Dropout probability. - * \param[in] qkv_layout QKV tensor's layout. - * \param[in] bias_type Bias type. - * \param[in] attn_mask_type Attention mask type. - * \param[in] softmax_type Attention softmax type. - * \param[in] window_size_left Sliding window size (the left half). - * \param[in] window_size_right Sliding window size (the right half). - * \param[in] bottom_right_diagonal Whether to align sliding window and ALiBi diagonal to the bottom right corner of the softmax matrix. - * \param[in] workspace Workspace tensor. - * \param[in] stream CUDA stream used for this operation. - */ -[[deprecated( - "nvte_fused_attn_fwd_kvpacked() is deprecated. Please use nvte_fused_attn_fwd() with separate " - "Q, K, V tensors instead.")]] -void nvte_fused_attn_fwd_kvpacked( - const NVTETensor Q, const NVTETensor KV, const NVTETensor Bias, const NVTETensor SoftmaxOffset, - NVTETensor S, NVTETensor O, NVTETensorPack *Aux_CTX_Tensors, const NVTETensor cu_seqlens_q, - const NVTETensor cu_seqlens_kv, const NVTETensor cu_seqlens_q_padded, - const NVTETensor cu_seqlens_kv_padded, const NVTETensor page_table_k, - const NVTETensor page_table_v, const NVTETensor rng_state, size_t max_seqlen_q, - size_t max_seqlen_kv, bool is_training, bool return_max_logit, bool cuda_graph, - float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type, - NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left, - int64_t window_size_right, bool bottom_right_diagonal, NVTETensor workspace, - cudaStream_t stream); - -/*! \brief Compute the backward of the dot product attention with packed KV input. - * - * \deprecated Please use `nvte_fused_attn_bwd` with separate Q, K, V tensors instead. 
- * - * Support Matrix: - \verbatim - | backend | precision | qkv layout | bias | mask | dropout | sequence length | head_dim | - | 0 | FP16/BF16 | BSHD_BS2HD,SBHD_SB2HD | NO/POST_SCALE_BIAS | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | <= 512, % 64 == 0 | 64 | - | 1 | FP16/BF16 | BSHD_BS2HD,BSHD_BSH2D,SBHD_SB2HD,SBHD_SBH2D | NO/POST_SCALE_BIAS/ALIBI | NO/PADDING/CAUSAL/PADDING_CAUSAL_MASK | Yes | > 512, % 64 == 0 | <= 128, % 8 == 0 | - \endverbatim - * - * Notes: - * - * Tensors `cu_seqlens_q_padded` and `cu_seqlens_kv_padded` - * help identify the correct offsets of different sequences in tensors Q, K, V and O. - * When the QKV format (`nvte_get_qkv_format(qkv_layout)`) is `bshd` or `sbhd`, - * offset tensors are not used in the attention calculation and can be set to empty `NVTETensor`s. - * When the QKV format is `thd`, these tensors should follow the following rules. - * When there is no padding between sequences, the offset tensors should be equal to - * `cu_seqlens_q` and `cu_seqlens_kv` respectively. - * When there is padding between sequences, users are responsible to adjust the offsets as needed. - * For example, a tensor of 4 sequences `[a, PAD, b, b, c, PAD, PAD, d, d]` should have - * `cu_seqlens = [0, 1, 3, 4, 6]` and `cu_seqlens_padded= [0, 2, 4, 7, 9]`. - * - * \param[in] Q The Q tensor, in HD layouts. - * \param[in] KV The KV tensor, in H2D or 2HD layouts. - * \param[in] O The O tensor from forward. - * \param[in] dO The gradient of the O tensor. - * \param[in] S The S tensor. - * \param[in,out] dP The gradient of the P tensor. - * \param[in] Aux_CTX_Tensors Auxiliary tensors from context when in training mode, - * e.g. M, ZInv, rng_state. - * \param[out] dQ The gradient of the Q tensor. - * \param[out] dKV The gradient of the KV tensor. - * \param[out] dBias The gradient of the Bias tensor. - * \param[out] dSoftmaxOffset The gradient of the SoftmaxOffset tensor. - * \param[in] cu_seqlens_q Cumulative sequence lengths for Q, [batch_size + 1]. - * \param[in] cu_seqlens_kv Cumulative sequence lengths for KV, [batch_size + 1]. - * \param[in] cu_seqlens_q_padded Cumulative sequence offsets for Q, [batch_size + 1]. - * \param[in] cu_seqlens_kv_padded Cumulative sequence offsets for KV, [batch_size + 1]. - * \param[in] max_seqlen_q Max sequence length used for computing for Q. - * it may be >= max(seqlen_q_i) for i=0,...batch_size-1. - * \param[in] max_seqlen_kv Max sequence length used for computing for KV. - * it may be >= max(seqlen_kv_i) for i=0,...batch_size-1. - * \param[in] attn_scale Scaling factor for Q * K.T. - * \param[in] dropout Dropout probability. - * \param[in] qkv_layout QKV tensor's layout. - * \param[in] bias_type Bias type. - * \param[in] attn_mask_type Attention mask type. - * \param[in] softmax_type Attention softmax type. - * \param[in] window_size_left Sliding window size (the left half). - * \param[in] window_size_right Sliding window size (the right half). - * \param[in] bottom_right_diagonal Whether to align sliding window and ALiBi diagonal to the bottom right corner of the softmax matrix. - * \param[in] deterministic Whether to execute with deterministic behaviours. - * \param[in] cuda_graph Whether cuda graph capture is enabled or not. - * \param[in] workspace Workspace tensor. - * \param[in] stream CUDA stream used for this operation. - */ -[[deprecated( - "nvte_fused_attn_bwd_kvpacked() is deprecated. 
Please use nvte_fused_attn_bwd() with separate "
-    "Q, K, V tensors instead.")]]
-void nvte_fused_attn_bwd_kvpacked(
-    const NVTETensor Q, const NVTETensor KV, const NVTETensor O, const NVTETensor dO,
-    const NVTETensor S, NVTETensor dP, const NVTETensorPack *Aux_CTX_Tensors, NVTETensor dQ,
-    NVTETensor dKV, NVTETensor dBias, NVTETensor dSoftmaxOffset, const NVTETensor cu_seqlens_q,
-    const NVTETensor cu_seqlens_kv, const NVTETensor cu_seqlens_q_padded,
-    const NVTETensor cu_seqlens_kv_padded, size_t max_seqlen_q, size_t max_seqlen_kv,
-    float attn_scale, float dropout, NVTE_QKV_Layout qkv_layout, NVTE_Bias_Type bias_type,
-    NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, int64_t window_size_left,
-    int64_t window_size_right, bool bottom_right_diagonal, bool deterministic, bool cuda_graph,
-    NVTETensor workspace, cudaStream_t stream);
-
 /*! \brief Compute dot product attention with separate Q, K and V.
  *
  * Computes:
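
Migration note: callers of the removed packed entry points can reproduce on their side the unpacking that the deleted helpers (make_tensor_view, calculate_qkv_stride, calculate_qkv_unpacked_shape) performed, and then call nvte_fused_attn_fwd / nvte_fused_attn_bwd with separate Q, K, V tensors. The sketch below is illustrative only, not part of the library: the helper names (View, qkv_stride_bytes, make_view, unpack_bs3hd) and the dtype_bits parameter are hypothetical, and the remaining arguments of nvte_fused_attn_fwd are unchanged from the non-packed API, so they are elided here.

// Illustrative caller-side sketch: split a packed QKV buffer into Q/K/V views
// before calling nvte_fused_attn_fwd. Names below are examples, not library API.
#include <cstddef>
#include <utility>
#include <vector>

struct View {
  void *dptr;                 // start of this tensor inside the packed buffer
  std::vector<size_t> shape;  // unpacked shape, e.g. {b, s, h, d} or {t, h, d}
};

// Byte stride between consecutive Q, K, V tensors inside a packed buffer:
//   3HD layouts (BS3HD, SB3HD, T3HD): tensors are h * d elements apart.
//   H3D layouts (BSH3D, SBH3D, TH3D): tensors are d elements apart.
inline size_t qkv_stride_bytes(bool is_3hd, size_t dtype_bits, size_t h, size_t d) {
  return (dtype_bits * (is_3hd ? h * d : d)) / 8;
}

inline View make_view(void *base, size_t offset_bytes, std::vector<size_t> shape) {
  return {static_cast<char *>(base) + offset_bytes, std::move(shape)};
}

// Example: a BS3HD buffer of shape {b, s, 3, h, d} becomes three {b, s, h, d} views.
inline void unpack_bs3hd(void *qkv, size_t b, size_t s, size_t h, size_t d,
                         size_t dtype_bits, View &q, View &k, View &v) {
  const size_t stride = qkv_stride_bytes(/*is_3hd=*/true, dtype_bits, h, d);
  const std::vector<size_t> shape = {b, s, h, d};
  q = make_view(qkv, 0 * stride, shape);
  k = make_view(qkv, 1 * stride, shape);
  v = make_view(qkv, 2 * stride, shape);
  // q, k, v are then wrapped as NVTETensors and passed to nvte_fused_attn_fwd
  // in place of the old packed QKV argument; gradients (dQ, dK, dV) are views
  // over the dQKV buffer constructed the same way for nvte_fused_attn_bwd.
}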