From bdb65c89d146821f065ac169d176850c6315e6fe Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Sat, 4 Apr 2026 21:00:34 +0800 Subject: [PATCH 01/13] add nvtx --- custom_ops/gpu_ops/append_attention.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_ops/gpu_ops/append_attention.cu b/custom_ops/gpu_ops/append_attention.cu index c1586945cc5..c4cad17cbf7 100644 --- a/custom_ops/gpu_ops/append_attention.cu +++ b/custom_ops/gpu_ops/append_attention.cu @@ -170,7 +170,7 @@ void AppendAttentionKernel( speculate_max_draft_token_num, causal, lambda_is_decoder, - lambda_enable_prefill, + false, lambda_stream, &fmha_out, sliding_window, From e6473f3f9cdd2afaf98208f2f5fb66bce3578c04 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Sat, 4 Apr 2026 22:07:28 +0800 Subject: [PATCH 02/13] add nvtx --- custom_ops/gpu_ops/append_attention.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/custom_ops/gpu_ops/append_attention.cu b/custom_ops/gpu_ops/append_attention.cu index c4cad17cbf7..38b77cdff69 100644 --- a/custom_ops/gpu_ops/append_attention.cu +++ b/custom_ops/gpu_ops/append_attention.cu @@ -170,7 +170,7 @@ void AppendAttentionKernel( speculate_max_draft_token_num, causal, lambda_is_decoder, - false, + lambda_enable_prefill, lambda_stream, &fmha_out, sliding_window, @@ -440,8 +440,8 @@ void AppendAttentionKernel( decoder_num_blocks_data, decoder_block_shape_q, max_kv_len_this_time, - !speculate_decoder, - !speculate_decoder, + false, + false, exec_stream); } if (max_enc_len_this_time > 0) { From 7b515c430c3342fc72ee8afad5e2bfc1ea7c35ae Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Sun, 5 Apr 2026 08:46:26 +0800 Subject: [PATCH 03/13] add nvtx --- .../append_attn/append_attention_c16_impl.cuh | 105 +++++++++--------- .../gpu_ops/append_attn/template_config.json | 2 +- 2 files changed, 52 insertions(+), 55 deletions(-) diff --git a/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh b/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh index 70329c9366a..7616f860a51 100644 --- a/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh @@ -72,60 +72,57 @@ void CascadeAppendAttentionC16Kernel( DISPATCH_CAUSAL( causal, CAUSAL, - {DISPATCH_ENABLE_PREFILL( - enable_prefill, - ENABLE_PREFILL, - {DISPATCH_GQA_GROUP_SIZE( - group_size, - GROUP_SIZE, - {DISPATCH_HEAD_DIM( - head_dim, - HEAD_DIM, - {DISPATCH_BLOCK_SIZE( - block_size, - BLOCK_SIZE, - {DISPATCH_BLOCKSHAPE_Q( - block_shape_q, BLOCK_SHAPE_Q, NUM_WARP_Q, { - MultiQueryAppendAttention( - meta_data, - qkv, - cache_k, - cache_v, - attn_mask, - shift_bias, - smooth_weight, - sinks, - seq_lens_q, - seq_lens_kv, - seq_lens_encoder, - batch_id_per_token, - cu_seqlens_q, - block_table, - batch_ids, - tile_ids_per_batch, - num_blocks, - max_seq_len, - max_dec_len, - quant_max_bound, - quant_min_bound, - in_scale, - max_partition_size, - encoder_max_partition_size, - speculate_max_draft_token_num, - is_decoder, - stream, - out, - sliding_window, - sink_size); - })})})})})}) + {DISPATCH_GQA_GROUP_SIZE( + group_size, + GROUP_SIZE, + {DISPATCH_HEAD_DIM(head_dim, + HEAD_DIM, + {DISPATCH_BLOCK_SIZE( + block_size, + BLOCK_SIZE, + {DISPATCH_BLOCKSHAPE_Q( + block_shape_q, BLOCK_SHAPE_Q, NUM_WARP_Q, { + MultiQueryAppendAttention( + meta_data, + qkv, + cache_k, + cache_v, + attn_mask, + shift_bias, + smooth_weight, + sinks, + seq_lens_q, + seq_lens_kv, + 
seq_lens_encoder, + batch_id_per_token, + cu_seqlens_q, + block_table, + batch_ids, + tile_ids_per_batch, + num_blocks, + max_seq_len, + max_dec_len, + quant_max_bound, + quant_min_bound, + in_scale, + max_partition_size, + encoder_max_partition_size, + speculate_max_draft_token_num, + is_decoder, + stream, + out, + sliding_window, + sink_size); + })})})})}) +}) } template void diff --git a/custom_ops/gpu_ops/append_attn/template_config.json b/custom_ops/gpu_ops/append_attn/template_config.json index 22eb9d18e19..34ff1b52c3d 100644 --- a/custom_ops/gpu_ops/append_attn/template_config.json +++ b/custom_ops/gpu_ops/append_attn/template_config.json @@ -94,7 +94,7 @@ "BLOCK_SIZE": [64], "CAUSAL": [0, 1], "BLOCK_SHAPE_Q": [16, 32, 64, 128], - "ENABLE_PREFILL": [0, 1] + "ENABLE_PREFILL": [0] }, "data_types": [ ["paddle::float16", "paddle::float16", "float16_float16"], From 6bd9af8d6a47b859202b8481ed1b6d6d74d4fb0f Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Sun, 5 Apr 2026 12:17:39 +0800 Subject: [PATCH 04/13] add nvtx --- custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh | 1 - custom_ops/gpu_ops/append_attn/template_config.json | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh b/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh index 7616f860a51..20053cd2f6c 100644 --- a/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh @@ -122,7 +122,6 @@ void CascadeAppendAttentionC16Kernel( sliding_window, sink_size); })})})})}) -}) } template void diff --git a/custom_ops/gpu_ops/append_attn/template_config.json b/custom_ops/gpu_ops/append_attn/template_config.json index 34ff1b52c3d..22eb9d18e19 100644 --- a/custom_ops/gpu_ops/append_attn/template_config.json +++ b/custom_ops/gpu_ops/append_attn/template_config.json @@ -94,7 +94,7 @@ "BLOCK_SIZE": [64], "CAUSAL": [0, 1], "BLOCK_SHAPE_Q": [16, 32, 64, 128], - "ENABLE_PREFILL": [0] + "ENABLE_PREFILL": [0, 1] }, "data_types": [ ["paddle::float16", "paddle::float16", "float16_float16"], From 8f7e6a589fe0a16639a7720ecb083a43ee9104eb Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Sun, 5 Apr 2026 22:19:02 +0800 Subject: [PATCH 05/13] add nvtx --- .../multiquery_attention_c16_impl.cuh | 34 ++++++------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh b/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh index 3bd148bb601..e39d4e6486b 100644 --- a/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh @@ -525,17 +525,11 @@ __global__ void multi_query_append_attention_warp1_4_kernel( if (!partition_kv || num_chunks_this_seq <= 1) { o_base_ptr_int8 = out + o_offset; } else { - if (ENABLE_PREFILL) { - o_base_ptr_T = tmp_workspace + batch_id * num_chunks * q_n_stride + - chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + - tid % 8 * num_elems_per_128b(); - } else { - o_base_ptr_T = - tmp_workspace + - batch_id * speculate_max_draft_token_num * num_chunks * q_n_stride + - chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + - tid % 8 * num_elems_per_128b(); - } + o_base_ptr_T = + tmp_workspace + + batch_id * speculate_max_draft_token_num * num_chunks * q_n_stride + + chunk_idx * q_n_stride + q_head_idx * HEAD_DIM + + tid % 8 * num_elems_per_128b(); } const int 
*mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id * 2 : nullptr; @@ -799,18 +793,12 @@ __global__ void multi_query_append_attention_warp1_4_kernel( const uint32_t qo_idx = q_start_seq_id + qo_idx_now / GROUP_SIZE; if (qo_idx - q_start_seq_id < q_len) { - uint32_t offset; - if (ENABLE_PREFILL) { - offset = (batch_id * num_chunks + chunk_idx) * q_num_heads + - qo_head_idx; - } else { - offset = ((batch_id * speculate_max_draft_token_num + - qo_idx_now / GROUP_SIZE) * - num_chunks + - chunk_idx) * - q_num_heads + - qo_head_idx; - } + const uint32_t offset = ((batch_id * speculate_max_draft_token_num + + qo_idx_now / GROUP_SIZE) * + num_chunks + + chunk_idx) * + q_num_heads + + qo_head_idx; tmp_m[offset] = m_frag[fx][j]; tmp_d[offset] = d_frag[fx][j]; } From f4728b1b60a18bc44303175efbedf921d2e8fc81 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Sun, 5 Apr 2026 22:20:46 +0800 Subject: [PATCH 06/13] add nvtx --- .../append_attn/append_attention_c16_impl.cuh | 104 +++++++++--------- 1 file changed, 54 insertions(+), 50 deletions(-) diff --git a/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh b/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh index 20053cd2f6c..70329c9366a 100644 --- a/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh @@ -72,56 +72,60 @@ void CascadeAppendAttentionC16Kernel( DISPATCH_CAUSAL( causal, CAUSAL, - {DISPATCH_GQA_GROUP_SIZE( - group_size, - GROUP_SIZE, - {DISPATCH_HEAD_DIM(head_dim, - HEAD_DIM, - {DISPATCH_BLOCK_SIZE( - block_size, - BLOCK_SIZE, - {DISPATCH_BLOCKSHAPE_Q( - block_shape_q, BLOCK_SHAPE_Q, NUM_WARP_Q, { - MultiQueryAppendAttention( - meta_data, - qkv, - cache_k, - cache_v, - attn_mask, - shift_bias, - smooth_weight, - sinks, - seq_lens_q, - seq_lens_kv, - seq_lens_encoder, - batch_id_per_token, - cu_seqlens_q, - block_table, - batch_ids, - tile_ids_per_batch, - num_blocks, - max_seq_len, - max_dec_len, - quant_max_bound, - quant_min_bound, - in_scale, - max_partition_size, - encoder_max_partition_size, - speculate_max_draft_token_num, - is_decoder, - stream, - out, - sliding_window, - sink_size); - })})})})}) + {DISPATCH_ENABLE_PREFILL( + enable_prefill, + ENABLE_PREFILL, + {DISPATCH_GQA_GROUP_SIZE( + group_size, + GROUP_SIZE, + {DISPATCH_HEAD_DIM( + head_dim, + HEAD_DIM, + {DISPATCH_BLOCK_SIZE( + block_size, + BLOCK_SIZE, + {DISPATCH_BLOCKSHAPE_Q( + block_shape_q, BLOCK_SHAPE_Q, NUM_WARP_Q, { + MultiQueryAppendAttention( + meta_data, + qkv, + cache_k, + cache_v, + attn_mask, + shift_bias, + smooth_weight, + sinks, + seq_lens_q, + seq_lens_kv, + seq_lens_encoder, + batch_id_per_token, + cu_seqlens_q, + block_table, + batch_ids, + tile_ids_per_batch, + num_blocks, + max_seq_len, + max_dec_len, + quant_max_bound, + quant_min_bound, + in_scale, + max_partition_size, + encoder_max_partition_size, + speculate_max_draft_token_num, + is_decoder, + stream, + out, + sliding_window, + sink_size); + })})})})})}) } template void From 4150355928529f1be7616462a2a6747c3672bb61 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Sun, 5 Apr 2026 23:12:29 +0800 Subject: [PATCH 07/13] add nvtx --- custom_ops/gpu_ops/append_attention.cu | 4 ++-- .../append_attn/multiquery_attention_c16_impl.cuh | 9 +++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/custom_ops/gpu_ops/append_attention.cu b/custom_ops/gpu_ops/append_attention.cu index 38b77cdff69..c1586945cc5 100644 --- 
a/custom_ops/gpu_ops/append_attention.cu +++ b/custom_ops/gpu_ops/append_attention.cu @@ -440,8 +440,8 @@ void AppendAttentionKernel( decoder_num_blocks_data, decoder_block_shape_q, max_kv_len_this_time, - false, - false, + !speculate_decoder, + !speculate_decoder, exec_stream); } if (max_enc_len_this_time > 0) { diff --git a/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh b/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh index e39d4e6486b..c18f914a25c 100644 --- a/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh @@ -430,8 +430,7 @@ template + typename OutT = T> __global__ void multi_query_append_attention_warp1_4_kernel( T *__restrict__ q, // [token_num, (num_heads + 2* kv_num_head) * head_dim] T *__restrict__ cache_k, // [max_block_num, num_heads, block_size, @@ -1111,8 +1110,7 @@ void MultiQueryAppendAttention( num_frags_x, num_frags_z, num_frags_y, - OUT_NV_TYPE, - ENABLE_PREFILL>; + OUT_NV_TYPE>; if (smem_size >= 48 * 1024) { cudaFuncSetAttribute(split_kv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, @@ -1157,8 +1155,7 @@ void MultiQueryAppendAttention( num_frags_x, num_frags_z, num_frags_y, - OUT_NV_TYPE, - ENABLE_PREFILL>; + OUT_NV_TYPE>; if (smem_size >= 48 * 1024) { cudaFuncSetAttribute(nosplit_kv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, From f922a0d8f6d8b3f7e72d47be543352e9688639f9 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Mon, 6 Apr 2026 09:16:20 +0800 Subject: [PATCH 08/13] add nvtx --- .../model_executor/layers/attention/append_attn_backend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py index 81eab7cce86..c73283b48de 100644 --- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py @@ -146,6 +146,8 @@ def __init__( self.causal: bool = getattr(fd_config.model_config, "causal", True) self.speculative_method = fd_config.speculative_config.method self.speculate_max_draft_token_num: int = fd_config.speculative_config.num_speculative_tokens + if self.speculative_method is None: + self.speculate_max_draft_token_num = 0 self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp" self.num_layers_draft_model: int = int(fd_config.speculative_config.method == SpecMethod.MTP) From 67f5031cbcbf9e664fbdd3e69fc0f9f74afb36a2 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Mon, 6 Apr 2026 12:23:56 +0800 Subject: [PATCH 09/13] add nvtx --- .../model_executor/layers/attention/flash_attn_backend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py index b51dce1449d..3ac0bfafbb6 100644 --- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py @@ -258,6 +258,8 @@ def __init__( self.speculative_method = fd_config.speculative_config.method self.use_speculate = self.speculative_method is not None self.speculate_max_draft_token_num = fd_config.speculative_config.num_speculative_tokens + if not self.use_speculate: + self.speculate_max_draft_token_num = 0 self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp" 
self.num_layers_draft_model: int = int(fd_config.speculative_config.method == SpecMethod.MTP) From bdab175b7589bed3d6f9cb7239658db8e912a0cd Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Mon, 6 Apr 2026 21:18:33 +0800 Subject: [PATCH 10/13] add nvtx --- custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh b/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh index c18f914a25c..58c5d6c442e 100644 --- a/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh @@ -1219,6 +1219,7 @@ void MultiQueryAppendAttention( static_cast(bsz * num_chunks * num_heads)); } else { if (ENABLE_PREFILL) { + exit(0); tmp_workspace = allocator->Allocate(phi::SizeOf(qkv.dtype()) * static_cast(token_num * num_chunks * From 844c20fc7358db892a30890ffe7ded1ad5a38b7b Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Mon, 6 Apr 2026 22:06:58 +0800 Subject: [PATCH 11/13] add nvtx --- .../multiquery_attention_c16_impl.cuh | 38 ++++++------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh b/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh index 58c5d6c442e..dc9d3d30533 100644 --- a/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh @@ -1218,32 +1218,18 @@ void MultiQueryAppendAttention( phi::SizeOf(paddle::DataType::FLOAT32) * static_cast(bsz * num_chunks * num_heads)); } else { - if (ENABLE_PREFILL) { - exit(0); - tmp_workspace = - allocator->Allocate(phi::SizeOf(qkv.dtype()) * - static_cast(token_num * num_chunks * - num_heads * HEAD_DIM)); - tmp_m = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(token_num * num_chunks * num_heads)); - tmp_d = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(token_num * num_chunks * num_heads)); - } else { - tmp_workspace = allocator->Allocate( - phi::SizeOf(qkv.dtype()) * - static_cast(speculate_max_draft_token_num * bsz * - num_chunks * num_heads * HEAD_DIM)); - tmp_m = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(speculate_max_draft_token_num * bsz * - num_chunks * num_heads)); - tmp_d = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(speculate_max_draft_token_num * bsz * - num_chunks * num_heads)); - } + tmp_workspace = allocator->Allocate( + phi::SizeOf(qkv.dtype()) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads * HEAD_DIM)); + tmp_m = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads)); + tmp_d = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(speculate_max_draft_token_num * bsz * + num_chunks * num_heads)); } launchWithPdlWhenEnabled( split_kv_kernel, From 3e3e619149f830e1a6b7dcc2b3c340c3774cb181 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Mon, 6 Apr 2026 22:10:51 +0800 Subject: [PATCH 12/13] add nvtx --- .../multiquery_attention_c16_impl.cuh | 36 +++++++------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh 
b/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh index dc9d3d30533..e7463154c43 100644 --- a/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/multiquery_attention_c16_impl.cuh @@ -1207,30 +1207,18 @@ void MultiQueryAppendAttention( sink_size); } else { phi::Allocator::AllocationPtr tmp_workspace, tmp_m, tmp_d; - if (is_decoder) { - tmp_workspace = allocator->Allocate( - phi::SizeOf(qkv.dtype()) * - static_cast(bsz * num_chunks * num_heads * HEAD_DIM)); - tmp_m = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(bsz * num_chunks * num_heads)); - tmp_d = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(bsz * num_chunks * num_heads)); - } else { - tmp_workspace = allocator->Allocate( - phi::SizeOf(qkv.dtype()) * - static_cast(speculate_max_draft_token_num * bsz * - num_chunks * num_heads * HEAD_DIM)); - tmp_m = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(speculate_max_draft_token_num * bsz * - num_chunks * num_heads)); - tmp_d = allocator->Allocate( - phi::SizeOf(paddle::DataType::FLOAT32) * - static_cast(speculate_max_draft_token_num * bsz * - num_chunks * num_heads)); - } + tmp_workspace = allocator->Allocate( + phi::SizeOf(qkv.dtype()) * + static_cast(speculate_max_draft_token_num * bsz * num_chunks * + num_heads * HEAD_DIM)); + tmp_m = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(speculate_max_draft_token_num * bsz * num_chunks * + num_heads)); + tmp_d = allocator->Allocate( + phi::SizeOf(paddle::DataType::FLOAT32) * + static_cast(speculate_max_draft_token_num * bsz * num_chunks * + num_heads)); launchWithPdlWhenEnabled( split_kv_kernel, grids, From e2470346efd3db96ae73452db0b5e64e12fbc330 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Mon, 6 Apr 2026 22:15:46 +0800 Subject: [PATCH 13/13] add nvtx --- .../model_executor/layers/attention/flash_mask_attn_backend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py index 35d27504ab5..6e05ca0c3b8 100644 --- a/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py @@ -109,6 +109,8 @@ def __init__( self.speculative_method = fd_config.speculative_config.method self.use_speculate = self.speculative_method is not None self.speculate_max_draft_token_num = fd_config.speculative_config.num_speculative_tokens + if not self.use_speculate: + self.speculate_max_draft_token_num = 0 self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp" self.num_layers_draft_model: int = int(fd_config.speculative_config.method == SpecMethod.MTP)
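
Series note: the net effect of these patches is to unify the split-KV workspace sizing in MultiQueryAppendAttention. Patches 01-04 and 06-07 toggle the ENABLE_PREFILL dispatch back and forth and largely revert one another; the changes that survive are: patches 05 and 07 drop the ENABLE_PREFILL template parameter and its branches from multi_query_append_attention_warp1_4_kernel, patches 11 and 12 collapse the tmp_workspace / tmp_m / tmp_d allocation to the single speculative-decode formula, and patches 08, 09 and 13 compensate on the Python side by zeroing speculate_max_draft_token_num in the append_attn, flash_attn and flash_mask_attn backends whenever speculative decoding is disabled. Below is a minimal C++ sketch of the before/after sizing, not the kernel's actual API: Buffer and alloc are hypothetical stand-ins for phi::Allocator::AllocationPtr and allocator->Allocate, and elem_size stands in for phi::SizeOf(qkv.dtype()).

#include <cstddef>
#include <vector>

// Hypothetical stand-ins for the phi:: allocator machinery (illustration only).
using Buffer = std::vector<char>;
static Buffer alloc(std::size_t bytes) { return Buffer(bytes); }

// Before patches 11/12: three formulas selected by runtime/template flags.
// (The real code sizes tmp_m and tmp_d the same way, minus the head_dim
// factor, with FLOAT32 elements.)
Buffer workspace_before(bool is_decoder, bool enable_prefill,
                        std::size_t elem_size, std::size_t bsz,
                        std::size_t token_num, std::size_t draft_tokens,
                        std::size_t num_chunks, std::size_t num_heads,
                        std::size_t head_dim) {
  if (is_decoder)  // plain decode: one query token per batch
    return alloc(elem_size * bsz * num_chunks * num_heads * head_dim);
  if (enable_prefill)  // prefill: every token in the batch
    return alloc(elem_size * token_num * num_chunks * num_heads * head_dim);
  // speculative decode: up to draft_tokens query tokens per batch
  return alloc(elem_size * draft_tokens * bsz * num_chunks * num_heads *
               head_dim);
}

// After: one formula. The Python backends now own the non-speculative case
// by setting draft_tokens (speculate_max_draft_token_num) to 0 when
// speculative decoding is off (patches 08, 09, 13).
Buffer workspace_after(std::size_t elem_size, std::size_t bsz,
                       std::size_t draft_tokens, std::size_t num_chunks,
                       std::size_t num_heads, std::size_t head_dim) {
  return alloc(elem_size * draft_tokens * bsz * num_chunks * num_heads *
               head_dim);
}

Note that with draft_tokens zeroed the unified formula allocates nothing; whether that path is still reachable for plain decode depends on the !speculate_decoder flags restored in patch 07 and on logic outside these hunks, so treat the sketch as a shape summary under those assumptions rather than a verified invariant.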