From 8021e7d799b818e01be8a689f2e9641d7b310bcf Mon Sep 17 00:00:00 2001 From: fxyfxy777 Date: Tue, 7 Apr 2026 13:05:30 +0800 Subject: [PATCH 1/8] tp ok --- .../layers/moe/fused_moe_cutlass_backend.py | 70 ++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index 0c86270c630..581bd06b61b 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -28,7 +28,11 @@ from .fused_moe_backend_base import UnquantizedFusedMoEMethod if current_platform.is_cuda(): - from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch, moe_expert_reduce + from fastdeploy.model_executor.ops.gpu import ( + count_tokens_per_expert_func, + moe_expert_dispatch, + moe_expert_reduce, + ) try: from fastdeploy.model_executor.ops.gpu import ( @@ -286,6 +290,70 @@ def apply_tp( layer.gate_correction_bias, getattr(layer, "renormalize", True), ) + if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE and self.moe_quant_type == "w16a16": + # moe_permute path: CUDA-graph safe, no D2H copies + print("use moe_permute in tp") + topk_idx_i32 = topk_idx.astype(paddle.int32) + override_buffer_size = x.shape[0] * layer.top_k + layer.num_experts * (128 - 1) + ( + permute_input, + permute_indices_per_token, # zipped_expertwise_rowmap + dst_weights, + _scale_out, + _m_indices, + ) = paddle.nn.functional.moe_permute( + hidden_states=x, + scale=None, + expert_routemap_topk=topk_idx_i32, + expert_prob_topk=topk_weights, + num_experts=layer.num_experts, + tokens_per_expert=[], + padding_alignment=128, + return_expert_indices=True, + override_buffer_size=override_buffer_size, + ) + + # Compute token_nums_per_expert (prefix sum) on GPU. + # Use PADDED counts (row 1) because moe_permute with padding_alignment=128 + # lays out each expert's tokens in 128-aligned blocks. Using actual counts + # (row 0) would cause moe_expert_ffn to read wrong positions. + # Use matmul with a cached lower-triangular matrix instead of + # paddle.cumsum, because CUB inclusive_scan allocates temp memory + # which is forbidden during CUDA graph capture. + padded_counts = count_tokens_per_expert_func(topk_idx, layer.num_experts)[ + 1 + ] # [num_experts], int32, 128-aligned + if not hasattr(self, "_cumsum_tril") or self._cumsum_tril.shape[0] != layer.num_experts: + self._cumsum_tril = paddle.tril( + paddle.ones([layer.num_experts, layer.num_experts], dtype="float32") + ) + token_nums_per_expert = paddle.mv(self._cumsum_tril, padded_counts.cast("float32")).cast(paddle.int64) + + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_idx) + + ffn_out = self.compute_ffn( + layer, + permute_input, + token_nums_per_expert, + None, # expert_idx_per_token not needed for w16a16 without bias + False, + -1, + None, # dequant_scale + None, # max_tokens_per_expert + ) + + fused_moe_out, _out_probs = paddle.nn.functional.moe_unpermute( + hidden_states_unzipped=ffn_out, + zipped_expertwise_rowmap=permute_indices_per_token, + expert_routemap_topk=topk_idx_i32, + token_prob_unzipped=dst_weights, + total_zipped_tokens=x.shape[0], + num_experts=layer.num_experts, + using_weighted_combine=True, + ) + return fused_moe_out + ( permute_input, token_nums_per_expert, From 2f9f76da49a42724a26e96e8d7a7d9e1aa8eab54 Mon Sep 17 00:00:00 2001 From: fxyfxy777 Date: Tue, 7 Apr 2026 17:27:49 +0800 Subject: [PATCH 2/8] opt tp --- custom_ops/gpu_ops/moe/deepgemm_preprocess.cu | 20 +- .../layers/moe/fused_moe_cutlass_backend.py | 203 +++++++++++------- 2 files changed, 137 insertions(+), 86 deletions(-) diff --git a/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu b/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu index 6eda3598cd3..08e337f816f 100644 --- a/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu +++ b/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu @@ -19,6 +19,7 @@ template __global__ void cuda_kernel(const scalar_t *__restrict__ topk_ids, int32_t *__restrict__ res, int32_t *__restrict__ res_padded, + int32_t *__restrict__ res_padded_cumsum, size_t numel, int num_experts) { extern __shared__ int32_t tokens_per_ep[]; @@ -34,10 +35,16 @@ __global__ void cuda_kernel(const scalar_t *__restrict__ topk_ids, } __syncthreads(); - - for (size_t i = threadIdx.x; i < num_experts; i += blockDim.x) { - res[i] = tokens_per_ep[i]; - res_padded[i] = (res[i] + 127) / 128 * 128; + if (threadIdx.x == 0) { + int32_t running_sum = 0; + for (int i = 0; i < num_experts; i++) { + int32_t count = tokens_per_ep[i]; + int32_t padded = (count + 127) / 128 * 128; + res[i] = count; + res_padded[i] = padded; + running_sum += padded; + res_padded_cumsum[i] = running_sum; + } } } @@ -46,7 +53,7 @@ std::vector count_tokens_per_expert_func( int topk_ids_numel = topk_ids.shape()[0] * topk_ids.shape()[1]; auto token_nums_per_expert = paddle::empty( - {2, num_experts}, paddle::DataType::INT32, topk_ids.place()); + {3, num_experts}, paddle::DataType::INT32, topk_ids.place()); auto stream = topk_ids.stream(); using scalar_t = int64_t; @@ -56,6 +63,7 @@ std::vector count_tokens_per_expert_func( topk_ids.data(), token_nums_per_expert.data(), token_nums_per_expert.data() + num_experts, + token_nums_per_expert.data() + 2 * num_experts, topk_ids_numel, num_experts); @@ -70,7 +78,7 @@ std::vector count_tokens_per_expert_func_infer_dtype( std::vector> count_tokens_per_expert_func_infer_shape( const std::vector &topk_ids_shape, int64_t num_experts) { - return {{2, num_experts}}; + return {{3, num_experts}}; } PD_BUILD_STATIC_OP(count_tokens_per_expert_func) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index 581bd06b61b..98ca385965a 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -17,6 +17,7 @@ from typing import Callable import paddle +from fxy_debug import dump_tensor from paddle import nn from paddle.nn.quant import weight_quantize from paddleformers.utils.log import logger @@ -150,54 +151,92 @@ def apply_ep_prefill( # 3. Compute ffn if token_all_num > 0: logger.debug(f"token_all_num {token_all_num}") - ( - permute_input, - permute_indices_per_token, - recv_num_tokens_per_expert_list_cumsum, - dst_weights, - dst_indices, - cumsum_idx_gpu, - expert_idx_per_token, - dequant_scale, - ) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch( - recv_x, - recv_topk_idx, - recv_topk_weights, - (layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None), - recv_num_tokens_per_expert_list, - token_all_num, - self.moe_quant_type, - ) - if not layer.with_bias and self.moe_quant_type != "w4a8" and self.moe_quant_type != "w4afp8": - # only w4a8 and w4afp8 need expert_idx_per_token - # Other need not this tensor, so we make it None. - expert_idx_per_token = None + + if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE and self.moe_quant_type == "w16a16": + # --- moe_permute / moe_unpermute path --- + recv_topk_idx_i32 = recv_topk_idx.astype(paddle.int32) + (permute_input, permute_indices_per_token, dst_weights, _scale_out) = paddle.nn.functional.moe_permute( + hidden_states=recv_x, + scale=None, + expert_routemap_topk=recv_topk_idx_i32, + expert_prob_topk=recv_topk_weights, + num_experts=layer.num_local_experts, + tokens_per_expert=[], + padding_alignment=128, + override_buffer_size=token_all_num, + ) + + token_nums_per_expert_cumsum = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)[ + 2 + ].cast(paddle.int64) + + ffn_out = self.compute_ffn( + layer, + permute_input, + token_nums_per_expert_cumsum, + None, + False, + -1, + None, + None, + ) + + tmp_ffn_out, _out_probs = paddle.nn.functional.moe_unpermute( + hidden_states_unzipped=ffn_out, + zipped_expertwise_rowmap=permute_indices_per_token, + expert_routemap_topk=recv_topk_idx_i32, + token_prob_unzipped=dst_weights, + total_zipped_tokens=recv_x.shape[0], + num_experts=layer.num_local_experts, + using_weighted_combine=True, + ) else: - expert_idx_per_token = expert_idx_per_token.cast("int64") + # --- original ep_moe_expert_dispatch / combine path --- + ( + permute_input, + permute_indices_per_token, + recv_num_tokens_per_expert_list_cumsum, + dst_weights, + dst_indices, + cumsum_idx_gpu, + expert_idx_per_token, + dequant_scale, + ) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch( + recv_x, + recv_topk_idx, + recv_topk_weights, + (layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None), + recv_num_tokens_per_expert_list, + token_all_num, + self.moe_quant_type, + ) + if not layer.with_bias and self.moe_quant_type != "w4a8" and self.moe_quant_type != "w4afp8": + expert_idx_per_token = None + else: + expert_idx_per_token = expert_idx_per_token.cast("int64") - if hasattr(layer, "up_gate_proj_in_scale"): - dequant_scale = None + if hasattr(layer, "up_gate_proj_in_scale"): + dequant_scale = None - ffn_out = self.compute_ffn( - layer, - permute_input, - recv_num_tokens_per_expert_list_cumsum, - expert_idx_per_token, - False, - -1, - dequant_scale, - ) + ffn_out = self.compute_ffn( + layer, + permute_input, + recv_num_tokens_per_expert_list_cumsum, + expert_idx_per_token, + False, + -1, + dequant_scale, + ) - # prmt back per rank - tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine( - ffn_out, - dst_weights, - permute_indices_per_token, - dst_indices, - None, # down_proj_bias, - False, # norm_topk_prob - 1.0, - ) + tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine( + ffn_out, + dst_weights, + permute_indices_per_token, + dst_indices, + None, # down_proj_bias, + False, # norm_topk_prob + 1.0, + ) else: tmp_ffn_out = recv_x @@ -291,51 +330,43 @@ def apply_tp( getattr(layer, "renormalize", True), ) if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE and self.moe_quant_type == "w16a16": - # moe_permute path: CUDA-graph safe, no D2H copies - print("use moe_permute in tp") topk_idx_i32 = topk_idx.astype(paddle.int32) override_buffer_size = x.shape[0] * layer.top_k + layer.num_experts * (128 - 1) - ( + (permute_input, permute_indices_per_token, dst_weights, _scale_out) = ( # zipped_expertwise_rowmap + paddle.nn.functional.moe_permute( + hidden_states=x, + scale=None, + expert_routemap_topk=topk_idx_i32, + expert_prob_topk=topk_weights, + num_experts=layer.num_experts, + tokens_per_expert=[], + padding_alignment=128, + override_buffer_size=override_buffer_size, + ) + ) + + # Row 2 of count_tokens_per_expert_func is the prefix sum token_nums_per_expert. + token_nums_per_expert_cumsum = count_tokens_per_expert_func(topk_idx, layer.num_experts)[2].cast( + paddle.int64 + ) + dump_tensor( + x, + topk_idx_i32, + topk_weights, + layer.num_experts, + override_buffer_size, permute_input, - permute_indices_per_token, # zipped_expertwise_rowmap + permute_indices_per_token, dst_weights, - _scale_out, - _m_indices, - ) = paddle.nn.functional.moe_permute( - hidden_states=x, - scale=None, - expert_routemap_topk=topk_idx_i32, - expert_prob_topk=topk_weights, - num_experts=layer.num_experts, - tokens_per_expert=[], - padding_alignment=128, - return_expert_indices=True, - override_buffer_size=override_buffer_size, + token_nums_per_expert_cumsum, ) - - # Compute token_nums_per_expert (prefix sum) on GPU. - # Use PADDED counts (row 1) because moe_permute with padding_alignment=128 - # lays out each expert's tokens in 128-aligned blocks. Using actual counts - # (row 0) would cause moe_expert_ffn to read wrong positions. - # Use matmul with a cached lower-triangular matrix instead of - # paddle.cumsum, because CUB inclusive_scan allocates temp memory - # which is forbidden during CUDA graph capture. - padded_counts = count_tokens_per_expert_func(topk_idx, layer.num_experts)[ - 1 - ] # [num_experts], int32, 128-aligned - if not hasattr(self, "_cumsum_tril") or self._cumsum_tril.shape[0] != layer.num_experts: - self._cumsum_tril = paddle.tril( - paddle.ones([layer.num_experts, layer.num_experts], dtype="float32") - ) - token_nums_per_expert = paddle.mv(self._cumsum_tril, padded_counts.cast("float32")).cast(paddle.int64) - if topk_ids_hookfunc is not None: topk_ids_hookfunc(topk_ids=topk_idx) ffn_out = self.compute_ffn( layer, permute_input, - token_nums_per_expert, + token_nums_per_expert_cumsum, None, # expert_idx_per_token not needed for w16a16 without bias False, -1, @@ -352,6 +383,7 @@ def apply_tp( num_experts=layer.num_experts, using_weighted_combine=True, ) + dump_tensor(ffn_out, permute_indices_per_token, topk_idx_i32, fused_moe_out, dst_weights) return fused_moe_out ( @@ -375,6 +407,17 @@ def apply_tp( self.moe_quant_type, topk_only_mode=True, ) + dump_tensor( + x, + gate_out, + token_nums_per_expert, + permute_input, + topk_weights, + topk_idx, + expert_idx_per_token, + dequant_scale, + max_tokens_per_expert, + ) else: ( permute_input, @@ -408,7 +451,7 @@ def apply_tp( expert_idx_per_token = None else: expert_idx_per_token = expert_idx_per_token.cast("int64") - + dump_tensor(permute_input, token_nums_per_expert, expert_idx_per_token, dequant_scale, max_tokens_per_expert) ffn_out = self.compute_ffn( layer, permute_input, @@ -430,7 +473,7 @@ def apply_tp( norm_topk_prob=False if layer.topk_method == "noaux_tc" else True, routed_scaling_factor=1.0, ) - + dump_tensor(ffn_out, topk_weights, permute_indices_per_token, topk_idx, fused_moe_out) return fused_moe_out From 9efe026dcb8b3d5b66eb19ba5ef5f6e3157b8c8c Mon Sep 17 00:00:00 2001 From: fxyfxy777 Date: Wed, 8 Apr 2026 11:31:31 +0800 Subject: [PATCH 3/8] ep ok --- .../layers/moe/fused_moe_cutlass_backend.py | 30 ++----------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index 98ca385965a..a0b424f6276 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -17,7 +17,6 @@ from typing import Callable import paddle -from fxy_debug import dump_tensor from paddle import nn from paddle.nn.quant import weight_quantize from paddleformers.utils.log import logger @@ -131,6 +130,7 @@ def apply_ep_prefill( # 1. Select topk experts and weights topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out) # 2. EP Dispatch + dispatch_kwargs = {"expert_alignment": 128} if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE else {} ( recv_x, recv_topk_idx, @@ -138,7 +138,7 @@ def apply_ep_prefill( recv_num_tokens_per_expert_list, handle, event, - ) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights) + ) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights, **dispatch_kwargs) if topk_ids_hookfunc is not None: topk_ids_hookfunc(topk_ids=topk_idx) @@ -169,7 +169,6 @@ def apply_ep_prefill( token_nums_per_expert_cumsum = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)[ 2 ].cast(paddle.int64) - ffn_out = self.compute_ffn( layer, permute_input, @@ -349,17 +348,6 @@ def apply_tp( token_nums_per_expert_cumsum = count_tokens_per_expert_func(topk_idx, layer.num_experts)[2].cast( paddle.int64 ) - dump_tensor( - x, - topk_idx_i32, - topk_weights, - layer.num_experts, - override_buffer_size, - permute_input, - permute_indices_per_token, - dst_weights, - token_nums_per_expert_cumsum, - ) if topk_ids_hookfunc is not None: topk_ids_hookfunc(topk_ids=topk_idx) @@ -383,7 +371,6 @@ def apply_tp( num_experts=layer.num_experts, using_weighted_combine=True, ) - dump_tensor(ffn_out, permute_indices_per_token, topk_idx_i32, fused_moe_out, dst_weights) return fused_moe_out ( @@ -407,17 +394,6 @@ def apply_tp( self.moe_quant_type, topk_only_mode=True, ) - dump_tensor( - x, - gate_out, - token_nums_per_expert, - permute_input, - topk_weights, - topk_idx, - expert_idx_per_token, - dequant_scale, - max_tokens_per_expert, - ) else: ( permute_input, @@ -451,7 +427,6 @@ def apply_tp( expert_idx_per_token = None else: expert_idx_per_token = expert_idx_per_token.cast("int64") - dump_tensor(permute_input, token_nums_per_expert, expert_idx_per_token, dequant_scale, max_tokens_per_expert) ffn_out = self.compute_ffn( layer, permute_input, @@ -473,7 +448,6 @@ def apply_tp( norm_topk_prob=False if layer.topk_method == "noaux_tc" else True, routed_scaling_factor=1.0, ) - dump_tensor(ffn_out, topk_weights, permute_indices_per_token, topk_idx, fused_moe_out) return fused_moe_out From 4db4771b94e6e7cf5d8288e89cde0eb5d313f27c Mon Sep 17 00:00:00 2001 From: fxyfxy777 Date: Wed, 8 Apr 2026 15:01:00 +0800 Subject: [PATCH 4/8] fix cuda_kernel --- custom_ops/gpu_ops/cpp_extensions.cc | 3 +- custom_ops/gpu_ops/moe/deepgemm_preprocess.cu | 71 ++++++++++++------- .../layers/moe/fused_moe_cutlass_backend.py | 4 +- 3 files changed, 50 insertions(+), 28 deletions(-) diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 40898434bf1..df660645c55 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -537,7 +537,8 @@ std::vector TextImageGatherScatter( const bool is_scatter); std::vector count_tokens_per_expert_func( - const paddle::Tensor& topk_ids, int64_t num_experts); + const paddle::Tensor& topk_ids, int64_t num_experts, + bool compute_padded_cumsum = false); void GetPositionIdsAndMaskEncoderBatch( const paddle::Tensor& seq_lens_encoder, const paddle::Tensor& seq_lens_decoder, diff --git a/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu b/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu index 08e337f816f..a33c638d378 100644 --- a/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu +++ b/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu @@ -15,7 +15,7 @@ #include "helper.h" #include "paddle/extension.h" -template +template __global__ void cuda_kernel(const scalar_t *__restrict__ topk_ids, int32_t *__restrict__ res, int32_t *__restrict__ res_padded, @@ -35,56 +35,77 @@ __global__ void cuda_kernel(const scalar_t *__restrict__ topk_ids, } __syncthreads(); - if (threadIdx.x == 0) { - int32_t running_sum = 0; - for (int i = 0; i < num_experts; i++) { - int32_t count = tokens_per_ep[i]; - int32_t padded = (count + 127) / 128 * 128; - res[i] = count; - res_padded[i] = padded; - running_sum += padded; - res_padded_cumsum[i] = running_sum; + + if constexpr (kComputeCumsum) { + if (threadIdx.x == 0) { + int32_t running_sum = 0; + for (int i = 0; i < num_experts; i++) { + int32_t count = tokens_per_ep[i]; + int32_t padded = (count + 127) / 128 * 128; + res[i] = count; + res_padded[i] = padded; + running_sum += padded; + res_padded_cumsum[i] = running_sum; + } + } + } else { + for (size_t i = threadIdx.x; i < num_experts; i += blockDim.x) { + res[i] = tokens_per_ep[i]; + res_padded[i] = (tokens_per_ep[i] + 127) / 128 * 128; } } } std::vector count_tokens_per_expert_func( - const paddle::Tensor &topk_ids, int64_t num_experts) { + const paddle::Tensor &topk_ids, int64_t num_experts, + bool compute_padded_cumsum) { int topk_ids_numel = topk_ids.shape()[0] * topk_ids.shape()[1]; + int64_t num_rows = compute_padded_cumsum ? 3 : 2; auto token_nums_per_expert = paddle::empty( - {3, num_experts}, paddle::DataType::INT32, topk_ids.place()); + {num_rows, num_experts}, paddle::DataType::INT32, topk_ids.place()); auto stream = topk_ids.stream(); using scalar_t = int64_t; - // CUDA_CHECK(cudaGetLastError()); - cuda_kernel<<<1, 1024, num_experts * sizeof(int32_t), stream>>>( - topk_ids.data(), - token_nums_per_expert.data(), - token_nums_per_expert.data() + num_experts, - token_nums_per_expert.data() + 2 * num_experts, - topk_ids_numel, - num_experts); + if (compute_padded_cumsum) { + cuda_kernel<<<1, 1024, num_experts * sizeof(int32_t), stream>>>( + topk_ids.data(), + token_nums_per_expert.data(), + token_nums_per_expert.data() + num_experts, + token_nums_per_expert.data() + 2 * num_experts, + topk_ids_numel, + num_experts); + } else { + cuda_kernel<<<1, 1024, num_experts * sizeof(int32_t), stream>>>( + topk_ids.data(), + token_nums_per_expert.data(), + token_nums_per_expert.data() + num_experts, + nullptr, + topk_ids_numel, + num_experts); + } - // CUDA_CHECK(cudaGetLastError()); return {token_nums_per_expert}; } std::vector count_tokens_per_expert_func_infer_dtype( - const paddle::DataType &topk_ids_dtype, int64_t num_experts) { + const paddle::DataType &topk_ids_dtype, int64_t num_experts, + bool compute_padded_cumsum) { return {paddle::DataType::INT32}; } std::vector> count_tokens_per_expert_func_infer_shape( - const std::vector &topk_ids_shape, int64_t num_experts) { - return {{3, num_experts}}; + const std::vector &topk_ids_shape, int64_t num_experts, + bool compute_padded_cumsum) { + int64_t num_rows = compute_padded_cumsum ? 3 : 2; + return {{num_rows, num_experts}}; } PD_BUILD_STATIC_OP(count_tokens_per_expert_func) .Inputs({"topk_ids"}) .Outputs({"token_nums_per_expert"}) - .Attrs({"num_experts:int64_t"}) + .Attrs({"num_experts:int64_t", "compute_padded_cumsum:bool"}) .SetKernelFn(PD_KERNEL(count_tokens_per_expert_func)) .SetInferShapeFn(PD_INFER_SHAPE(count_tokens_per_expert_func_infer_shape)) .SetInferDtypeFn(PD_INFER_DTYPE(count_tokens_per_expert_func_infer_dtype)); diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index a0b424f6276..08fe2bfe781 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -166,7 +166,7 @@ def apply_ep_prefill( override_buffer_size=token_all_num, ) - token_nums_per_expert_cumsum = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)[ + token_nums_per_expert_cumsum = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts, True)[ 2 ].cast(paddle.int64) ffn_out = self.compute_ffn( @@ -345,7 +345,7 @@ def apply_tp( ) # Row 2 of count_tokens_per_expert_func is the prefix sum token_nums_per_expert. - token_nums_per_expert_cumsum = count_tokens_per_expert_func(topk_idx, layer.num_experts)[2].cast( + token_nums_per_expert_cumsum = count_tokens_per_expert_func(topk_idx, layer.num_experts, True)[2].cast( paddle.int64 ) if topk_ids_hookfunc is not None: From ef57f4eda1f5b82f5c6bd302ca3657ceca16dcd8 Mon Sep 17 00:00:00 2001 From: fxyfxy777 Date: Wed, 8 Apr 2026 15:05:43 +0800 Subject: [PATCH 5/8] fix cuda_kernel --- custom_ops/gpu_ops/cpp_extensions.cc | 3 +- custom_ops/gpu_ops/moe/deepgemm_preprocess.cu | 39 +++++++++++-------- .../layers/moe/fused_moe_cutlass_backend.py | 6 +-- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index df660645c55..8e9cf6a3ddc 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -537,7 +537,8 @@ std::vector TextImageGatherScatter( const bool is_scatter); std::vector count_tokens_per_expert_func( - const paddle::Tensor& topk_ids, int64_t num_experts, + const paddle::Tensor& topk_ids, + int64_t num_experts, bool compute_padded_cumsum = false); void GetPositionIdsAndMaskEncoderBatch( const paddle::Tensor& seq_lens_encoder, diff --git a/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu b/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu index a33c638d378..4316aa5cbda 100644 --- a/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu +++ b/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu @@ -57,7 +57,8 @@ __global__ void cuda_kernel(const scalar_t *__restrict__ topk_ids, } std::vector count_tokens_per_expert_func( - const paddle::Tensor &topk_ids, int64_t num_experts, + const paddle::Tensor &topk_ids, + int64_t num_experts, bool compute_padded_cumsum) { int topk_ids_numel = topk_ids.shape()[0] * topk_ids.shape()[1]; @@ -69,34 +70,38 @@ std::vector count_tokens_per_expert_func( using scalar_t = int64_t; if (compute_padded_cumsum) { - cuda_kernel<<<1, 1024, num_experts * sizeof(int32_t), stream>>>( - topk_ids.data(), - token_nums_per_expert.data(), - token_nums_per_expert.data() + num_experts, - token_nums_per_expert.data() + 2 * num_experts, - topk_ids_numel, - num_experts); + cuda_kernel + <<<1, 1024, num_experts * sizeof(int32_t), stream>>>( + topk_ids.data(), + token_nums_per_expert.data(), + token_nums_per_expert.data() + num_experts, + token_nums_per_expert.data() + 2 * num_experts, + topk_ids_numel, + num_experts); } else { - cuda_kernel<<<1, 1024, num_experts * sizeof(int32_t), stream>>>( - topk_ids.data(), - token_nums_per_expert.data(), - token_nums_per_expert.data() + num_experts, - nullptr, - topk_ids_numel, - num_experts); + cuda_kernel + <<<1, 1024, num_experts * sizeof(int32_t), stream>>>( + topk_ids.data(), + token_nums_per_expert.data(), + token_nums_per_expert.data() + num_experts, + nullptr, + topk_ids_numel, + num_experts); } return {token_nums_per_expert}; } std::vector count_tokens_per_expert_func_infer_dtype( - const paddle::DataType &topk_ids_dtype, int64_t num_experts, + const paddle::DataType &topk_ids_dtype, + int64_t num_experts, bool compute_padded_cumsum) { return {paddle::DataType::INT32}; } std::vector> count_tokens_per_expert_func_infer_shape( - const std::vector &topk_ids_shape, int64_t num_experts, + const std::vector &topk_ids_shape, + int64_t num_experts, bool compute_padded_cumsum) { int64_t num_rows = compute_padded_cumsum ? 3 : 2; return {{num_rows, num_experts}}; diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index 08fe2bfe781..5b00cd46e25 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -166,9 +166,9 @@ def apply_ep_prefill( override_buffer_size=token_all_num, ) - token_nums_per_expert_cumsum = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts, True)[ - 2 - ].cast(paddle.int64) + token_nums_per_expert_cumsum = count_tokens_per_expert_func( + recv_topk_idx, layer.num_local_experts, True + )[2].cast(paddle.int64) ffn_out = self.compute_ffn( layer, permute_input, From 442b5764cfe458a336b476f60a8e68bbbc3449c7 Mon Sep 17 00:00:00 2001 From: fxyfxy777 Date: Wed, 8 Apr 2026 17:41:53 +0800 Subject: [PATCH 6/8] Complete the replacement of moe_permute in non-noaux_tc mode --- .../layers/moe/fused_moe_cutlass_backend.py | 107 +++++++++++------- 1 file changed, 63 insertions(+), 44 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index 5b00cd46e25..002838697f7 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -318,6 +318,69 @@ def apply_tp( """ gate_out = gate(x) gate_out = gate_out.cast("float32") + if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE and self.moe_quant_type == "w16a16": + if layer.topk_method == "noaux_tc": + gate_out, topk_weights, topk_idx = get_moe_scores( + gate_out, + layer.n_group, + layer.topk_group, + layer.top_k, + layer.routed_scaling_factor, + layer.gate_correction_bias, + getattr(layer, "renormalize", True), + ) + else: + topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select( + gate_out, + layer.gate_correction_bias, + layer.top_k, + True, # apply_norm_weight + False, + ) + topk_idx_i32 = topk_idx.astype(paddle.int32) + override_buffer_size = x.shape[0] * layer.top_k + layer.num_experts * (128 - 1) + (permute_input, permute_indices_per_token, dst_weights, _scale_out) = ( # zipped_expertwise_rowmap + paddle.nn.functional.moe_permute( + hidden_states=x, + scale=None, + expert_routemap_topk=topk_idx_i32, + expert_prob_topk=topk_weights, + num_experts=layer.num_experts, + tokens_per_expert=[], + padding_alignment=128, + override_buffer_size=override_buffer_size, + ) + ) + + # Row 2 of count_tokens_per_expert_func is the prefix sum token_nums_per_expert. + token_nums_per_expert_cumsum = count_tokens_per_expert_func(topk_idx, layer.num_experts, True)[2].cast( + paddle.int64 + ) + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids=topk_idx) + + ffn_out = self.compute_ffn( + layer, + permute_input, + token_nums_per_expert_cumsum, + None, # expert_idx_per_token not needed for w16a16 without bias + False, + -1, + None, # dequant_scale + None, # max_tokens_per_expert + ) + + fused_moe_out, _out_probs = paddle.nn.functional.moe_unpermute( + hidden_states_unzipped=ffn_out, + zipped_expertwise_rowmap=permute_indices_per_token, + expert_routemap_topk=topk_idx_i32, + token_prob_unzipped=dst_weights, + total_zipped_tokens=x.shape[0], + num_experts=layer.num_experts, + using_weighted_combine=True, + ) + return fused_moe_out + if layer.topk_method == "noaux_tc": gate_out, topk_weights, topk_idx = get_moe_scores( gate_out, @@ -328,50 +391,6 @@ def apply_tp( layer.gate_correction_bias, getattr(layer, "renormalize", True), ) - if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE and self.moe_quant_type == "w16a16": - topk_idx_i32 = topk_idx.astype(paddle.int32) - override_buffer_size = x.shape[0] * layer.top_k + layer.num_experts * (128 - 1) - (permute_input, permute_indices_per_token, dst_weights, _scale_out) = ( # zipped_expertwise_rowmap - paddle.nn.functional.moe_permute( - hidden_states=x, - scale=None, - expert_routemap_topk=topk_idx_i32, - expert_prob_topk=topk_weights, - num_experts=layer.num_experts, - tokens_per_expert=[], - padding_alignment=128, - override_buffer_size=override_buffer_size, - ) - ) - - # Row 2 of count_tokens_per_expert_func is the prefix sum token_nums_per_expert. - token_nums_per_expert_cumsum = count_tokens_per_expert_func(topk_idx, layer.num_experts, True)[2].cast( - paddle.int64 - ) - if topk_ids_hookfunc is not None: - topk_ids_hookfunc(topk_ids=topk_idx) - - ffn_out = self.compute_ffn( - layer, - permute_input, - token_nums_per_expert_cumsum, - None, # expert_idx_per_token not needed for w16a16 without bias - False, - -1, - None, # dequant_scale - None, # max_tokens_per_expert - ) - - fused_moe_out, _out_probs = paddle.nn.functional.moe_unpermute( - hidden_states_unzipped=ffn_out, - zipped_expertwise_rowmap=permute_indices_per_token, - expert_routemap_topk=topk_idx_i32, - token_prob_unzipped=dst_weights, - total_zipped_tokens=x.shape[0], - num_experts=layer.num_experts, - using_weighted_combine=True, - ) - return fused_moe_out ( permute_input, From 4bd6b95f5581fcdab1b31c98c6b38ee51c592a2d Mon Sep 17 00:00:00 2001 From: fxyfxy777 Date: Wed, 8 Apr 2026 17:57:08 +0800 Subject: [PATCH 7/8] fix value name --- .../model_executor/layers/moe/fused_moe_cutlass_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index 002838697f7..75158571e6c 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -330,7 +330,7 @@ def apply_tp( getattr(layer, "renormalize", True), ) else: - topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select( + topk_idx, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select( gate_out, layer.gate_correction_bias, layer.top_k, From ad241481a5b5211c5eddb116d052a834d4a02343 Mon Sep 17 00:00:00 2001 From: fxyfxy777 Date: Thu, 9 Apr 2026 12:46:11 +0800 Subject: [PATCH 8/8] add ut --- .../layers/moe/fused_moe_deepgemm_backend.py | 4 +- .../layers/test_fused_moe_cutlass_backend.py | 235 ++++++++++++++++++ 2 files changed, 237 insertions(+), 2 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index 135fb5ecafc..626db6bc207 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -597,7 +597,7 @@ def apply_ep_prefill( using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0, ) else: - token_nums_this_rank = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts) + token_nums_this_rank = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts, False) ( permute_input, permute_scale, @@ -893,7 +893,7 @@ def apply_tp( using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0, ) else: - tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts) + tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts, False) ( permute_input, permute_scale, diff --git a/tests/layers/test_fused_moe_cutlass_backend.py b/tests/layers/test_fused_moe_cutlass_backend.py index 2e8ea281daa..0a0ed29beb4 100644 --- a/tests/layers/test_fused_moe_cutlass_backend.py +++ b/tests/layers/test_fused_moe_cutlass_backend.py @@ -35,6 +35,7 @@ iluvatar_stub.prefill_fused_paged_attention = lambda *args, **kwargs: None sys.modules["fastdeploy.model_executor.ops.iluvatar"] = iluvatar_stub +import fastdeploy # noqa: E402 from fastdeploy.model_executor.layers import utils as layer_utils from fastdeploy.model_executor.layers.moe import fused_moe_cutlass_backend as backend @@ -707,3 +708,237 @@ def test_weight_only_prequanted_and_int4_create(self): int4_method.create_weights( int4_layer, num_experts=2, hidden_size=4, moe_intermediate_size=2, model_format="paddle" ) + + +# --------------------------------------------------------------------------- +# Real-op tests for FD_USE_PHI_MOE_PERMUTE=True (w16a16, moe_permute path) +# --------------------------------------------------------------------------- + +from fastdeploy.platforms import current_platform # noqa: E402 + +_CUDA_AVAILABLE = current_platform.is_cuda() +requires_cuda = pytest.mark.skipif(not _CUDA_AVAILABLE, reason="CUDA required") + + +class RealMoELayer(paddle.nn.Layer): + """Minimal bf16 MoE layer with real weights for moe_permute path testing.""" + + def __init__(self, num_experts=4, hidden_size=64, moe_intermediate_size=32, top_k=2): + super().__init__() + self.fd_config = DummyFDConfig() + self.num_experts = num_experts + self.num_local_experts = num_experts + self.hidden_size = hidden_size + self.moe_intermediate_size = moe_intermediate_size + self.top_k = top_k + self.topk_method = "noaux_tc" + self.n_group = 1 + self.topk_group = 1 + self.routed_scaling_factor = 1.0 + self.with_bias = False + self.ep_size = 1 + self.ep_rank = 0 + self.layer_idx = 0 + self.weight_dtype = "bfloat16" + self.is_quantized = False + self.activation = "swiglu" + self.moe_quant_config = types.SimpleNamespace(moe_dynamic_quant=False, hadamard_block_size=128) + self.gate_correction_bias = self.create_parameter( + shape=[1, num_experts], + dtype="float32", + default_initializer=paddle.nn.initializer.Constant(0), + ) + paddle.seed(0) + self.up_gate_proj_weight = self.create_parameter( + shape=[num_experts, 2 * moe_intermediate_size, hidden_size], + dtype="bfloat16", + ) + self.down_proj_weight = self.create_parameter( + shape=[num_experts, hidden_size, moe_intermediate_size], + dtype="bfloat16", + ) + self.up_gate_proj_weight.set_value( + paddle.randn([num_experts, 2 * moe_intermediate_size, hidden_size]).cast("bfloat16") * 0.01 + ) + self.down_proj_weight.set_value( + paddle.randn([num_experts, hidden_size, moe_intermediate_size]).cast("bfloat16") * 0.01 + ) + + +class SimpleLinearGate(paddle.nn.Layer): + def __init__(self, hidden_size, num_experts): + super().__init__() + self.weight = self.create_parameter(shape=[hidden_size, num_experts], dtype="float32") + + def forward(self, x): + return paddle.matmul(x.cast("float32"), self.weight) + + +class TestMoePermuteTrueRealOps: + """Real-op tests for FD_USE_PHI_MOE_PERMUTE=True on the w16a16 path.""" + + def _build(self, num_experts=4, hidden_size=64, moe_intermediate_size=32, top_k=2): + layer = RealMoELayer( + num_experts=num_experts, + hidden_size=hidden_size, + moe_intermediate_size=moe_intermediate_size, + top_k=top_k, + ) + gate = SimpleLinearGate(hidden_size, num_experts) + method = backend.CutlassMoEMethod(None) + method.moe_quant_type = "w16a16" + return layer, gate, method + + @requires_cuda + def test_apply_tp_moe_permute_real_ops(self, monkeypatch): + """FD_USE_PHI_MOE_PERMUTE=True + w16a16: real moe_permute/moe_unpermute/ + count_tokens_per_expert_func/moe_expert_ffn all called end-to-end.""" + monkeypatch.setattr(backend.fastdeploy.envs, "FD_USE_PHI_MOE_PERMUTE", True) + + num_tokens, hidden_size = 8, 64 + layer, gate, method = self._build(hidden_size=hidden_size) + + paddle.seed(42) + x = paddle.randn([num_tokens, hidden_size], dtype="bfloat16") + + # Spy: confirm moe_permute is called, moe_expert_dispatch is NOT + permute_called = {"v": False} + dispatch_called = {"v": False} + original_permute = paddle.nn.functional.moe_permute + + def spy_permute(*args, **kwargs): + permute_called["v"] = True + return original_permute(*args, **kwargs) + + monkeypatch.setattr(paddle.nn.functional, "moe_permute", spy_permute) + monkeypatch.setattr( + backend, + "moe_expert_dispatch", + lambda *a, **kw: (_ for _ in ()).throw(AssertionError("moe_expert_dispatch must not be called")), + ) + + out = method.apply_tp(layer, x, gate) + + assert permute_called["v"], "moe_permute was not called" + assert not dispatch_called["v"], "moe_expert_dispatch must not be called" + assert list(out.shape) == [num_tokens, hidden_size], f"wrong output shape: {out.shape}" + assert not paddle.isnan(out).any(), "output contains NaN" + assert not paddle.isinf(out).any(), "output contains Inf" + + @requires_cuda + def test_apply_ep_prefill_moe_permute_real_ops(self, monkeypatch): + """FD_USE_PHI_MOE_PERMUTE=True + w16a16: EP prefill uses real moe_permute / + moe_unpermute / count_tokens_per_expert_func / moe_expert_ffn end-to-end. + The EP dispatch/combine are stubbed (no real NCCL needed). + Use num_tokens=128 and num_experts=4 so each expert gets exactly 64 tokens + (128 * top_k=2 / 4 experts = 64), satisfying moe_expert_ffn alignment.""" + monkeypatch.setattr(backend.fastdeploy.envs, "FD_USE_PHI_MOE_PERMUTE", True) + + # 128 tokens, top_k=2, 4 experts → 64 tokens/expert (128-aligned after padding) + num_tokens, hidden_size = 128, 64 + layer, gate, method = self._build(num_experts=4, hidden_size=hidden_size, top_k=2) + + paddle.seed(42) + x = paddle.randn([num_tokens, hidden_size], dtype="bfloat16") + + # Stub only the EP communication runner (dispatch/combine). + # All on-device compute (moe_permute, moe_expert_ffn, moe_unpermute) runs for real. + class StubEPRunner: + ep_engine = types.SimpleNamespace(async_finish=False) + + def moe_select(self, _layer, gate_out): + n = gate_out.shape[0] + # Route token i to experts (i % E) and ((i+1) % E) so all experts + # get tokens and recv_num_tokens_per_expert_list is accurate. + E = _layer.num_local_experts + idx0 = paddle.arange(n, dtype="int64") % E + idx1 = (paddle.arange(n, dtype="int64") + 1) % E + topk_ids = paddle.stack([idx0, idx1], axis=1) + topk_weights = paddle.ones([n, _layer.top_k], dtype="float32") / _layer.top_k + return topk_ids, topk_weights + + def dispatch(self, x, topk_idx, topk_weights, **kwargs): + # Pass tensors through unchanged — single-rank, no real communication. + # Compute accurate recv_num_tokens_per_expert_list from topk_idx. + E = layer.num_local_experts + counts = [int((topk_idx == e).sum().item()) for e in range(E)] + return ( + x, + topk_idx, + topk_weights, + counts, + object(), + types.SimpleNamespace(current_stream_wait=lambda: None), + ) + + def combine(self, ffn_out, handle, recv_topk_weights): + return ffn_out, types.SimpleNamespace(current_stream_wait=lambda: None) + + method.ep_prefill_runner = StubEPRunner() + + # Spy: confirm moe_permute is called inside ep_prefill + permute_called = {"v": False} + original_permute = paddle.nn.functional.moe_permute + + def spy_permute(*args, **kwargs): + permute_called["v"] = True + return original_permute(*args, **kwargs) + + monkeypatch.setattr(paddle.nn.functional, "moe_permute", spy_permute) + + out = method.apply_ep_prefill(layer, x, gate) + + assert permute_called["v"], "moe_permute was not called in ep_prefill path" + assert len(out.shape) == 2, f"wrong output ndim: {out.shape}" + assert out.shape[1] == hidden_size, f"wrong hidden_size: {out.shape}" + assert not paddle.isnan(out).any(), "output contains NaN" + assert not paddle.isinf(out).any(), "output contains Inf" + + @requires_cuda + def test_apply_tp_moe_permute_non_noaux_tc(self, monkeypatch): + """FD_USE_PHI_MOE_PERMUTE=True + w16a16 + topk_method != 'noaux_tc': + the else-branch calls moe_topk_select instead of get_moe_scores, + then proceeds through moe_permute / moe_expert_ffn / moe_unpermute.""" + monkeypatch.setattr(backend.fastdeploy.envs, "FD_USE_PHI_MOE_PERMUTE", True) + + num_tokens, hidden_size = 8, 64 + layer, gate, method = self._build(hidden_size=hidden_size) + # Switch to non-noaux_tc to exercise the else-branch (moe_topk_select) + layer.topk_method = "greedy" + + paddle.seed(7) + x = paddle.randn([num_tokens, hidden_size], dtype="bfloat16") + + # Spy on which routing function is invoked + get_moe_scores_called = {"v": False} + moe_topk_select_called = {"v": False} + permute_called = {"v": False} + + original_get_moe_scores = backend.get_moe_scores + original_moe_topk_select = fastdeploy.model_executor.ops.gpu.moe_topk_select + original_permute = paddle.nn.functional.moe_permute + + def spy_get_moe_scores(*args, **kwargs): + get_moe_scores_called["v"] = True + return original_get_moe_scores(*args, **kwargs) + + def spy_moe_topk_select(*args, **kwargs): + moe_topk_select_called["v"] = True + return original_moe_topk_select(*args, **kwargs) + + def spy_permute(*args, **kwargs): + permute_called["v"] = True + return original_permute(*args, **kwargs) + + monkeypatch.setattr(backend, "get_moe_scores", spy_get_moe_scores) + monkeypatch.setattr(fastdeploy.model_executor.ops.gpu, "moe_topk_select", spy_moe_topk_select) + monkeypatch.setattr(paddle.nn.functional, "moe_permute", spy_permute) + + out = method.apply_tp(layer, x, gate) + + assert not get_moe_scores_called["v"], "get_moe_scores must NOT be called for non-noaux_tc" + assert moe_topk_select_called["v"], "moe_topk_select must be called for non-noaux_tc" + assert permute_called["v"], "moe_permute must be called" + assert list(out.shape) == [num_tokens, hidden_size], f"wrong shape: {out.shape}" + assert not paddle.isnan(out).any(), "output contains NaN" + assert not paddle.isinf(out).any(), "output contains Inf"