From 8dd4eec40a21721fcbc3bdbabaaa9b5a72213d9b Mon Sep 17 00:00:00 2001 From: rainyfly <1435317881@qq.com> Date: Thu, 2 Apr 2026 17:24:03 +0800 Subject: [PATCH 1/3] [Feature] Support set PREEMPTED_TOKEN_ID in GET_SAVE_OUTPUT_V1 --- fastdeploy/worker/gpu_model_runner.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index c0e689735d4..0f1ce0d97c9 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -27,7 +27,7 @@ from paddle import nn from paddleformers.utils.log import logger -from fastdeploy.config import FDConfig +from fastdeploy.config import PREEMPTED_TOKEN_ID, FDConfig from fastdeploy.engine.pooling_params import PoolingParams from fastdeploy.engine.request import ImagePosition, Request, RequestType from fastdeploy.model_executor.graph_optimization.utils import ( @@ -2411,6 +2411,13 @@ def _postprocess( self.share_inputs["accept_num_cpu"].copy_(self.share_inputs["accept_num"], False) self.share_inputs["seq_lens_decoder_cpu"].copy_(self.share_inputs["seq_lens_decoder"], False) self.share_inputs["prompt_lens_cpu"].copy_(self.share_inputs["prompt_lens"], False) + if envs.GET_SAVE_OUTPUT_V1: + paddle.assign( + paddle.where( + self.share_inputs["preempted_idx"], self.share_inputs["sampled_token_ids"], PREEMPTED_TOKEN_ID + ), + self.share_inputs["sampled_token_ids"], + ) post_process_event.record() # 6. 
Speculative decode -- proposer run (method="naive" has proposer=None, skip) From d940eb6c52b2b0130405794cd928d07207bb2071 Mon Sep 17 00:00:00 2001 From: rainyfly <1435317881@qq.com> Date: Thu, 2 Apr 2026 17:30:18 +0800 Subject: [PATCH 2/3] [Feature] Support set PREEMPTED_TOKEN_ID in GET_SAVE_OUTPUT_V1 --- fastdeploy/worker/gpu_model_runner.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 0f1ce0d97c9..a5767385226 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -2404,6 +2404,13 @@ def _postprocess( # 5.1. Async cpy post_process_event = paddle.device.cuda.create_event() + if envs.GET_SAVE_OUTPUT_V1: + paddle.assign( + paddle.where( + self.share_inputs["preempted_idx"] == 1, PREEMPTED_TOKEN_ID, sampler_output.sampled_token_ids + ), + sampler_output.sampled_token_ids, + ) # if not self.speculative_decoding: self.share_inputs["sampled_token_ids"].copy_(sampler_output.sampled_token_ids, False) if self.speculative_decoding: @@ -2411,13 +2418,6 @@ def _postprocess( self.share_inputs["accept_num_cpu"].copy_(self.share_inputs["accept_num"], False) self.share_inputs["seq_lens_decoder_cpu"].copy_(self.share_inputs["seq_lens_decoder"], False) self.share_inputs["prompt_lens_cpu"].copy_(self.share_inputs["prompt_lens"], False) - if envs.GET_SAVE_OUTPUT_V1: - paddle.assign( - paddle.where( - self.share_inputs["preempted_idx"], self.share_inputs["sampled_token_ids"], PREEMPTED_TOKEN_ID - ), - self.share_inputs["sampled_token_ids"], - ) post_process_event.record() # 6. 
Speculative decode -- proposer run (method="naive" has proposer=None, skip) From 556c78a688b1d9cdf9d5a45d2034fc6d9eb5d67c Mon Sep 17 00:00:00 2001 From: rainyfly <1435317881@qq.com> Date: Fri, 3 Apr 2026 15:13:18 +0800 Subject: [PATCH 3/3] fix --- fastdeploy/worker/gpu_model_runner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index a5767385226..bc5a58a30d9 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -2404,10 +2404,13 @@ def _postprocess( # 5.1. Async cpy post_process_event = paddle.device.cuda.create_event() - if envs.GET_SAVE_OUTPUT_V1: + if envs.FD_USE_GET_SAVE_OUTPUT_V1: + # If a query is preempted, there is no sampled token for it; we use token_id PREEMPTED_TOKEN_ID to signal the server that the abort is finished. paddle.assign( paddle.where( - self.share_inputs["preempted_idx"] == 1, PREEMPTED_TOKEN_ID, sampler_output.sampled_token_ids + self.share_inputs["last_preempted_idx"][: sampler_output.sampled_token_ids.shape[0]] == 1, + PREEMPTED_TOKEN_ID, + sampler_output.sampled_token_ids, ), sampler_output.sampled_token_ids, )