Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion fastdeploy/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from paddle import nn
from paddleformers.utils.log import logger

from fastdeploy.config import FDConfig
from fastdeploy.config import PREEMPTED_TOKEN_ID, FDConfig
from fastdeploy.engine.pooling_params import PoolingParams
from fastdeploy.engine.request import ImagePosition, Request, RequestType
from fastdeploy.model_executor.graph_optimization.utils import (
Expand Down Expand Up @@ -2404,6 +2404,16 @@ def _postprocess(

# 5.1. Async cpy
post_process_event = paddle.device.cuda.create_event()
if envs.FD_USE_GET_SAVE_OUTPUT_V1:
            # If a query is preempted, there is no sampled token for it; we use the token id PREEMPTED_TOKEN_ID to signal the server that the abort has finished.
paddle.assign(
paddle.where(
self.share_inputs["last_preempted_idx"][: sampler_output.sampled_token_ids.shape[0]] == 1,
PREEMPTED_TOKEN_ID,
sampler_output.sampled_token_ids,
),
sampler_output.sampled_token_ids,
)
# if not self.speculative_decoding:
self.share_inputs["sampled_token_ids"].copy_(sampler_output.sampled_token_ids, False)
if self.speculative_decoding:
Expand Down
Loading