diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 7926fad222b..6351ab34f1e 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -909,8 +909,7 @@ def _fetch_request(): time.sleep(0.005) except RuntimeError as e: - if "cannot schedule new futures after shutdown" in str(e): - break + raise e except Exception as e: err_msg = "Error happend while insert task to engine: {}, {}.".format(e, str(traceback.format_exc())) self.llm_logger.error(err_msg) diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 71b2d1711e2..6c6887fbe94 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -717,7 +717,12 @@ def _allocate_decode_and_extend(): if not preempted_reqs: skip_requests: list[Request] = [] while self.waiting and token_budget > 0: - if len(self.running) == self.max_num_seqs: + if ( + len(self.running) + + len(self.to_be_rescheduled_request_id_set) + + sum([req.status == RequestStatus.PREEMPTED for req in self.waiting]) + >= self.max_num_seqs + ): break request = self.waiting[0]