fix: address Copilot review — conditional return, defensive guards, GPU placement

cloudforge1 · cloudforge1 · commit 00a6d4cba477 · 2026-04-03T11:37:05.000+02:00
- ngram_match.cu: add remaining <= 0 early return, conditional return
  only when tokens produced (matches CPU continue behavior), include
  encoder-active items in Phase 2 threshold-budget scan
- ngram_match_mixed.cu: split max_draft_tokens into explicit steps to
  prevent negative intermediates, conditional return only when tokens
  produced, add seq_lens_decoder invariant comment
- ngram.py: explicit .cuda() on input_ids_len_gpu creation
- test_ngram_gpu_kernel.py: use CPUPlace() in latency benchmark to
  measure actual D2H/H2D roundtrip
diff --git a/custom_ops/gpu_ops/speculate_decoding/draft_model/ngram_match_mixed.cu b/custom_ops/gpu_ops/speculate_decoding/draft_model/ngram_match_mixed.cu
@@ -61,11 +61,13 @@ __global__ void ngram_match_mixed_search_kernel(
   // Skip batch items with no active tokens
   if (ori_seq_len_this_time == 0) return;
 
-  // Compute max_draft_tokens for this batch item
-  int max_draft_tokens = static_cast<int>(min(
-      static_cast<int64_t>(max_draft_tokens_param - ori_seq_len_this_time + 1),
-      max_dec_len[batch_idx] - step_idx[batch_idx] - 1));
-  if (max_draft_tokens <= 0) return;
+  // Compute max_draft_tokens for this batch item.
+  // Split into explicit steps to avoid negative intermediate values.
+  int64_t draft_budget =
+      static_cast<int64_t>(max_draft_tokens_param) - ori_seq_len_this_time + 1;
+  int64_t remaining_dec = max_dec_len[batch_idx] - step_idx[batch_idx] - 1;
+  if (draft_budget <= 0 || remaining_dec <= 0) return;
+  int max_draft_tokens = static_cast<int>(min(draft_budget, remaining_dec));
 
   const int64_t *cur_input_ids = input_ids + batch_idx * input_ids_stride;
   const int64_t cur_input_ids_len = input_ids_len[batch_idx];
@@ -81,44 +83,45 @@ __global__ void ngram_match_mixed_search_kernel(
     int64_t pos = parallel_ngram_search(
         cur_input_ids, cur_input_ids_len, ngram, ngram_size, &s_min_pos);
     if (pos != INT64_MAX) {
-      if (threadIdx.x == 0) {
+      int64_t start_idx = pos + ngram_size;
+      int64_t end_idx = min(start_idx + static_cast<int64_t>(max_draft_tokens),
+                            cur_input_ids_len);
+      if (threadIdx.x == 0 && start_idx < end_idx) {
         // Tentative token copy to scratch
-        int64_t start_idx = pos + ngram_size;
-        int64_t end_idx =
-            min(start_idx + static_cast<int64_t>(max_draft_tokens),
-                cur_input_ids_len);
-        if (start_idx < end_idx) {
-          int64_t n = end_idx - start_idx;
-          seq_lens_this_time_copy[batch_idx] =
-              static_cast<int32_t>(ori_seq_len_this_time + n);
-          int64_t *dst = draft_tokens_copy + batch_idx * draft_tokens_stride;
-          for (int64_t k = 0; k < n; k++) {
-            dst[ori_seq_len_this_time + k] = cur_input_ids[start_idx + k];
-          }
+        int64_t n = end_idx - start_idx;
+        seq_lens_this_time_copy[batch_idx] =
+            static_cast<int32_t>(ori_seq_len_this_time + n);
+        int64_t *dst = draft_tokens_copy + batch_idx * draft_tokens_stride;
+        for (int64_t k = 0; k < n; k++) {
+          dst[ori_seq_len_this_time + k] = cur_input_ids[start_idx + k];
         }
       }
-      return;
+      // Only early-exit when tokens were actually produced
+      if (start_idx < end_idx) {
+        return;
+      }
     }
 
     pos = parallel_ngram_search(
         cur_pre_ids, cur_step_idx, ngram, ngram_size, &s_min_pos);
     if (pos != INT64_MAX) {
-      if (threadIdx.x == 0) {
+      int64_t start_idx = pos + ngram_size;
+      int64_t end_idx =
+          min(start_idx + static_cast<int64_t>(max_draft_tokens), cur_step_idx);
+      if (threadIdx.x == 0 && start_idx < end_idx) {
         // Tentative token copy to scratch
-        int64_t start_idx = pos + ngram_size;
-        int64_t end_idx = min(
-            start_idx + static_cast<int64_t>(max_draft_tokens), cur_step_idx);
-        if (start_idx < end_idx) {
-          int64_t n = end_idx - start_idx;
-          seq_lens_this_time_copy[batch_idx] =
-              static_cast<int32_t>(ori_seq_len_this_time + n);
-          int64_t *dst = draft_tokens_copy + batch_idx * draft_tokens_stride;
-          for (int64_t k = 0; k < n; k++) {
-            dst[ori_seq_len_this_time + k] = cur_pre_ids[start_idx + k];
-          }
+        int64_t n = end_idx - start_idx;
+        seq_lens_this_time_copy[batch_idx] =
+            static_cast<int32_t>(ori_seq_len_this_time + n);
+        int64_t *dst = draft_tokens_copy + batch_idx * draft_tokens_stride;
+        for (int64_t k = 0; k < n; k++) {
+          dst[ori_seq_len_this_time + k] = cur_pre_ids[start_idx + k];
         }
       }
-      return;
+      // Only early-exit when tokens were actually produced
+      if (start_idx < end_idx) {
+        return;
+      }
     }
   }
 }
@@ -389,6 +392,13 @@ void HybridMtpNgram(const paddle::Tensor &input_ids,
   if (input_ids.is_gpu()) {
     auto stream = input_ids.stream();
 
+    // NOTE: GPU path does not pass seq_lens_decoder to kernels — the mixed
+    // variant uses ori_seq_len_this_time == 0 to skip inactive items. This
+    // matches CPU behavior under the invariant that seq_lens_decoder > 0 iff
+    // ori_seq_len_this_time > 0 (holds during normal MTP decoding). The CPU
+    // path counts seq_lens_decoder > 0 for threshold budget; the GPU scan
+    // counts tentative > 0, which is equivalent under this invariant.
+
     // Allocate scratch buffers for Phase 1 → Phase 2 communication
 
     // Scratch copy of draft_tokens (Phase 1 writes tentative tokens here)
diff --git a/custom_ops/gpu_ops/speculate_decoding/ngram_match.cu b/custom_ops/gpu_ops/speculate_decoding/ngram_match.cu
@@ -72,6 +72,7 @@ __global__ void ngram_match_search_kernel(const int64_t *input_ids,
 
   // Compute max_draft_tokens for this batch item
   int64_t remaining = max_dec_len[batch_idx] - cur_step_idx - 1;
+  if (remaining <= 0) return;
   int max_draft_tokens = static_cast<int>(
       min(static_cast<int64_t>(draft_token_num[batch_idx]), remaining));
 
@@ -83,42 +84,43 @@ __global__ void ngram_match_search_kernel(const int64_t *input_ids,
     int64_t pos = parallel_ngram_search(
         cur_input_ids, cur_input_ids_len, ngram, ngram_size, &s_min_pos);
     if (pos != INT64_MAX) {
-      if (threadIdx.x == 0) {
+      int64_t start_idx = pos + ngram_size;
+      int64_t end_idx = min(start_idx + static_cast<int64_t>(max_draft_tokens),
+                            cur_input_ids_len);
+      if (threadIdx.x == 0 && start_idx < end_idx) {
         // Tentative token copy to scratch
-        int64_t start_idx = pos + ngram_size;
-        int64_t end_idx =
-            min(start_idx + static_cast<int64_t>(max_draft_tokens),
-                cur_input_ids_len);
-        if (start_idx < end_idx) {
-          int64_t n = end_idx - start_idx;
-          seq_lens_this_time_copy[batch_idx] = static_cast<int32_t>(1 + n);
-          int64_t *dst = draft_tokens_copy + batch_idx * draft_tokens_stride;
-          for (int64_t k = 0; k < n; k++) {
-            dst[1 + k] = cur_input_ids[start_idx + k];
-          }
+        int64_t n = end_idx - start_idx;
+        seq_lens_this_time_copy[batch_idx] = static_cast<int32_t>(1 + n);
+        int64_t *dst = draft_tokens_copy + batch_idx * draft_tokens_stride;
+        for (int64_t k = 0; k < n; k++) {
+          dst[1 + k] = cur_input_ids[start_idx + k];
         }
       }
-      return;
+      // Only early-exit when tokens were actually produced
+      if (start_idx < end_idx) {
+        return;
+      }
     }
 
     pos = parallel_ngram_search(
         cur_pre_ids, cur_step_idx, ngram, ngram_size, &s_min_pos);
     if (pos != INT64_MAX) {
-      if (threadIdx.x == 0) {
+      int64_t start_idx = pos + ngram_size;
+      int64_t end_idx =
+          min(start_idx + static_cast<int64_t>(max_draft_tokens), cur_step_idx);
+      if (threadIdx.x == 0 && start_idx < end_idx) {
         // Tentative token copy to scratch
-        int64_t start_idx = pos + ngram_size;
-        int64_t end_idx = min(
-            start_idx + static_cast<int64_t>(max_draft_tokens), cur_step_idx);
-        if (start_idx < end_idx) {
-          int64_t n = end_idx - start_idx;
-          seq_lens_this_time_copy[batch_idx] = static_cast<int32_t>(1 + n);
-          int64_t *dst = draft_tokens_copy + batch_idx * draft_tokens_stride;
-          for (int64_t k = 0; k < n; k++) {
-            dst[1 + k] = cur_pre_ids[start_idx + k];
-          }
+        int64_t n = end_idx - start_idx;
+        seq_lens_this_time_copy[batch_idx] = static_cast<int32_t>(1 + n);
+        int64_t *dst = draft_tokens_copy + batch_idx * draft_tokens_stride;
+        for (int64_t k = 0; k < n; k++) {
+          dst[1 + k] = cur_pre_ids[start_idx + k];
         }
       }
-      return;
+      // Only early-exit when tokens were actually produced
+      if (start_idx < end_idx) {
+        return;
+      }
     }
   }
 }
@@ -147,12 +149,21 @@ __global__ void ngram_match_gather_kernel(
 
   int tid = threadIdx.x;
 
-  // Load tentative values from Phase 1
+  // Load tentative values from Phase 1.
+  // Encoder-active items are included in the scan with their original
+  // seq_lens_this_time to match CPU threshold-budget accounting.
   int tentative = 0;
   int is_active = 0;
   if (tid < max_batch_size) {
-    tentative = seq_lens_this_time_copy[tid];
-    is_active = (tentative > 0) ? 1 : 0;
+    if (seq_lens_encoder[tid] > 0) {
+      // Encoder-active: contribute original token count to threshold budget.
+      // seq_lens_this_time[tid] is still unmodified at this point.
+      tentative = seq_lens_this_time[tid];
+      is_active = 1;
+    } else {
+      tentative = seq_lens_this_time_copy[tid];
+      is_active = (tentative > 0) ? 1 : 0;
+    }
   }
 
   // Scan 1: inclusive prefix sum of tentative token counts
diff --git a/fastdeploy/spec_decode/ngram.py b/fastdeploy/spec_decode/ngram.py
@@ -37,7 +37,7 @@ def __init__(self, fd_config: "FDConfig"):
         super().__init__(fd_config)
         self.max_ngram_size = self.speculative_config.max_ngram_size
         self.input_ids_len = paddle.zeros(shape=[self.max_num_seqs, 1], dtype="int64").cpu()
-        self.input_ids_len_gpu = paddle.zeros(shape=[self.max_num_seqs, 1], dtype="int64")
+        self.input_ids_len_gpu = paddle.zeros(shape=[self.max_num_seqs, 1], dtype="int64").cuda()
 
     def update(self, bid: int, seq_len: int):
         """
diff --git a/tests/spec_decode/test_ngram_gpu_kernel.py b/tests/spec_decode/test_ngram_gpu_kernel.py
@@ -610,7 +610,7 @@ def test_latency(self):
         t0 = time.perf_counter()
         for _ in range(n_runs):
             # Simulate old path: copy all tensors to CPU then back
-            cpu_tensors = {k: paddle.to_tensor(v) for k, v in cpu_data.items()}
+            cpu_tensors = {k: paddle.to_tensor(v, place=paddle.CPUPlace()) for k, v in cpu_data.items()}
             _ = cpu_tensors["draft_tokens"].cuda()
             _ = cpu_tensors["seq_lens_this_time"].cuda()
             paddle.device.synchronize()