fix: rename benchmark for CI discovery, bump to 10k iterations, env-gate large/bench tests

cloudforge1 · cloudforge1 · commit b7155eb522f1 · 2026-04-02T21:18:25.000+02:00
- Renamed benchmark_ngram_kernel.py → test_benchmark_ngram_kernel.py
  so pytest discovers it (test_*.py pattern)
- Bumped NUM_ITERS 10→10000, WARMUP 2→5 to reduce timing noise when profiling
- Gated benchmark class with RUN_NGRAM_BENCHMARKS=1 (won't bloat CI)
- Gated both test_large_batch_long_seq variants with RUN_LARGE_NGRAM_TESTS=1
  (OOM risk); the check is truthiness-based, so any non-empty value enables them
- Gated test_latency with RUN_NGRAM_BENCHMARKS=1 (no assertions); unlike the
  benchmark class, which requires exactly "1", any non-empty value enables it
diff --git a/tests/spec_decode/test_benchmark_ngram_kernel.py b/tests/spec_decode/test_benchmark_ngram_kernel.py
@@ -40,8 +40,8 @@
 
 MAX_NGRAM_SIZE = 3
 MAX_DRAFT_TOKENS = 10
-NUM_ITERS = 10
-WARMUP = 2
+NUM_ITERS = 10000
+WARMUP = 5
 
 
 def _build_data(batch_size, seq_len, hit_type="low_input", seed=42):
@@ -206,6 +206,10 @@ def _print_table(title, header, rows):
     print(f"{'=' * 80}")
 
 
+@unittest.skipUnless(
+    os.environ.get("RUN_NGRAM_BENCHMARKS", "0") == "1",
+    "Set RUN_NGRAM_BENCHMARKS=1 to run multi-group profiling (slow)",
+)
 class TestNgramBenchmarkGroups(unittest.TestCase):
     """Multi-dimension benchmark matching NKNaN's 5-group methodology."""
 
diff --git a/tests/spec_decode/test_ngram_gpu_kernel.py b/tests/spec_decode/test_ngram_gpu_kernel.py
@@ -418,7 +418,12 @@ def test_large_batch_long_seq(self):
 
         Uses high threshold to ensure all batches exercise the parallel search
         path (default threshold=128 would skip all batches at bsz=256).
+
+        Gated by RUN_LARGE_NGRAM_TESTS=1 to avoid OOM / timeout on
+        memory-constrained CI nodes.
         """
+        if not os.environ.get("RUN_LARGE_NGRAM_TESTS", ""):
+            self.skipTest("Large-scale test skipped. Set RUN_LARGE_NGRAM_TESTS=1 to enable.")
         high_threshold = 100000
         data = _make_ngram_test_data(batch_size=256, input_len=131072, max_model_len=131072 + 64, seed=77)
         cpu_draft = data["draft_tokens"].copy()
@@ -558,7 +563,13 @@ def test_many_short_seqs(self):
         np.testing.assert_array_equal(gpu_data["draft_tokens"].numpy(), cpu_draft)
 
     def test_latency(self):
-        """Benchmark: GPU kernel latency vs CPU transfer overhead."""
+        """Benchmark: GPU kernel latency vs CPU transfer overhead.
+
+        Pure benchmark with no assertions — skipped in CI by default.
+        Set RUN_NGRAM_BENCHMARKS=1 to enable.
+        """
+        if not os.environ.get("RUN_NGRAM_BENCHMARKS", ""):
+            self.skipTest("Benchmark skipped. Set RUN_NGRAM_BENCHMARKS=1 to enable.")
         # Pre-create tensors on GPU (data creation excluded from timing)
         gpu_data = _to_gpu(_make_ngram_test_data(batch_size=32, input_len=512, seed=42))
         cpu_data = _make_ngram_test_data(batch_size=32, input_len=512, seed=42)
@@ -732,7 +743,12 @@ def test_large_batch_long_seq(self):
 
         Uses high threshold to ensure all batches exercise the parallel search
         path (default threshold=1024 would skip many batches at bsz=256).
+
+        Gated by RUN_LARGE_NGRAM_TESTS=1 to avoid OOM / timeout on
+        memory-constrained CI nodes.
         """
+        if not os.environ.get("RUN_LARGE_NGRAM_TESTS", ""):
+            self.skipTest("Large-scale test skipped. Set RUN_LARGE_NGRAM_TESTS=1 to enable.")
         high_threshold = 100000
         data = _make_mixed_test_data(batch_size=256, input_len=131072, pre_ids_len=131072 + 64, seed=77)
         cpu_draft = data["draft_tokens"].copy()