Commit 7e2ca31

unamedkr and claude committed
fix: restore Phi-3.5 as RLV default (Qwen3.5-4B: 6/20 on large-doc)
Qwen3.5-4B large-doc: 6/20 (30%) vs Phi-3.5: 19/20 (95%).
Phi-3.5's dense attention + 32K vocab is optimal for document QA.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3ad0b80 commit 7e2ca31

File tree

1 file changed: +2 −2 lines changed


bench/rlv/stages/_llm.py

Lines changed: 2 additions & 2 deletions
@@ -31,7 +31,7 @@
 # quant.h as a single translation unit — no sync issues.
 # Phi-3.5: ~1.15 tok/s (CPU NEON), ~6.5 tok/s reported in PR #79.
 # Q8_0 is 2x faster than Q4_K_M on NEON (simpler dequant, 3.0 vs 1.5 tok/s).
-DEFAULT_MODEL = REPO / "models" / "Qwen3.5-4B-Q4_K_M.gguf"
+DEFAULT_MODEL = REPO / "models" / "Phi-3.5-mini-instruct-Q8_0.gguf"
 DEFAULT_SERVER_BINARY = REPO / "build_metal" / "quant-server-unified"
 DEFAULT_SERVER_HOST = "127.0.0.1"
 DEFAULT_SERVER_PORT = 8421  # arbitrary, avoid conflicts with 8080
@@ -44,7 +44,7 @@
 CLIFF_BUDGET = {
     "models/Llama-3.2-3B-Instruct-Q8_0.gguf": 1024,
     "models/Llama-3.2-1B-Instruct-Q8_0.gguf": 512,
-    "models/Qwen3.5-4B-Q4_K_M.gguf": 1024,
+    "models/Phi-3.5-mini-instruct-Q8_0.gguf": 1024,
     "models/Phi-3.5-mini-instruct-Q4_K_M.gguf": 1024,
 }
 
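For context, a minimal sketch of how a path-keyed `CLIFF_BUDGET` table like the one in the diff might be consulted. How `_llm.py` actually uses the dict is not shown in this commit, so the `REPO` value, the `cliff_budget_for` helper, and the fallback default here are illustrative assumptions, not the file's real API:

```python
from pathlib import Path

# Hypothetical repo root; the real REPO is defined elsewhere in _llm.py.
REPO = Path("/opt/rlv")

DEFAULT_MODEL = REPO / "models" / "Phi-3.5-mini-instruct-Q8_0.gguf"

# Per-model budgets, keyed by repo-relative path (values from the diff).
CLIFF_BUDGET = {
    "models/Llama-3.2-3B-Instruct-Q8_0.gguf": 1024,
    "models/Llama-3.2-1B-Instruct-Q8_0.gguf": 512,
    "models/Phi-3.5-mini-instruct-Q8_0.gguf": 1024,
    "models/Phi-3.5-mini-instruct-Q4_K_M.gguf": 1024,
}

def cliff_budget_for(model: Path, default: int = 512) -> int:
    """Look up a model's budget by its path relative to REPO (hypothetical helper)."""
    key = model.relative_to(REPO).as_posix()
    return CLIFF_BUDGET.get(key, default)

print(cliff_budget_for(DEFAULT_MODEL))  # → 1024
```

Keying by repo-relative POSIX path (rather than by `Path` object) keeps lookups stable across platforms and matches the string keys used in the diff.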

0 commit comments