Commit 2899fb8

unamedkr and claude committed
fix(gemma4): numeric comparison with MLX-LM — divergence after layer 0
Layer-by-layer comparison with MLX-LM (google/gemma-4-E2B-it BF16):

Embedding (BOS token 2):
  MLX:  -1.6406, -1.5312, 0.1885, -1.4844
  Ours: -1.6290, -1.5228, 0.1948, -1.4874
  Diff: < 0.012 (Q5_0 vs BF16 quantization noise) ✅

Attn norm output (layer 0):
  MLX:  -10.5625, -8.3125, 1.375, -12.1875
  Ours: -10.4733, -8.3217, 1.4276, -12.2401
  Diff: < 0.1 ✅

Q projection (layer 0):
  MLX:  -4.375, 21.25, -0.797, 5.125
  Ours: -4.306, 21.226, -0.711, 5.157
  Diff: < 0.1 ✅

K projection (layer 0):
  MLX:  2.547, 3.141, -0.029, 1.133
  Ours: 2.298, 3.182, 0.165, 1.169
  Diff: < 0.25 (slightly larger, but within Q8_0 tolerance)

FINAL LOGITS (last position):
  MLX logits[100] (<|channel>): 22.88 (TOP-1)
  Ours logits[100]: -16.90  ← WRONG
  MLX logits[0:3]:  -22.38, 7.09, -3.48
  Ours logits[0:3]: -23.73, -2.68, 5.50

CONCLUSION: Embedding → attn_norm → Q/K projection are all correct. The divergence happens INSIDE or AFTER the attention computation in layer 0, then compounds through 35 layers to produce completely wrong final logits (~40 logit difference on critical tokens).

Next: compare attention output and FFN output at layer 0.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 6ea3215 commit 2899fb8

File tree

1 file changed (+12 -0 lines)

quant.h

Lines changed: 12 additions & 0 deletions
@@ -14239,6 +14239,10 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     } else {
         tq_matmul(s->q, s->xb, layer->wq, n_heads * head_dim, dim);
     }
+    if (pos == 0 && l == 0 && getenv("TQ_DEBUG")) {
+        fprintf(stderr, "[DEBUG] layer0 Q[0:4] = %.4f %.4f %.4f %.4f K[0:4] = ",
+                s->q[0], s->q[1], s->q[2], s->q[3]);
+    }
 }
 if (kv_shared_skip && kv_shared_ref_layer >= 0) {
     /* KV sharing: skip K/V projection for shared layers.
@@ -14285,6 +14289,9 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     } else {
         tq_matmul(s->k, s->xb, layer->wk, kv_dim, dim);
     }
+    if (pos == 0 && l == 0 && getenv("TQ_DEBUG")) {
+        fprintf(stderr, "%.4f %.4f %.4f %.4f\n", s->k[0], s->k[1], s->k[2], s->k[3]);
+    }
 if (has_fused_qkv_layer) {
     /* skip — handled by the fused branch */
 } else {
@@ -15447,6 +15454,11 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
     /* Pre-attention/DeltaNet RMSNorm */
     tq_rmsnorm(s->xb, s->x, layer->attn_norm, dim, c->rms_norm_eps);

+    if (pos == 0 && l == 0 && getenv("TQ_DEBUG")) {
+        fprintf(stderr, "[DEBUG] layer0 attn_norm_out[0:4] = %.4f %.4f %.4f %.4f\n",
+                s->xb[0], s->xb[1], s->xb[2], s->xb[3]);
+    }
+
 if (layer->delta_a_log) {
     /* DeltaNet layer */
     deltanet_forward(model, s, l);
