quantumaikr
diff --git a/‎bindings/python/quantcpp/__init__.py‎
Lines changed: 31 additions & 4 deletions b/‎bindings/python/quantcpp/__init__.py‎
Lines changed: 31 additions & 4 deletions
diff --git a/‎bindings/python/quantcpp/cli.py‎
Lines changed: 19 additions & 10 deletions b/‎bindings/python/quantcpp/cli.py‎
Lines changed: 19 additions & 10 deletions
diff --git a/‎docs/feedback/2026-04-12_0900.md‎
Lines changed: 195 additions & 0 deletions b/‎docs/feedback/2026-04-12_0900.md‎
Lines changed: 195 additions & 0 deletions
@@ -4,11 +4,18 @@
 Quick start:
 
     from quantcpp import Model
-    m = Model.from_pretrained("Llama-3.2-1B")
+    m = Model.from_pretrained("SmolLM2-1.7B")
     print(m.ask("What is gravity?"))
 
-Note: SmolLM2-135M downloads faster but produces low-quality output.
-Use Llama-3.2-1B (~750 MB, one-time download) for good results.
+Model selection guide:
+    SmolLM2-1.7B  (1.7 GB, vocab 49K)  — recommended. ~12 tok/s on Apple M3.
+    Llama-3.2-1B  (750 MB, vocab 128K) — smaller download but slower
+                                          due to large vocab (~2 tok/s on M3).
+    SmolLM2-135M  (138 MB, vocab 49K)  — demo only, low quality output.
+
+Larger vocab = slower lm_head matmul → a larger model with a small vocab
+can beat a smaller model with a large vocab (as SmolLM2-1.7B does vs.
+Llama-3.2-1B above). See docs/supported_models.md for the architecture
+support matrix.
 """
 
 try:
@@ -53,17 +60,37 @@ class ChatContextOverflow(RuntimeError):
                                   Path.home() / ".cache" / "quantcpp"))
 
 # name → (HuggingFace repo, filename, approx size in MB)
+# Note: download URL is constructed as
+#   https://huggingface.co/{repo}/resolve/main/{filename}
+# Verify both fields against the actual HuggingFace listing before
+# adding new entries — there is no integrity check at runtime.
 _MODEL_REGISTRY = {
+    # 138 MB demo model. Tokenizer + arch are llama-compatible but the
+    # model is too small to produce coherent output for general chat.
+    # Listed only so users can verify the install/load path quickly.
     "SmolLM2-135M": (
         "Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct",
         "smollm2-135m-instruct-q8_0.gguf",
         135,
     ),
+    # Recommended default for first-time users on Apple Silicon / typical
+    # laptops. vocab 49K keeps the lm_head matmul small, so even on a
+    # mid-range M-series chip we measure ~12 tok/s — comfortable for
+    # interactive chat. Same llama arch family as SmolLM2-135M, so it
+    # exercises the most-tested code path.
+    "SmolLM2-1.7B": (
+        "bartowski/SmolLM2-1.7B-Instruct-GGUF",
+        "SmolLM2-1.7B-Instruct-Q8_0.gguf",
+        1700,
+    ),
     "Qwen3.5-0.8B": (
         "unsloth/Qwen3.5-0.8B-GGUF",
         "Qwen3.5-0.8B-Q4_K_M.gguf",
         508,
     ),
+    # Smaller download than SmolLM2-1.7B but slower at inference time
+    # because of the 128K Llama-3 vocab (~5x lower tok/s measured on M3).
+    # Kept in the registry for users who specifically want a Llama model.
     "Llama-3.2-1B": (
         "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
         "llama-3.2-1b-instruct-q4_k_m.gguf",
@@ -170,7 +197,7 @@ class Model:
 
     Examples
     --------
-    >>> m = Model.from_pretrained("SmolLM2-135M")
+    >>> m = Model.from_pretrained("SmolLM2-1.7B")
     >>> m.ask("What is gravity?")
     'Gravity is a force that attracts ...'
 
 
@@ -18,9 +18,13 @@
 import json
 
 
-# Ollama-style short aliases → canonical _MODEL_REGISTRY keys
+# Ollama-style short aliases → canonical _MODEL_REGISTRY keys.
+# Plain "smollm2" without a size suffix points at the 1.7B model — that's
+# the recommended default. Users who explicitly want the 135M demo model
+# need to ask for it with the size suffix ("smollm2:135m") or full name.
 MODEL_ALIASES = {
-    "smollm2":      "SmolLM2-135M",
+    "smollm2":      "SmolLM2-1.7B",
+    "smollm2:1.7b": "SmolLM2-1.7B",
     "smollm2:135m": "SmolLM2-135M",
     "qwen3.5":      "Qwen3.5-0.8B",
     "qwen3.5:0.8b": "Qwen3.5-0.8B",
@@ -329,8 +333,13 @@ def cmd_client(args):
 
 
 def cmd_chat_default(args):
-    """Backwards-compatible default: auto-download Llama-3.2-1B and chat."""
-    args.model = args.model or "Llama-3.2-1B"
+    """Backwards-compatible default: auto-download SmolLM2-1.7B and chat.
+
+    Default switched from Llama-3.2-1B to SmolLM2-1.7B (2026-04-12) after
+    user feedback that Llama-3.2-1B's 128K vocab makes it ~5x slower at
+    interactive chat than SmolLM2-1.7B's 49K vocab on Apple Silicon.
+    """
+    args.model = args.model or "SmolLM2-1.7B"
     args.threads = getattr(args, "threads", 4)
     args.max_tokens = getattr(args, "max_tokens", 256)
     args.temperature = getattr(args, "temperature", 0.7)
@@ -354,19 +363,19 @@ def main():
   client PROMPT         Send a request to a running serve (default: SSE streaming)
 
 examples:
-  quantcpp pull llama3.2:1b
+  quantcpp pull smollm2              # recommended: small vocab → fast
   quantcpp list
-  quantcpp run llama3.2:1b
-  quantcpp run llama3.2:1b "What is gravity?"
-  quantcpp serve llama3.2:1b --port 8080
+  quantcpp run smollm2
+  quantcpp run smollm2 "What is gravity?"
+  quantcpp serve smollm2 --port 8080
   quantcpp client "What is gravity?"                  # streams from :8080
   quantcpp client "Hi" --url http://localhost:8081
   quantcpp client "Hi" --no-stream                    # single JSON response
 
 backwards-compat (no subcommand):
-  quantcpp                          # default chat with Llama-3.2-1B
+  quantcpp                          # default chat with SmolLM2-1.7B
   quantcpp "What is gravity?"       # one-shot
-  quantcpp --model SmolLM2-135M     # different model
+  quantcpp --model llama3.2:1b      # different model
 """,
     )
 
 
@@ -0,0 +1,195 @@
+# quant.cpp User Feedback — First-Time Setup & Usage Experience
+
+**Date**: 2026-04-12
+**Environment**: macOS (Apple M3, 8-core CPU, 10-core GPU, 16GB Unified Memory)
+**Version tested**: v0.10.1 → v0.12.0 (pip) + latest main (source build)
+**Tested by**: End-user (developer, first-time quant.cpp user)
+
+---
+
+## Summary
+
+pip install부터 `quantcpp serve`, Metal GPU 빌드, 채팅 웹 UI 연동, 다양한 모델 비교까지의 전 과정을 체험했습니다. 전반적으로 "설치 → 모델 다운로드 → 추론"까지의 흐름은 매우 간결했으나, 모델 호환성과 속도 면에서 개선점이 발견되었습니다.
+
+---
+
+## 1. 좋았던 점
+
+### 1.1 설치가 매우 간단
+- `pip install quantcpp` 한 줄로 설치 완료. 의존성 zero.
+- `Model.from_pretrained("Llama-3.2-1B")`으로 모델 자동 다운로드 + 캐시. 매우 편리.
+
+### 1.2 OpenAI 호환 API 서버
+- `quantcpp serve llama3.2:1b --port 8080` 한 줄로 서버 기동.
+- `/v1/chat/completions` 엔드포인트가 OpenAI SDK와 호환되어 기존 코드 재사용 가능.
+- SSE 스트리밍(`stream: true`) 정상 동작.
+- CORS 헤더 (`Access-Control-Allow-Origin: *`) 기본 포함 — 프론트엔드 연동 즉시 가능.
+
+### 1.3 v0.12.0의 CLI 추가
+- `quantcpp "What is gravity?"` 한 줄 질문이 가능해져 체험 진입장벽이 크게 낮아짐.
+- `quantcpp` (인터랙티브 모드)도 직관적.
+
+### 1.4 KV cache reuse (최신 main)
+- 연속 대화 시 두 번째 요청부터 prefill이 생략되어 응답 시간이 ~50% 단축됨.
+- 첫 요청 27초 → 두 번째 요청 14초 (Llama-3.2-1B 기준).
+
+### 1.5 Metal GPU 자동 감지
+- `TQ_BUILD_METAL=ON`으로 빌드하면 Apple Silicon GPU를 자동 감지하여 활성화.
+- 별도 설정 없이 matmul 배치 디스패치가 Metal로 전환됨.
+
+### 1.6 SmolLM2-1.7B에서의 우수한 성능
+- vocab size가 작은 모델(49K)에서 ~12.5 tok/s 달성. 실시간 대화 가능 수준.
+- 출력 품질도 깨끗하고 정확함 (예: "The capital of South Korea is Seoul.").
+
+---
+
+## 2. 개선이 필요한 점
+
+### 2.1 pip 패키지에서 CLI가 누락 (v0.10.1)
+- **문제**: PyPI v0.10.1에는 `quantcpp` CLI entry point가 없었음. `zsh: command not found: quantcpp`.
+- **해결**: v0.11.0부터 `cli.py` + entry point 추가로 해결됨.
+- **제안**: PyPI에 최신 버전을 빠르게 배포하면 첫 경험이 크게 개선될 것.
+
+### 2.2 `quantcpp serve`에 quant-server 바이너리 필요
+- **문제**: `pip install quantcpp` 후 `quantcpp serve`를 실행하면 `quant-server binary not found` 에러.
+- 사용자가 직접 CMake로 `TQ_BUILD_SERVER=ON` 빌드 후 PATH에 복사해야 함.
+- **제안**: pip 패키지에 서버 바이너리를 포함하거나, 순수 Python fallback 서버를 제공.
+
+### 2.3 Llama-3.2-1B의 극심하게 느린 속도
+- **문제**: Llama-3.2-1B (Q4_K_M)가 Apple M3에서 ~2.3 tok/s로 매우 느림.
+  - 60토큰 생성에 ~27초, 200토큰에 ~67초 소요.
+  - 대화형 사용이 사실상 불가능한 수준.
+- **원인 분석**: vocab size 128,256이 병목. 매 토큰마다 128K 차원의 output projection 필요.
+- **대비**: 동일 환경에서 SmolLM2-1.7B (Q8, vocab 49K)는 ~12.5 tok/s로 5배 빠름.
+- **제안**:
+  - 기본 추천 모델을 SmolLM2-1.7B로 변경 검토.
+  - 또는 모델 선택 가이드에 "vocab size가 클수록 느려진다"는 안내 추가.
+
+### 2.4 SmolLM2-135M의 출력 품질 문제
+- **문제**: SmolLM2-135M은 속도는 빠르지만(0.3초) 출력이 HTML 쓰레기 텍스트.
+- **제안**: 135M 모델은 "quantization 데모용"으로만 안내하고, 추론 품질 기대를 낮추는 문구 추가.
+
+### 2.5 Gemma-4-E2B 호환성 문제
+- **문제**: gemma-4-E2B-it-Q4_K_M.gguf 로딩은 성공하나, 추론 출력이 완전히 깨짐 (다국어 쓰레기 토큰).
+- 서버 로그에는 정상 로딩으로 표시되어 사용자가 원인을 파악하기 어려움.
+- **제안**: 지원되는 모델/아키텍처 목록을 명시하고, 미지원 모델 로딩 시 경고 표시.
+
+### 2.6 Phi-3.5-mini-instruct 아키텍처 미지원 (신규)
+- **문제**: `Phi-3.5-mini-instruct-Q8_0.gguf` (3.9GB) 로딩은 성공하나, attention 레이어 매핑 실패.
+  - 서버 로그: `loaded 32 layers (0 self_attn)` — self_attn이 0으로 인식됨.
+  - 출력: 완전한 쓰레기 토큰 (`uffrasspkeryensonisatcreteBUG...`).
+  - 속도 자체는 0.85초/80토큰으로 극도로 빠름 (vocab 32K 효과).
+- **영향**: Phi-3/Phi-3.5는 vocab 32K로 속도 면에서 최적의 모델이나 사용 불가.
+- **제안**:
+  - Phi-3 (`phi3`) 아키텍처의 attention 레이어 매핑 지원 추가.
+  - 이 모델이 지원되면 "속도 + 품질" 모두에서 최적의 추천 모델이 될 수 있음.
+  - `self_attn=0`으로 감지된 경우 사용자에게 경고 메시지 표시 필요.
+
+### 2.7 Qwen3.5-0.8B 출력 품질 문제 (신규)
+- **문제**: Qwen3.5-0.8B (Q4_K_M) 서버 로딩은 성공하나, 출력이 완전히 깨짐.
+  - DeltaNet hybrid 아키텍처 특성으로 인한 호환성 문제 추정.
+  - 33초/60토큰으로 속도도 느림 (vocab 248K).
+- **제안**: Qwen 계열의 지원 상태를 문서에 명시.
+
+### 2.8 Metal GPU 가속 효과 제한적 (소형 모델)
+- **문제**: 1B 모델에서 Metal GPU가 활성화되어 있으나 체감 속도 차이 없음.
+- 소스 코드 주석에도 "Metal Q4 batch → 38 tok/s vs CPU Q4 → 95 tok/s (SmolLM2)" 명시.
+- 소형 모델에서는 GPU 디스패치 오버헤드가 연산 시간보다 큼.
+- **제안**: 모델 크기에 따라 CPU/GPU 자동 전환 로직 추가, 또는 `--device cpu/gpu` 옵션 제공.
+
+### 2.9 서버 단일 요청 처리 (동시성 없음)
+- **문제**: 첫 번째 요청 처리 중 두 번째 요청이 완전히 블로킹됨.
+- 채팅 UI에서 연속 질문 시 두 번째 질문이 3분+ 대기.
+- **제안**: 요청 큐잉 + 처리 중 상태 반환 (429 or retry-after), 또는 요청 취소 API.
+
+### 2.10 chat template 잔여물
+- **문제**: 응답에 `<|im_start|>`, `<|im_end|>`, `<line>assistant</line>` 등 template 토큰이 노출됨.
+- Llama-3.2-1B에서 특히 빈번. SmolLM2-1.7B에서는 `<|im_end|>` 정도로 경미.
+- **제안**: 서버 측에서 stop tokens/template markers를 자동 strip.
+
+---
+
+## 3. 모델별 벤치마크 (Apple M3, 16GB RAM, Metal GPU 빌드)
+
+| Model | Quant | File Size | Vocab | tok/s | 60-token Time | Quality | Architecture |
+|-------|-------|-----------|------:|------:|--------------:|---------|-------------|
+| SmolLM2-135M | Q8 | 138MB | 49K | ~300 | 0.3s | Unusable (garbage) | llama |
+| Qwen3.5-0.8B | Q4_K_M | 508MB | 248K | ~1.8 | ~33s | Broken (garbage) | qwen/deltanet |
+| Llama-3.2-1B | Q4_K_M | 770MB | 128K | ~2.3 | ~27s | Usable (artifacts) | llama |
+| **SmolLM2-1.7B** | **Q8** | **1.7GB** | **49K** | **~12.5** | **~5s** | **Good (clean)** | **llama** |
+| Gemma-4-E2B | Q4_K_M | 2.9GB | 262K | ~10 | ~5s | Broken (compat) | gemma4 hybrid |
+| Phi-3.5-mini | Q8 | 3.9GB | 32K | ~94* | ~0.85s* | Broken (0 self_attn) | phi3 |
+
+*\* Phi-3.5 속도는 attention이 작동하지 않아 실제 추론이 아님 (0.85초는 80토큰 생성 기준 측정값). 정상 지원 시 예상 속도.*
+
+### Key Insights
+
+1. **vocab size가 속도에 가장 큰 영향을 미침.** 파라미터 수보다 vocab size와 양자화 방식이 실사용 속도를 결정.
+   - SmolLM2-1.7B (vocab 49K): 12.5 tok/s
+   - Llama-3.2-1B (vocab 128K): 2.3 tok/s — 2.6x vocab → 5.4x 느림
+2. **Q8이 Q4보다 빠를 수 있음.** Q4의 디퀀타이즈 오버헤드가 Q8보다 크며, NEON SIMD에서 Q8이 더 효율적.
+3. **llama 아키텍처만 안정적으로 동작.** phi3, gemma4, qwen/deltanet 아키텍처는 로딩은 되지만 추론이 깨짐.
+4. **Phi-3.5가 지원되면 게임 체인저.** vocab 32K + 3.8B params로 "속도 + 품질" 최적 조합 가능.
+
+---
+
+## 4. 아키텍처 호환성 매트릭스 (신규)
+
+| Architecture | GGUF Load | Tokenizer | Attention | Inference | Status |
+|-------------|-----------|-----------|-----------|-----------|--------|
+| llama (SmolLM2, Llama) | OK | OK | OK | OK | **Fully supported** |
+| llama (Llama-3.2 GQA) | OK | OK | OK | Slow | Supported (vocab bottleneck) |
+| phi3 (Phi-3.5-mini) | OK | OK | **FAIL (0 self_attn)** | Garbage | **Not supported** |
+| gemma4 (Gemma-4-E2B) | OK | OK | Partial | Garbage | **Not supported** |
+| qwen/deltanet (Qwen3.5) | OK | OK | Unknown | Garbage | **Not supported** |
+
+**제안**: 이 매트릭스를 README 또는 docs에 포함하여 사용자가 모델 선택 전에 호환성을 확인할 수 있게 해주세요.
+
+---
+
+## 5. 제안 우선순위
+
+| Priority | Item | Impact | Effort |
+|----------|------|--------|--------|
+| **P0** | Phi-3 (`phi3`) 아키텍처 attention 매핑 지원 | 최적 모델 활용 가능 | Medium |
+| **P0** | chat template 토큰 자동 strip | 출력 품질 즉시 개선 | Low |
+| **P0** | 기본 추천 모델을 SmolLM2-1.7B로 변경 | 첫 경험 대폭 개선 | Low |
+| P1 | pip 패키지에 서버 바이너리 포함 | 설치 → 서버 기동 원스텝 | Medium |
+| P1 | 미지원 아키텍처 로딩 시 경고/에러 | 디버깅 시간 절약 | Low |
+| P1 | `self_attn=0` 감지 시 경고 메시지 | 호환성 문제 즉시 인지 | Low |
+| P2 | 서버 동시 요청 처리 (또는 큐잉) | 다중 사용자/연속 대화 | High |
+| P2 | 아키텍처 호환성 매트릭스 문서화 | 모델 선택 가이드 | Low |
+| P2 | 모델 크기 기반 CPU/GPU 자동 전환 | 최적 성능 자동 선택 | Medium |
+| P3 | `--device cpu/gpu` CLI 옵션 | 사용자 제어권 | Low |
+
+---
+
+## 6. 테스트 환경 상세
+
+```
+Hardware: Apple M3, 8-core CPU, 10-core GPU, 16GB Unified Memory
+OS: macOS 15 (Darwin 24.5.0)
+Python: 3.14.3
+Compiler: AppleClang 16.0.0
+Xcode: installed (Metal shader compilation enabled)
+quantcpp: v0.10.1 (pip) → v0.12.0 (pip) → latest main (source)
+Build: cmake -DTQ_BUILD_METAL=ON -DTQ_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release
+```
+
+---
+
+## 7. 테스트한 모델 파일 목록
+
+```
+~/.cache/quantcpp/smollm2-135m-instruct-q8_0.gguf          (138 MB)
+~/.cache/quantcpp/Qwen3.5-0.8B-Q4_K_M.gguf                 (508 MB)
+~/.cache/quantcpp/llama-3.2-1b-instruct-q4_k_m.gguf        (770 MB)
+~/.cache/quantcpp/Phi-3.5-mini-instruct-Q8_0.gguf          (3.9 GB) — NEW
+~/dev/projects/TurboQuant.cpp/models/SmolLM2-1.7B-Instruct-Q8_0.gguf  (1.7 GB)
+~/dev/projects/TurboQuant.cpp/models/gemma-4-E2B-it-Q4_K_M.gguf       (2.9 GB)
+```
+
+---
+
+*This feedback was generated based on a hands-on first-time user experience session on 2026-04-12.*
+*Updated with Phi-3.5-mini-instruct and Qwen3.5-0.8B architecture compatibility findings.*