Commit 8bc8b51 (parent 24c97fa)

fix: Qwen3 thinking model output leak — correct sampling, think filter, search thinking block

- Replace greedy decoding with Qwen3 model card sampling params (temp=0.6/top_p=0.95 for thinking)
- Filter thinking content in ai-worker via skip_special_tokens:false + state machine
- Increase thinking model token limit from 1024 to 4096
- Add I'll/I'm contraction patterns + trailing cleanup to cleanThinkingArtifacts
- Search results shown in collapsible thinking block before AI response
- Move changelog to changelogs/ directory

7 files changed · 248 additions, 52 deletions

ai-worker-common.js
Lines changed: 3 additions & 3 deletions

```diff
@@ -37,7 +37,7 @@ const SYSTEM_PROMPTS = {
   autocomplete:
     'You are a helpful writing assistant. Continue writing the text naturally. Only output the continuation, do not repeat the existing text. Write 1-2 sentences.',
   generate:
-    'You are a helpful content generation assistant. Generate content based on the user\'s request. Output in well-formatted markdown.',
+    'You are a helpful content generation assistant. Generate content based on the user\'s request. Output in well-formatted markdown. Do NOT use LaTeX $...$ or $$...$$ notation for math — use plain text or Unicode instead (e.g. write "x²" not "$x^2$"). Do NOT include any internal thinking, reasoning process, mental notes, or meta-commentary. Output ONLY the final answer.',
   markdown:
     'You are a markdown expert. Generate well-formatted markdown content based on the user\'s request. Use headings, lists, tables, code blocks, and other markdown features as appropriate.',
   explain:
@@ -52,8 +52,8 @@ const SYSTEM_PROMPTS = {
     'You are a helpful writing assistant. Elaborate on the following text by adding more details, examples, and explanations to make it more comprehensive. Output in markdown format.',
   shorten:
     'You are a concise writing editor. Shorten the following text while preserving all key information. Remove redundancy and use fewer words. Only output the shortened text.',
-  qa: 'You are a helpful assistant. Answer the user\'s question based on the provided document context. Be concise. If the answer cannot be found in the context, say so.',
-  chat: 'You are a helpful AI assistant integrated into a Markdown editor. Help the user with writing, editing, and formatting tasks. Be concise. Output in markdown format.',
+  qa: 'You are a helpful assistant. The user may have document context open in their editor. If the question relates to the provided context, use it to answer. If the question is unrelated to the context, answer directly from your knowledge. Be concise. Do NOT use LaTeX $...$ or $$...$$ notation — use plain text or Unicode for math. Do NOT include any internal reasoning, thinking process, or meta-commentary. Output in markdown format.',
+  chat: 'You are a helpful AI assistant integrated into a Markdown editor. Help the user with writing, editing, and formatting tasks. Be concise. Output in markdown format. Do NOT use LaTeX $...$ or $$...$$ notation for math — use plain text or Unicode instead. Do NOT include any internal thinking, reasoning steps, drafting notes, or meta-commentary. Output ONLY the final polished answer.',
 };

 /**
```

ai-worker-gemini.js
Lines changed: 3 additions & 3 deletions

```diff
@@ -146,16 +146,16 @@ function buildMessages(taskType, context, userPrompt) {
   rephrase: 'You are a helpful writing assistant. Rephrase the following text to improve clarity and readability while preserving the meaning. Output in markdown format.',
   grammar: 'You are a helpful writing assistant. Fix any grammar, spelling, and punctuation errors in the following text. Only output the corrected text, nothing else.',
   autocomplete: 'You are a helpful writing assistant. Continue writing the text naturally. Only output the continuation, do not repeat the existing text. Write 1-2 sentences.',
-  generate: 'You are a helpful content generation assistant. Generate content based on the user\'s request. Output in well-formatted markdown.',
+  generate: 'You are a helpful content generation assistant. Generate content based on the user\'s request. Output in well-formatted markdown. Do NOT use LaTeX $...$ or $$...$$ notation for math — use plain text or Unicode instead (e.g. write "x²" not "$x^2$"). Do NOT include any internal thinking, reasoning process, mental notes, or meta-commentary. Output ONLY the final answer.',
   markdown: 'You are a markdown expert. Generate well-formatted markdown content based on the user\'s request. Use headings, lists, tables, code blocks, and other markdown features as appropriate.',
   explain: 'You are a helpful assistant. Explain the following text in simple, easy-to-understand terms. Be concise. Output in markdown format.',
   simplify: 'You are a helpful writing assistant. Simplify the following text to make it easier to understand. Use shorter sentences and simpler words. Output in markdown format.',
   polish: 'You are a skilled writing editor. Polish the following text to improve flow, word choice, and overall quality while preserving the meaning and tone. Only output the polished text.',
   formalize: 'You are a professional writing assistant. Rewrite the following text in a more formal, professional tone suitable for business or academic contexts. Only output the formalized text.',
   elaborate: 'You are a helpful writing assistant. Elaborate on the following text by adding more details, examples, and explanations to make it more comprehensive. Output in markdown format.',
   shorten: 'You are a concise writing editor. Shorten the following text while preserving all key information. Remove redundancy and use fewer words. Only output the shortened text.',
-  qa: 'You are a helpful assistant. Answer the user\'s question based on the provided document context. Be concise. If the answer cannot be found in the context, say so.',
-  chat: 'You are a helpful AI assistant integrated into a Markdown editor. Help the user with writing, editing, and formatting tasks. Be concise. Output in markdown format.',
+  qa: 'You are a helpful assistant. The user may have document context open in their editor. If the question relates to the provided context, use it to answer. If the question is unrelated to the context, answer directly from your knowledge. Be concise. Do NOT use LaTeX $...$ or $$...$$ notation — use plain text or Unicode for math. Do NOT include any internal reasoning, thinking process, or meta-commentary. Output in markdown format.',
+  chat: 'You are a helpful AI assistant integrated into a Markdown editor. Help the user with writing, editing, and formatting tasks. Be concise. Output in markdown format. Do NOT use LaTeX $...$ or $$...$$ notation for math — use plain text or Unicode instead. Do NOT include any internal thinking, reasoning steps, drafting notes, or meta-commentary. Output ONLY the final polished answer.',
 };
 const systemMessage = systemPrompts[taskType] || systemPrompts.chat;
 const messages = [{ role: 'system', content: systemMessage }];
```

ai-worker.js
Lines changed: 112 additions & 22 deletions

```diff
@@ -259,27 +259,62 @@ async function generate(taskType, context, userPrompt, messageId, enableThinking
   // Process text + image together
   const inputs = await processor(prompt, rawImage);

-  // Collect streamed text
+  // Collect streamed text — filter thinking content
+  // Use skip_special_tokens:false when thinking is on so we see <think>/</think> markers
   let fullText = '';
+  let inThinkingPhase = !!enableThinking;
+  let thinkingBuffer = '';
   const streamer = new TextStreamer(processor.tokenizer, {
     skip_prompt: true,
-    skip_special_tokens: true,
+    skip_special_tokens: !enableThinking,
     callback_function: (token) => {
-      fullText += token;
+      if (!enableThinking) {
+        fullText += token;
+        self.postMessage({ type: "token", token, messageId });
+        return;
+      }
+      if (inThinkingPhase) {
+        thinkingBuffer += token;
+        if (thinkingBuffer.includes('</think>')) {
+          inThinkingPhase = false;
+          const afterThink = thinkingBuffer.substring(
+            thinkingBuffer.indexOf('</think>') + '</think>'.length
+          );
+          const cleaned = afterThink.replace(/<\|[^|]*\|>/g, '').replace(/<\/?(?:think|thinking|thought)>/gi, '');
+          if (cleaned.trim()) {
+            fullText += cleaned;
+            self.postMessage({ type: "token", token: cleaned, messageId });
+          }
+        }
+        return;
+      }
+      const cleaned = token.replace(/<\|[^|]*\|>/g, '').replace(/<\/?(?:think|thinking|thought)>/gi, '');
+      if (cleaned) {
+        fullText += cleaned;
+        self.postMessage({ type: "token", token: cleaned, messageId });
+      }
     },
   });

-  // Generate
-  await model.generate({
-    ...inputs,
-    do_sample: true,
-    max_new_tokens: maxTokens,
-    streamer,
-  });
+  // Generate — Qwen3 model card: use sampling, NOT greedy, for thinking mode
+  const genConfig = enableThinking
+    ? { do_sample: true, temperature: 0.6, top_p: 0.95, top_k: 20, max_new_tokens: Math.max(maxTokens, 4096) }
+    : { do_sample: true, temperature: 0.7, top_p: 0.8, top_k: 20, max_new_tokens: maxTokens };
+  await model.generate({ ...inputs, ...genConfig, streamer });
+
+  // Final cleanup — strip any remaining think tags or special tokens
+  let cleanedText = fullText.trim();
+  cleanedText = cleanedText.replace(/<(?:think|thinking|thought)>[\s\S]*?<\/(?:think|thinking|thought)>/gi, '');
+  cleanedText = cleanedText.replace(/<(?:think|thinking|thought)>[\s\S]*$/gi, '');
+  const closeMatch = cleanedText.match(/<\/(?:think|thinking|thought)>/i);
+  if (closeMatch) {
+    cleanedText = cleanedText.substring(cleanedText.indexOf(closeMatch[0]) + closeMatch[0].length);
+  }
+  cleanedText = cleanedText.replace(/<\|[^|]*\|>/g, '').trim();

   self.postMessage({
     type: "complete",
-    text: fullText.trim(),
+    text: cleanedText.trim(),
     messageId,
   });
 } else {
@@ -294,27 +329,82 @@ async function generate(taskType, context, userPrompt, messageId, enableThinking
     return_tensors: "pt",
   });

-  // Collect streamed text
+  // --- Thinking-aware streaming ---
+  // When enableThinking is on, the model generates:
+  //   <think>...thinking content...</think>\n\nactual response
+  //
+  // Problem: skip_special_tokens:true strips <think> and </think> markers,
+  // making it impossible to detect where thinking ends.
+  // Solution: use skip_special_tokens:false so we see the markers,
+  // then manually filter thinking content and strip special tokens.
   let fullText = "";
+  let inThinkingPhase = !!enableThinking;
+  let thinkingBuffer = ""; // buffer thinking content (not forwarded)
+
   const streamer = new TextStreamer(processor.tokenizer, {
     skip_prompt: true,
-    skip_special_tokens: true,
+    skip_special_tokens: !enableThinking, // false when thinking, so we see markers
     callback_function: (token) => {
-      fullText += token;
+      if (!enableThinking) {
+        // Normal mode: forward everything
+        fullText += token;
+        self.postMessage({ type: "token", token, messageId });
+        return;
+      }
+
+      // Thinking mode: track <think>...</think> boundary
+      if (inThinkingPhase) {
+        thinkingBuffer += token;
+        // Check if we've seen the </think> closing marker
+        if (thinkingBuffer.includes('</think>')) {
+          inThinkingPhase = false;
+          // Extract anything after </think> (there might be content)
+          const afterThink = thinkingBuffer.substring(
+            thinkingBuffer.indexOf('</think>') + '</think>'.length
+          );
+          // Clean special tokens from the after-think content
+          const cleaned = afterThink
+            .replace(/<\|[^|]*\|>/g, '') // strip <|im_start|>, <|im_end|>, etc.
+            .replace(/<\/?(?:think|thinking|thought)>/gi, '');
+          if (cleaned.trim()) {
+            fullText += cleaned;
+            self.postMessage({ type: "token", token: cleaned, messageId });
+          }
+        }
+        return; // don't forward thinking tokens
+      }
+
+      // Post-thinking: forward real content, strip any special tokens
+      const cleaned = token
+        .replace(/<\|[^|]*\|>/g, '')
+        .replace(/<\/?(?:think|thinking|thought)>/gi, '');
+      if (cleaned) {
+        fullText += cleaned;
+        self.postMessage({ type: "token", token: cleaned, messageId });
+      }
     },
   });

-  // Generate
-  await model.generate({
-    ...inputs,
-    do_sample: false,
-    max_new_tokens: maxTokens,
-    streamer,
-  });
+  // Generate — Qwen3 model card: use sampling, NOT greedy, for thinking mode
+  // Thinking: temp=0.6, top_p=0.95, top_k=20 | Non-thinking: temp=0.7, top_p=0.8, top_k=20
+  const genConfig = enableThinking
+    ? { do_sample: true, temperature: 0.6, top_p: 0.95, top_k: 20, max_new_tokens: Math.max(maxTokens, 4096) }
+    : { do_sample: true, temperature: 0.7, top_p: 0.8, top_k: 20, max_new_tokens: maxTokens };
+  await model.generate({ ...inputs, ...genConfig, streamer });
+
+  // Final cleanup: strip any remaining think tags or reasoning artifacts
+  let cleanedText = fullText.trim();
+  cleanedText = cleanedText.replace(/<(?:think|thinking|thought)>[\s\S]*?<\/(?:think|thinking|thought)>/gi, '');
+  cleanedText = cleanedText.replace(/<(?:think|thinking|thought)>[\s\S]*$/gi, '');
+  const closeMatch = cleanedText.match(/<\/(?:think|thinking|thought)>/i);
+  if (closeMatch) {
+    cleanedText = cleanedText.substring(cleanedText.indexOf(closeMatch[0]) + closeMatch[0].length);
+  }
+  cleanedText = cleanedText.replace(/<\|[^|]*\|>/g, '').trim();

   self.postMessage({
     type: "complete",
-    text: fullText.trim(),
+    text: cleanedText.trim(),
     messageId,
   });
 }
```
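The streamer callback in the diff above mixes the filtering logic with worker messaging, which makes it hard to test in isolation. The following is a minimal standalone sketch of the same state machine, extracted for illustration (the factory name `createThinkFilter` is ours, not part of the commit): it suppresses tokens until `</think>` appears, forwards the remainder of the boundary-crossing token, and strips special tokens from everything after.

```javascript
// Standalone sketch of the worker's thinking-filter state machine.
// Returns a function mapping a streamed token to the text that should
// be forwarded to the UI ('' while still inside the <think> block).
function createThinkFilter(enableThinking) {
  let inThinkingPhase = !!enableThinking;
  let thinkingBuffer = '';
  const stripSpecial = (s) =>
    s.replace(/<\|[^|]*\|>/g, '')                     // <|im_start|>, <|im_end|>, ...
     .replace(/<\/?(?:think|thinking|thought)>/gi, ''); // leftover think tags

  return function filter(token) {
    if (!enableThinking) return token; // normal mode: pass through
    if (inThinkingPhase) {
      thinkingBuffer += token;
      if (!thinkingBuffer.includes('</think>')) return ''; // still thinking
      inThinkingPhase = false;
      // The token that closes </think> may also carry the first answer text.
      const afterThink = thinkingBuffer.slice(
        thinkingBuffer.indexOf('</think>') + '</think>'.length
      );
      const cleaned = stripSpecial(afterThink);
      return cleaned.trim() ? cleaned : '';
    }
    return stripSpecial(token); // post-thinking: forward, minus special tokens
  };
}
```

Note the `</think>` marker can be split across tokens, which is why the buffer accumulates rather than checking each token independently.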

CHANGELOG-search-thinking-block.md renamed to changelogs/CHANGELOG-search-thinking-block.md
Lines changed: 19 additions & 2 deletions

```diff
@@ -38,12 +38,29 @@ Refactors the AI chat search flow to show web search results in a collapsible "t
 **What:** Added `.ai-thinking-block` container with green-accented border and fade-in animation, `.ai-thinking-spin` rotation keyframe for the search spinner, `.ai-thinking-searching` for the loading state, and `.ai-thinking-no-results` for the empty state with amber info icon. Dark mode variants included.
 **Impact:** Consistent, polished visual treatment matching the existing AI panel design.

+## 5. Qwen3 Thinking Model — Correct Sampling Parameters
+**Files:** `ai-worker.js`
+**What:** Replaced greedy decoding (`do_sample: false`) with sampling using Qwen3 model card recommended parameters: `temperature=0.6, top_p=0.95, top_k=20` for thinking mode and `temperature=0.7, top_p=0.8, top_k=20` for non-thinking mode. Greedy decoding causes "performance degradation and endless repetitions" per Qwen3 docs. Increased max tokens from 1024 to 4096 for thinking mode.
+**Impact:** The thinking model no longer gets stuck in an infinite thinking loop and actually produces the answer.
+
+## 6. Thinking Content Filter — Worker-level `<think>` Tag Stripping
+**Files:** `ai-worker.js`
+**What:** When `enableThinking` is true, set `skip_special_tokens: false` so `<think>`/`</think>` markers remain visible in the TextStreamer callback. Added a state machine that buffers thinking tokens and only forwards content after `</think>`. Strips leftover special tokens (`<|im_start|>`, etc.) from forwarded content. Applied to both the text-only and vision generation paths.
+**Impact:** Raw thinking content (planning bullets, reasoning monologue) no longer leaks into the chat response.
+
+## 7. Improved `cleanThinkingArtifacts` — Contraction Patterns & Trailing Cleanup
+**Files:** `js/ai-chat.js`
+**What:** Added `I'll/I'm/I've/I'd` contraction patterns to the reasoning detector (previously it only matched `I 'll` with a space). Added trailing cleanup that strips planning outlines (`1. What the Black-Scholes equation is...`) and bare numbered items (`4.`) from the end of responses.
+**Impact:** Catches residual reasoning that appears after `</think>` in the model's actual response content.
+
 ---

-## Files Changed (3 total)
+## Files Changed (5 total)

 | File | Lines Changed | Type |
 |------|:---:|------|
-| `js/ai-chat.js` | +217 −48 | Two-phase thinking block, removed inline search duplication |
+| `js/ai-chat.js` | +217 −48 | Two-phase thinking block, reasoning cleanup, removed inline search duplication |
 | `js/ai-assistant.js` | +4 −3 | Fixed user message dedup check |
 | `css/ai-panel.css` | +194 −0 | Thinking block styles, spinner, no-results state |
+| `ai-worker.js` | ~+80 −30 | Correct sampling params, thinking content filter |
+| `ai-worker-common.js` | modified | Supporting worker changes |
```
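The `js/ai-chat.js` changes described in item 7 are not included in this page's diffs, so the following is only a hypothetical sketch of the kind of contraction-aware reasoning detector the changelog describes. The regex, the function name `looksLikeLeakedReasoning`, and the chosen opener phrases are all assumptions for illustration, not the actual implementation.

```javascript
// Hypothetical sketch: detect lines that read like leaked chain-of-thought.
// Matches first-person planning openers, including contractions such as
// I'll / I'm / I've / I'd (the bug fixed in item 7 was missing these).
const reasoningOpeners = /^(?:I(?:'ll|'m|'ve|'d)?|Let me|First,|Okay,)\b/;

function looksLikeLeakedReasoning(line) {
  return reasoningOpeners.test(line.trim());
}
```

A real implementation would pair a detector like this with the trailing cleanup (stripping leftover planning outlines and bare numbered items) that the changelog also mentions.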

js/ai-docgen-generate.js
Lines changed: 4 additions & 1 deletion

```diff
@@ -214,7 +214,10 @@
 function cleanGeneratedOutput(text) {
   if (!text) return text;

-  text = text.replace(/<thinking>[\s\S]*?<\/thinking>/gi, '');
+  text = text.replace(/<(?:think|thinking|thought)>[\s\S]*?<\/(?:think|thinking|thought)>/gi, '');
+  text = text.replace(/<(?:think|thinking|thought)>[\s\S]*$/gi, '');
+  var closeMatch = text.match(/<\/(?:think|thinking|thought)>/i);
+  if (closeMatch) { text = text.substring(text.indexOf(closeMatch[0]) + closeMatch[0].length); }

   var thinkingPatterns = [
     /^[\s\S]*?Thinking Process:[\s\S]*?(?=^#|\n#)/m,
```
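The three replace steps above handle three distinct leak shapes: a fully closed `<think>...</think>` block, an opening tag that was never closed (generation cut off mid-thought), and an orphan closing tag whose opener was already consumed by the streamer. A standalone sketch of just that stripping logic (the function name `stripThinkTags` is ours, for illustration):

```javascript
// Sketch of the tag-stripping steps from the diff above, shown in isolation.
function stripThinkTags(text) {
  if (!text) return text;
  // 1. Remove complete <think>...</think> blocks (and thinking/thought aliases).
  text = text.replace(/<(?:think|thinking|thought)>[\s\S]*?<\/(?:think|thinking|thought)>/gi, '');
  // 2. Remove an unclosed opening tag and everything after it.
  text = text.replace(/<(?:think|thinking|thought)>[\s\S]*$/gi, '');
  // 3. If only a closing tag leaked through, keep what follows it.
  var closeMatch = text.match(/<\/(?:think|thinking|thought)>/i);
  if (closeMatch) {
    text = text.substring(text.indexOf(closeMatch[0]) + closeMatch[0].length);
  }
  return text;
}
```

Step order matters: the lazy pair-match must run before the unclosed-tag sweep, or a closed block's opener would greedily consume the real answer after it.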
