Skip to content

Commit 211fe90

Browse files
committed
fix(chunkers): fix remaining audit issues across all chunkers
- DocsChunker: extract headers from cleaned content (not raw markdown) to fix position mismatch between header positions and chunk positions
- DocsChunker: strip export statements and JSX expressions in cleanContent
- DocsChunker: fix table merge dedup using equality instead of includes
- JsonYamlChunker: preserve path breadcrumbs when nested value fits in one chunk, matching LangChain RecursiveJsonSplitter behavior
- StructuredDataChunker: detect 2-column CSV (lowered threshold from >2 to >=1) and use 20% relative tolerance instead of absolute +/-2
- TokenChunker: use sliding window overlap (matching LangChain/Chonkie) where chunks stay within chunkSize instead of exceeding it
- utils: splitAtWordBoundaries accepts optional stepChars for sliding window overlap; addOverlap uses newline join instead of space
1 parent 25abb8a commit 211fe90

File tree

5 files changed

+41
-25
lines changed

5 files changed

+41
-25
lines changed

apps/sim/lib/chunkers/docs-chunker.ts

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,11 @@ export class DocsChunker {
7676

7777
const { data: frontmatter, content: markdownContent } = this.parseFrontmatter(content)
7878

79-
const headers = this.extractHeaders(markdownContent)
80-
8179
const documentUrl = this.generateDocumentUrl(relativePath)
8280

83-
const textChunks = await this.splitContent(markdownContent)
81+
const { chunks: textChunks, cleanedContent } = await this.splitContent(markdownContent)
82+
83+
const headers = this.extractHeaders(cleanedContent)
8484

8585
logger.info(`Generating embeddings for ${textChunks.length} chunks in ${relativePath}`)
8686
const embeddings: number[][] =
@@ -214,9 +214,11 @@ export class DocsChunker {
214214
}
215215

216216
/**
217-
* Split content into chunks using the existing TextChunker with table awareness
217+
* Split content into chunks using the existing TextChunker with table awareness.
218+
* Returns both the chunks and the cleaned content so header extraction
219+
* operates on the same text that was chunked (aligned positions).
218220
*/
219-
private async splitContent(content: string): Promise<string[]> {
221+
private async splitContent(content: string): Promise<{ chunks: string[]; cleanedContent: string }> {
220222
const cleanedContent = this.cleanContent(content)
221223

222224
const tableBoundaries = this.detectTableBoundaries(cleanedContent)
@@ -231,7 +233,7 @@ export class DocsChunker {
231233

232234
const finalChunks = this.enforceSizeLimit(processedChunks)
233235

234-
return finalChunks
236+
return { chunks: finalChunks, cleanedContent }
235237
}
236238

237239
/**
@@ -243,8 +245,10 @@ export class DocsChunker {
243245
.replace(/\r\n/g, '\n')
244246
.replace(/\r/g, '\n')
245247
.replace(/^import\s+.*$/gm, '')
246-
.replace(/<[^>]+>/g, ' ')
248+
.replace(/^export\s+.*$/gm, '')
249+
.replace(/<\/?[a-zA-Z][^>]*>/g, ' ')
247250
.replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
251+
.replace(/\{[^{}]*\}/g, ' ')
248252
.replace(/\n{3,}/g, '\n\n')
249253
.replace(/[ \t]{2,}/g, ' ')
250254
.trim()
@@ -368,7 +372,7 @@ export class DocsChunker {
368372
const maxEnd = Math.max(chunkEnd, ...affectedTables.map((t) => t.end))
369373
const completeChunk = originalContent.slice(minStart, maxEnd).trim()
370374

371-
if (completeChunk && !mergedChunks.some((existing) => existing.includes(completeChunk))) {
375+
if (completeChunk && !mergedChunks.some((existing) => existing === completeChunk)) {
372376
mergedChunks.push(completeChunk)
373377
}
374378
} else {

apps/sim/lib/chunkers/json-yaml-chunker.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,10 +150,12 @@ export class JsonYamlChunker {
150150
const fullTokens = estimateTokens(fullContent)
151151

152152
if (fullTokens <= this.chunkSize) {
153+
const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : ''
154+
const text = contextHeader + fullContent
153155
return [{
154-
text: fullContent,
155-
tokenCount: fullTokens,
156-
metadata: { startIndex: 0, endIndex: fullContent.length },
156+
text,
157+
tokenCount: estimateTokens(text),
158+
metadata: { startIndex: 0, endIndex: text.length },
157159
}]
158160
}
159161

apps/sim/lib/chunkers/structured-data-chunker.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,8 @@ export class StructuredDataChunker {
184184
)
185185
const avgCount = counts.reduce((a, b) => a + b, 0) / counts.length
186186

187-
if (avgCount > 2 && counts.every((c) => Math.abs(c - avgCount) <= 2)) {
187+
const tolerance = Math.max(1, Math.ceil(avgCount * 0.2))
188+
if (avgCount >= 1 && counts.every((c) => Math.abs(c - avgCount) <= tolerance)) {
188189
return true
189190
}
190191
}

apps/sim/lib/chunkers/token-chunker.ts

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import { createLogger } from '@sim/logger'
22
import type { Chunk, ChunkerOptions } from '@/lib/chunkers/types'
33
import {
4-
addOverlap,
54
buildChunks,
65
cleanText,
76
estimateTokens,
@@ -15,7 +14,8 @@ const logger = createLogger('TokenChunker')
1514
/**
1615
* Fixed-size token chunker
1716
* Splits text into chunks of a fixed token size with configurable overlap.
18-
* Snaps boundaries to word boundaries for cleaner output.
17+
* Uses a sliding window approach (matching LangChain/Chonkie) where chunks
18+
* stay within the size limit. The window advances by chunkSize - overlap.
1919
*/
2020
export class TokenChunker {
2121
private readonly chunkSize: number
@@ -42,19 +42,17 @@ export class TokenChunker {
4242
}
4343

4444
const chunkSizeChars = tokensToChars(this.chunkSize)
45-
const rawChunks = splitAtWordBoundaries(cleaned, chunkSizeChars)
45+
const overlapChars = tokensToChars(this.chunkOverlap)
46+
const stepChars = this.chunkOverlap > 0 ? chunkSizeChars - overlapChars : undefined
47+
48+
const rawChunks = splitAtWordBoundaries(cleaned, chunkSizeChars, stepChars)
4649

4750
const filtered =
4851
rawChunks.length > 1
4952
? rawChunks.filter((c) => c.length >= this.minCharactersPerChunk)
5053
: rawChunks
5154

52-
let chunks = filtered.length > 0 ? filtered : rawChunks
53-
54-
if (this.chunkOverlap > 0) {
55-
const overlapChars = tokensToChars(this.chunkOverlap)
56-
chunks = addOverlap(chunks, overlapChars)
57-
}
55+
const chunks = filtered.length > 0 ? filtered : rawChunks
5856

5957
logger.info(`Chunked into ${chunks.length} token-based chunks`)
6058
return buildChunks(chunks, this.chunkOverlap)

apps/sim/lib/chunkers/utils.ts

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ export function addOverlap(chunks: string[], overlapChars: number): string[] {
5454
: overlapText
5555

5656
if (cleanOverlap.trim()) {
57-
chunk = `${cleanOverlap.trim()} ${chunk}`
57+
chunk = `${cleanOverlap.trim()}\n${chunk}`
5858
}
5959
}
6060

@@ -65,9 +65,17 @@ export function addOverlap(chunks: string[], overlapChars: number): string[] {
6565
}
6666

6767
/**
68-
* Split text at word boundaries into segments of approximately chunkSizeChars
68+
* Split text at word boundaries into segments of approximately chunkSizeChars.
69+
* When stepChars is provided (< chunkSizeChars), produces overlapping chunks
70+
* using a sliding window, matching LangChain/Chonkie behavior where
71+
* chunks stay within the size limit.
6972
*/
70-
export function splitAtWordBoundaries(text: string, chunkSizeChars: number): string[] {
73+
export function splitAtWordBoundaries(
74+
text: string,
75+
chunkSizeChars: number,
76+
stepChars?: number
77+
): string[] {
78+
const step = stepChars ?? chunkSizeChars
7179
const parts: string[] = []
7280
let pos = 0
7381

@@ -85,7 +93,10 @@ export function splitAtWordBoundaries(text: string, chunkSizeChars: number): str
8593
if (part) {
8694
parts.push(part)
8795
}
88-
pos = end
96+
97+
const nextPos = pos + step
98+
if (nextPos >= text.length) break
99+
pos = nextPos
89100
while (pos < text.length && text[pos] === ' ') pos++
90101
}
91102

0 commit comments

Comments (0)