Skip to content

Commit 211fe90

Browse files
committed
fix(chunkers): fix remaining audit issues across all chunkers
- DocsChunker: extract headers from cleaned content (not raw markdown) to fix position mismatch between header positions and chunk positions
- DocsChunker: strip export statements and JSX expressions in cleanContent
- DocsChunker: fix table merge dedup using equality instead of includes
- JsonYamlChunker: preserve path breadcrumbs when nested value fits in one chunk, matching LangChain RecursiveJsonSplitter behavior
- StructuredDataChunker: detect 2-column CSV (lowered threshold from >2 to >=1) and use 20% relative tolerance instead of absolute +/-2
- TokenChunker: use sliding window overlap (matching LangChain/Chonkie) where chunks stay within chunkSize instead of exceeding it
- utils: splitAtWordBoundaries accepts optional stepChars for sliding window overlap; addOverlap uses newline join instead of space
1 parent 25abb8a commit 211fe90

File tree

5 files changed

+41
-25
lines changed

5 files changed

+41
-25
lines changed

apps/sim/lib/chunkers/docs-chunker.ts

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,11 @@ export class DocsChunker {
7676

7777
const { data: frontmatter, content: markdownContent } = this.parseFrontmatter(content)
7878

79-
const headers = this.extractHeaders(markdownContent)
80-
8179
const documentUrl = this.generateDocumentUrl(relativePath)
8280

83-
const textChunks = await this.splitContent(markdownContent)
81+
const { chunks: textChunks, cleanedContent } = await this.splitContent(markdownContent)
82+
83+
const headers = this.extractHeaders(cleanedContent)
8484

8585
logger.info(`Generating embeddings for ${textChunks.length} chunks in ${relativePath}`)
8686
const embeddings: number[][] =
@@ -214,9 +214,11 @@ export class DocsChunker {
214214
}
215215

216216
/**
217-
* Split content into chunks using the existing TextChunker with table awareness
217+
* Split content into chunks using the existing TextChunker with table awareness.
218+
* Returns both the chunks and the cleaned content so header extraction
219+
* operates on the same text that was chunked (aligned positions).
218220
*/
219-
private async splitContent(content: string): Promise<string[]> {
221+
private async splitContent(content: string): Promise<{ chunks: string[]; cleanedContent: string }> {
220222
const cleanedContent = this.cleanContent(content)
221223

222224
const tableBoundaries = this.detectTableBoundaries(cleanedContent)
@@ -231,7 +233,7 @@ export class DocsChunker {
231233

232234
const finalChunks = this.enforceSizeLimit(processedChunks)
233235

234-
return finalChunks
236+
return { chunks: finalChunks, cleanedContent }
235237
}
236238

237239
/**
@@ -243,8 +245,10 @@ export class DocsChunker {
243245
.replace(/\r\n/g, '\n')
244246
.replace(/\r/g, '\n')
245247
.replace(/^import\s+.*$/gm, '')
246-
.replace(/<[^>]+>/g, ' ')
248+
.replace(/^export\s+.*$/gm, '')
249+
.replace(/<\/?[a-zA-Z][^>]*>/g, ' ')
247250
.replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
251+
.replace(/\{[^{}]*\}/g, ' ')
248252
.replace(/\n{3,}/g, '\n\n')
249253
.replace(/[ \t]{2,}/g, ' ')
250254
.trim()
@@ -368,7 +372,7 @@ export class DocsChunker {
368372
const maxEnd = Math.max(chunkEnd, ...affectedTables.map((t) => t.end))
369373
const completeChunk = originalContent.slice(minStart, maxEnd).trim()
370374

371-
if (completeChunk && !mergedChunks.some((existing) => existing.includes(completeChunk))) {
375+
if (completeChunk && !mergedChunks.some((existing) => existing === completeChunk)) {
372376
mergedChunks.push(completeChunk)
373377
}
374378
} else {

apps/sim/lib/chunkers/json-yaml-chunker.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,10 +150,12 @@ export class JsonYamlChunker {
150150
const fullTokens = estimateTokens(fullContent)
151151

152152
if (fullTokens <= this.chunkSize) {
153+
const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : ''
154+
const text = contextHeader + fullContent
153155
return [{
154-
text: fullContent,
155-
tokenCount: fullTokens,
156-
metadata: { startIndex: 0, endIndex: fullContent.length },
156+
text,
157+
tokenCount: estimateTokens(text),
158+
metadata: { startIndex: 0, endIndex: text.length },
157159
}]
158160
}
159161

apps/sim/lib/chunkers/structured-data-chunker.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,8 @@ export class StructuredDataChunker {
184184
)
185185
const avgCount = counts.reduce((a, b) => a + b, 0) / counts.length
186186

187-
if (avgCount > 2 && counts.every((c) => Math.abs(c - avgCount) <= 2)) {
187+
const tolerance = Math.max(1, Math.ceil(avgCount * 0.2))
188+
if (avgCount >= 1 && counts.every((c) => Math.abs(c - avgCount) <= tolerance)) {
188189
return true
189190
}
190191
}

apps/sim/lib/chunkers/token-chunker.ts

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import { createLogger } from '@sim/logger'
22
import type { Chunk, ChunkerOptions } from '@/lib/chunkers/types'
33
import {
4-
addOverlap,
54
buildChunks,
65
cleanText,
76
estimateTokens,
@@ -15,7 +14,8 @@ const logger = createLogger('TokenChunker')
1514
/**
1615
* Fixed-size token chunker
1716
* Splits text into chunks of a fixed token size with configurable overlap.
18-
* Snaps boundaries to word boundaries for cleaner output.
17+
* Uses a sliding window approach (matching LangChain/Chonkie) where chunks
18+
* stay within the size limit. The window advances by chunkSize - overlap.
1919
*/
2020
export class TokenChunker {
2121
private readonly chunkSize: number
@@ -42,19 +42,17 @@ export class TokenChunker {
4242
}
4343

4444
const chunkSizeChars = tokensToChars(this.chunkSize)
45-
const rawChunks = splitAtWordBoundaries(cleaned, chunkSizeChars)
45+
const overlapChars = tokensToChars(this.chunkOverlap)
46+
const stepChars = this.chunkOverlap > 0 ? chunkSizeChars - overlapChars : undefined
47+
48+
const rawChunks = splitAtWordBoundaries(cleaned, chunkSizeChars, stepChars)
4649

4750
const filtered =
4851
rawChunks.length > 1
4952
? rawChunks.filter((c) => c.length >= this.minCharactersPerChunk)
5053
: rawChunks
5154

52-
let chunks = filtered.length > 0 ? filtered : rawChunks
53-
54-
if (this.chunkOverlap > 0) {
55-
const overlapChars = tokensToChars(this.chunkOverlap)
56-
chunks = addOverlap(chunks, overlapChars)
57-
}
55+
const chunks = filtered.length > 0 ? filtered : rawChunks
5856

5957
logger.info(`Chunked into ${chunks.length} token-based chunks`)
6058
return buildChunks(chunks, this.chunkOverlap)

apps/sim/lib/chunkers/utils.ts

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ export function addOverlap(chunks: string[], overlapChars: number): string[] {
5454
: overlapText
5555

5656
if (cleanOverlap.trim()) {
57-
chunk = `${cleanOverlap.trim()} ${chunk}`
57+
chunk = `${cleanOverlap.trim()}\n${chunk}`
5858
}
5959
}
6060

@@ -65,9 +65,17 @@ export function addOverlap(chunks: string[], overlapChars: number): string[] {
6565
}
6666

6767
/**
68-
* Split text at word boundaries into segments of approximately chunkSizeChars
68+
* Split text at word boundaries into segments of approximately chunkSizeChars.
69+
* When stepChars is provided (< chunkSizeChars), produces overlapping chunks
70+
* using a sliding window, matching LangChain/Chonkie behavior where
71+
* chunks stay within the size limit.
6972
*/
70-
export function splitAtWordBoundaries(text: string, chunkSizeChars: number): string[] {
73+
export function splitAtWordBoundaries(
74+
text: string,
75+
chunkSizeChars: number,
76+
stepChars?: number
77+
): string[] {
78+
const step = stepChars ?? chunkSizeChars
7179
const parts: string[] = []
7280
let pos = 0
7381

@@ -85,7 +93,10 @@ export function splitAtWordBoundaries(text: string, chunkSizeChars: number): str
8593
if (part) {
8694
parts.push(part)
8795
}
88-
pos = end
96+
97+
const nextPos = pos + step
98+
if (nextPos >= text.length) break
99+
pos = nextPos
89100
while (pos < text.length && text[pos] === ' ') pos++
90101
}
91102

0 commit comments

Comments (0)