Skip to content

Commit 3a26dad

Browse files
committed
fix(chunkers): prevent content loss in word boundary splitting
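When `splitAtWordBoundaries` snaps `end` back to a word boundary in non-overlapping mode (no `stepChars` given), the next chunk now starts at `end` instead of `pos + step`, so the text between the snapped `end` and `pos + step` is no longer skipped. Step-based advancement is preserved for the sliding-window case (`TokenChunker`), which is selected when `stepChars` is provided.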
1 parent 4c3508b commit 3a26dad

File tree

1 file changed

+10
-4
lines changed

1 file changed

+10
-4
lines changed

apps/sim/lib/chunkers/utils.ts

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ export function splitAtWordBoundaries(
6060
chunkSizeChars: number,
6161
stepChars?: number
6262
): string[] {
63-
const step = Math.max(1, stepChars ?? chunkSizeChars)
6463
const parts: string[] = []
6564
let pos = 0
6665

@@ -79,9 +78,16 @@ export function splitAtWordBoundaries(
7978
parts.push(part)
8079
}
8180

82-
const nextPos = pos + step
83-
if (nextPos >= text.length) break
84-
pos = nextPos
81+
if (stepChars !== undefined) {
82+
// Sliding window: advance by step for predictable overlap
83+
const nextPos = pos + Math.max(1, stepChars)
84+
if (nextPos >= text.length) break
85+
pos = nextPos
86+
} else {
87+
// Non-overlapping: advance from end of extracted content
88+
if (end >= text.length) break
89+
pos = end
90+
}
8591
while (pos < text.length && text[pos] === ' ') pos++
8692
}
8793

0 commit comments

Comments
 (0)