fix(chunkers): prevent content loss in word boundary splitting

waleedlatif1 · waleedlatif1 · commit 3a26dad205e1 · 2026-04-10T19:00:50.000-07:00
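When splitAtWordBoundaries snaps end back to a word boundary, the chunk
can end before pos + step. The old code still advanced pos by the full
step, so the text between end and pos + step was silently dropped. In
non-overlapping mode (stepChars omitted), advance pos from end instead
of pos + step. The step-based advancement is preserved for the
sliding-window case (TokenChunker), where stepChars is passed explicitly.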
diff --git a/apps/sim/lib/chunkers/utils.ts b/apps/sim/lib/chunkers/utils.ts
@@ -60,7 +60,6 @@ export function splitAtWordBoundaries(
   chunkSizeChars: number,
   stepChars?: number
 ): string[] {
-  const step = Math.max(1, stepChars ?? chunkSizeChars)
   const parts: string[] = []
   let pos = 0
 
@@ -79,9 +78,16 @@ export function splitAtWordBoundaries(
       parts.push(part)
     }
 
-    const nextPos = pos + step
-    if (nextPos >= text.length) break
-    pos = nextPos
+    if (stepChars !== undefined) {
+      // Sliding window: advance by step for predictable overlap
+      const nextPos = pos + Math.max(1, stepChars)
+      if (nextPos >= text.length) break
+      pos = nextPos
+    } else {
+      // Non-overlapping: advance from end of extracted content
+      if (end >= text.length) break
+      pos = end
+    }
     while (pos < text.length && text[pos] === ' ') pos++
   }