Skip to content

Commit 4c3508b

Browse files
committed
fix(chunkers): use consistent overlap pattern in regex fallback
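Use addOverlap followed by buildChunks(chunks, overlap) in the regex fallback path so that it matches the main path and all other chunkers (TextChunker, RecursiveChunker). The previous fallback used a sliding-window split (a step size passed to splitAtWordBoundaries, then buildChunks(chunks, 0)), which was inconsistent with them.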
1 parent 899fc68 commit 4c3508b

File tree

1 file changed

+6
-4
lines changed

1 file changed

+6
-4
lines changed

apps/sim/lib/chunkers/regex-chunker.ts

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,12 @@ export class RegexChunker {
8686
if (segments.length <= 1) {
8787
logger.warn('Regex pattern did not produce any splits, falling back to character splitting')
8888
const chunkSizeChars = tokensToChars(this.chunkSize)
89-
const overlapChars = tokensToChars(this.chunkOverlap)
90-
const stepChars = this.chunkOverlap > 0 ? chunkSizeChars - overlapChars : undefined
91-
const chunks = splitAtWordBoundaries(cleaned, chunkSizeChars, stepChars)
92-
return buildChunks(chunks, 0)
89+
let chunks = splitAtWordBoundaries(cleaned, chunkSizeChars)
90+
if (this.chunkOverlap > 0) {
91+
const overlapChars = tokensToChars(this.chunkOverlap)
92+
chunks = addOverlap(chunks, overlapChars)
93+
}
94+
return buildChunks(chunks, this.chunkOverlap)
9395
}
9496

9597
const merged = this.mergeSegments(segments)

0 commit comments

Comments
 (0)