Skip to content

Commit e391efa

Browse files
committed
fix(chunkers): fix log message and add missing month abbreviations
- Fix regex fallback log: "character splitting" → "word-boundary splitting"
- Add Jun and Jul to sentence chunker abbreviation list
1 parent ec6fa58 commit e391efa

File tree

2 files changed

+2
-2
lines changed

2 files changed

+2
-2
lines changed

apps/sim/lib/chunkers/regex-chunker.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ export class RegexChunker {
8484
const segments = cleaned.split(this.regex).filter((s) => s.trim().length > 0)
8585

8686
if (segments.length <= 1) {
87-
logger.warn('Regex pattern did not produce any splits, falling back to character splitting')
87+
logger.warn('Regex pattern did not produce any splits, falling back to word-boundary splitting')
8888
const chunkSizeChars = tokensToChars(this.chunkSize)
8989
let chunks = splitAtWordBoundaries(cleaned, chunkSizeChars)
9090
if (this.chunkOverlap > 0) {

apps/sim/lib/chunkers/sentence-chunker.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ export class SentenceChunker {
2828
private splitSentences(text: string): string[] {
2929
return text
3030
.split(
31-
/(?<!\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|St|Rev|Gen|Sgt|Capt|Lt|Col|Maj|No|Fig|Vol|Ch|vs|etc|Inc|Ltd|Corp|Co|approx|dept|est|govt|Ave|Blvd|Rd|Jan|Feb|Mar|Apr|Aug|Sep|Oct|Nov|Dec|i\.e|e\.g)\.)(?<![A-Z]\.)(?<!\.\.)(?<!\d\.)(?<=[.!?])\s+/
31+
/(?<!\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|St|Rev|Gen|Sgt|Capt|Lt|Col|Maj|No|Fig|Vol|Ch|vs|etc|Inc|Ltd|Corp|Co|approx|dept|est|govt|Ave|Blvd|Rd|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec|i\.e|e\.g)\.)(?<![A-Z]\.)(?<!\.\.)(?<!\d\.)(?<=[.!?])\s+/
3232
)
3333
.filter((s) => s.trim().length > 0)
3434
}

0 commit comments

Comments
 (0)