Skip to content

Commit 9f83f87

Browse files
committed
feat(knowledge): add token, sentence, recursive, and regex chunkers
1 parent 34f77e0 commit 9f83f87

File tree

13 files changed, with 924 additions and 27 deletions.

apps/sim/app/api/knowledge/route.ts

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,22 @@ const CreateKnowledgeBaseSchema = z.object({
3737
minSize: z.number().min(1).max(2000).default(100),
3838
/** Overlap between chunks in tokens (1 token ≈ 4 characters) */
3939
overlap: z.number().min(0).max(500).default(200),
40+
/** Chunking strategy */
41+
strategy: z
42+
.enum(['auto', 'text', 'regex', 'recursive', 'sentence', 'token'])
43+
.default('auto')
44+
.optional(),
45+
/** Strategy-specific options */
46+
strategyOptions: z
47+
.object({
48+
/** Regex pattern for 'regex' strategy */
49+
pattern: z.string().optional(),
50+
/** Custom separator hierarchy for 'recursive' strategy */
51+
separators: z.array(z.string()).optional(),
52+
/** Pre-built separator recipe for 'recursive' strategy */
53+
recipe: z.enum(['plain', 'markdown', 'code']).optional(),
54+
})
55+
.optional(),
4056
})
4157
.default({
4258
maxSize: 1024,
@@ -45,13 +61,23 @@ const CreateKnowledgeBaseSchema = z.object({
4561
})
4662
.refine(
4763
(data) => {
48-
// Convert maxSize from tokens to characters for comparison (1 token ≈ 4 chars)
4964
const maxSizeInChars = data.maxSize * 4
5065
return data.minSize < maxSizeInChars
5166
},
5267
{
5368
message: 'Min chunk size (characters) must be less than max chunk size (tokens × 4)',
5469
}
70+
)
71+
.refine(
72+
(data) => {
73+
if (data.strategy === 'regex' && !data.strategyOptions?.pattern) {
74+
return false
75+
}
76+
return true
77+
},
78+
{
79+
message: 'Regex pattern is required when using the regex chunking strategy',
80+
}
5581
),
5682
})
5783

apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx

Lines changed: 121 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,14 @@ import {
1818
ModalHeader,
1919
Textarea,
2020
} from '@/components/emcn'
21+
import {
22+
Select,
23+
SelectContent,
24+
SelectItem,
25+
SelectTrigger,
26+
SelectValue,
27+
} from '@/components/ui/select'
28+
import type { StrategyOptions } from '@/lib/chunkers/types'
2129
import { cn } from '@/lib/core/utils/cn'
2230
import { formatFileSize, validateKnowledgeBaseFile } from '@/lib/uploads/utils/file-utils'
2331
import { ACCEPT_ATTRIBUTE } from '@/lib/uploads/utils/validation'
@@ -35,6 +43,15 @@ interface CreateBaseModalProps {
3543
onOpenChange: (open: boolean) => void
3644
}
3745

46+
/**
 * Choices rendered in the "Chunking Strategy" dropdown, in display order.
 * Each `value` is one of the strategy names accepted by the `strategy` field
 * of `FormSchema`; `label` is the text shown to the user.
 */
const STRATEGY_OPTIONS = [
47+
{ value: 'auto', label: 'Auto (detect from content)' },
48+
{ value: 'text', label: 'Text (hierarchical splitting)' },
49+
{ value: 'recursive', label: 'Recursive (configurable separators)' },
50+
{ value: 'sentence', label: 'Sentence' },
51+
{ value: 'token', label: 'Token (fixed-size)' },
52+
{ value: 'regex', label: 'Regex (custom pattern)' },
53+
] as const
54+
3855
const FormSchema = z
3956
.object({
4057
name: z
@@ -58,10 +75,17 @@ const FormSchema = z
5875
.number()
5976
.min(0, 'Overlap must be non-negative')
6077
.max(500, 'Overlap must be less than 500 tokens'),
78+
/** Chunking strategy */
79+
strategy: z
80+
.enum(['auto', 'text', 'regex', 'recursive', 'sentence', 'token'])
81+
.default('auto'),
82+
/** Regex pattern (required when strategy is 'regex') */
83+
regexPattern: z.string().optional(),
84+
/** Custom separators for recursive strategy (comma-separated) */
85+
customSeparators: z.string().optional(),
6186
})
6287
.refine(
6388
(data) => {
64-
// Convert maxChunkSize from tokens to characters for comparison (1 token ≈ 4 chars)
6589
const maxChunkSizeInChars = data.maxChunkSize * 4
6690
return data.minChunkSize < maxChunkSizeInChars
6791
},
@@ -70,6 +94,18 @@ const FormSchema = z
7094
path: ['minChunkSize'],
7195
}
7296
)
97+
.refine(
98+
(data) => {
99+
if (data.strategy === 'regex' && !data.regexPattern?.trim()) {
100+
return false
101+
}
102+
return true
103+
},
104+
{
105+
message: 'Regex pattern is required when using the regex strategy',
106+
path: ['regexPattern'],
107+
}
108+
)
73109

74110
type FormValues = z.infer<typeof FormSchema>
75111

@@ -124,6 +160,7 @@ export const CreateBaseModal = memo(function CreateBaseModal({
124160
handleSubmit,
125161
reset,
126162
watch,
163+
setValue,
127164
formState: { errors },
128165
} = useForm<FormValues>({
129166
resolver: zodResolver(FormSchema),
@@ -133,11 +170,15 @@ export const CreateBaseModal = memo(function CreateBaseModal({
133170
minChunkSize: 100,
134171
maxChunkSize: 1024,
135172
overlapSize: 200,
173+
strategy: 'auto',
174+
regexPattern: '',
175+
customSeparators: '',
136176
},
137177
mode: 'onSubmit',
138178
})
139179

140180
const nameValue = watch('name')
181+
const strategyValue = watch('strategy')
141182

142183
useEffect(() => {
143184
if (open) {
@@ -153,6 +194,9 @@ export const CreateBaseModal = memo(function CreateBaseModal({
153194
minChunkSize: 100,
154195
maxChunkSize: 1024,
155196
overlapSize: 200,
197+
strategy: 'auto',
198+
regexPattern: '',
199+
customSeparators: '',
156200
})
157201
}
158202
}, [open, reset])
@@ -255,6 +299,17 @@ export const CreateBaseModal = memo(function CreateBaseModal({
255299
setSubmitStatus(null)
256300

257301
try {
302+
// Build the strategy-specific options sent with the create request:
//   - 'regex'     -> the user-supplied pattern
//   - 'recursive' -> the parsed custom separator list, if any was entered
//   - otherwise   -> undefined (no strategy options are sent)
let strategyOptions: StrategyOptions | undefined
if (data.strategy === 'regex' && data.regexPattern) {
  strategyOptions = { pattern: data.regexPattern }
} else if (data.strategy === 'recursive') {
  const separators = (data.customSeparators ?? '')
    .split(',')
    // Users type the two-character escapes \n and \t; convert them to the real characters.
    .map((s) => s.trim().replace(/\\n/g, '\n').replace(/\\t/g, '\t'))
    // Stray or trailing commas ("a,,b", "a,") would otherwise yield empty-string separators.
    .filter((s) => s.length > 0)

  // Nothing usable entered: leave undefined so the default separators apply.
  if (separators.length > 0) {
    strategyOptions = { separators }
  }
}
312+
258313
const newKnowledgeBase = await createKnowledgeBaseMutation.mutateAsync({
259314
name: data.name,
260315
description: data.description || undefined,
@@ -263,6 +318,8 @@ export const CreateBaseModal = memo(function CreateBaseModal({
263318
maxSize: data.maxChunkSize,
264319
minSize: data.minChunkSize,
265320
overlap: data.overlapSize,
321+
...(data.strategy !== 'auto' && { strategy: data.strategy }),
322+
...(strategyOptions && { strategyOptions }),
266323
},
267324
})
268325

@@ -403,6 +460,69 @@ export const CreateBaseModal = memo(function CreateBaseModal({
403460
</p>
404461
</div>
405462

463+
<div className='flex flex-col gap-2'>
464+
<Label htmlFor='strategy'>Chunking Strategy</Label>
465+
<Select
466+
value={strategyValue}
467+
onValueChange={(value) =>
468+
setValue('strategy', value as FormValues['strategy'])
469+
}
470+
>
471+
<SelectTrigger id='strategy'>
472+
<SelectValue placeholder='Auto (detect from content)' />
473+
</SelectTrigger>
474+
<SelectContent>
475+
{STRATEGY_OPTIONS.map((option) => (
476+
<SelectItem key={option.value} value={option.value}>
477+
{option.label}
478+
</SelectItem>
479+
))}
480+
</SelectContent>
481+
</Select>
482+
<p className='text-[var(--text-muted)] text-xs'>
483+
Auto detects the best strategy based on file content type.
484+
</p>
485+
</div>
486+
487+
{strategyValue === 'regex' && (
488+
<div className='flex flex-col gap-2'>
489+
<Label htmlFor='regexPattern'>Regex Pattern</Label>
490+
<Input
491+
id='regexPattern'
492+
// JSX string attributes are not escape-processed, so each backslash is written once.
placeholder='e.g. \n\n or (?<=\})\s*(?=\{)'
493+
{...register('regexPattern')}
494+
className={cn(errors.regexPattern && 'border-[var(--text-error)]')}
495+
autoComplete='off'
496+
data-form-type='other'
497+
/>
498+
{errors.regexPattern && (
499+
<p className='text-[var(--text-error)] text-xs'>
500+
{errors.regexPattern.message}
501+
</p>
502+
)}
503+
<p className='text-[var(--text-muted)] text-xs'>
504+
Text will be split at each match of this regex pattern.
505+
</p>
506+
</div>
507+
)}
508+
509+
{strategyValue === 'recursive' && (
510+
<div className='flex flex-col gap-2'>
511+
<Label htmlFor='customSeparators'>Custom Separators (optional)</Label>
512+
<Input
513+
id='customSeparators'
514+
placeholder='e.g. \n\n, \n, . , '
515+
{...register('customSeparators')}
516+
autoComplete='off'
517+
data-form-type='other'
518+
/>
519+
<p className='text-[var(--text-muted)] text-xs'>
520+
Comma-separated list of delimiters in priority order. Leave empty for default
521+
separators.
522+
</p>
523+
</div>
524+
)}
525+
406526
<div className='flex flex-col gap-2'>
407527
<Label>Upload Documents</Label>
408528
<Button

apps/sim/hooks/queries/kb/knowledge.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { createLogger } from '@sim/logger'
22
import { keepPreviousData, useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
33
import { toast } from '@/components/emcn'
4+
import type { ChunkingStrategy, StrategyOptions } from '@/lib/chunkers/types'
45
import type {
56
ChunkData,
67
ChunksPagination,
@@ -707,6 +708,8 @@ export interface CreateKnowledgeBaseParams {
707708
maxSize: number
708709
minSize: number
709710
overlap: number
711+
strategy?: ChunkingStrategy
712+
strategyOptions?: StrategyOptions
710713
}
711714
}
712715

apps/sim/lib/chunkers/index.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
export { DocsChunker } from './docs-chunker'
22
export { JsonYamlChunker } from './json-yaml-chunker'
3+
export { RecursiveChunker } from './recursive-chunker'
4+
export { RegexChunker } from './regex-chunker'
5+
export { SentenceChunker } from './sentence-chunker'
36
export { StructuredDataChunker } from './structured-data-chunker'
47
export { TextChunker } from './text-chunker'
8+
export { TokenChunker } from './token-chunker'
59
export * from './types'

0 commit comments

Comments
 (0)