@@ -18,6 +18,14 @@ import {
1818 ModalHeader ,
1919 Textarea ,
2020} from '@/components/emcn'
21+ import {
22+ Select ,
23+ SelectContent ,
24+ SelectItem ,
25+ SelectTrigger ,
26+ SelectValue ,
27+ } from '@/components/ui/select'
28+ import type { StrategyOptions } from '@/lib/chunkers/types'
2129import { cn } from '@/lib/core/utils/cn'
2230import { formatFileSize , validateKnowledgeBaseFile } from '@/lib/uploads/utils/file-utils'
2331import { ACCEPT_ATTRIBUTE } from '@/lib/uploads/utils/validation'
@@ -35,6 +43,15 @@ interface CreateBaseModalProps {
3543 onOpenChange : ( open : boolean ) => void
3644}
3745
/**
 * Chunking strategies offered in the "Chunking Strategy" dropdown.
 * `value` is the identifier written to the form's `strategy` field and
 * `label` is the text rendered for that entry. The set of values matches
 * the `strategy` enum declared in `FormSchema`.
 */
46+ const STRATEGY_OPTIONS = [
47+ { value : 'auto' , label : 'Auto (detect from content)' } ,
48+ { value : 'text' , label : 'Text (hierarchical splitting)' } ,
49+ { value : 'recursive' , label : 'Recursive (configurable separators)' } ,
50+ { value : 'sentence' , label : 'Sentence' } ,
51+ { value : 'token' , label : 'Token (fixed-size)' } ,
52+ { value : 'regex' , label : 'Regex (custom pattern)' } ,
53+ ] as const
54+
3855const FormSchema = z
3956 . object ( {
4057 name : z
@@ -58,10 +75,17 @@ const FormSchema = z
5875 . number ( )
5976 . min ( 0 , 'Overlap must be non-negative' )
6077 . max ( 500 , 'Overlap must be less than 500 tokens' ) ,
78+ /** Chunking strategy */
79+ strategy : z
80+ . enum ( [ 'auto' , 'text' , 'regex' , 'recursive' , 'sentence' , 'token' ] )
81+ . default ( 'auto' ) ,
82+ /** Regex pattern (required when strategy is 'regex') */
83+ regexPattern : z . string ( ) . optional ( ) ,
84+ /** Custom separators for recursive strategy (comma-separated) */
85+ customSeparators : z . string ( ) . optional ( ) ,
6186 } )
6287 . refine (
6388 ( data ) => {
64- // Convert maxChunkSize from tokens to characters for comparison (1 token ≈ 4 chars)
6589 const maxChunkSizeInChars = data . maxChunkSize * 4
6690 return data . minChunkSize < maxChunkSizeInChars
6791 } ,
@@ -70,6 +94,18 @@ const FormSchema = z
7094 path : [ 'minChunkSize' ] ,
7195 }
7296 )
97+ . refine (
98+ ( data ) => {
99+ if ( data . strategy === 'regex' && ! data . regexPattern ?. trim ( ) ) {
100+ return false
101+ }
102+ return true
103+ } ,
104+ {
105+ message : 'Regex pattern is required when using the regex strategy' ,
106+ path : [ 'regexPattern' ] ,
107+ }
108+ )
73109
74110type FormValues = z . infer < typeof FormSchema >
75111
@@ -124,6 +160,7 @@ export const CreateBaseModal = memo(function CreateBaseModal({
124160 handleSubmit,
125161 reset,
126162 watch,
163+ setValue,
127164 formState : { errors } ,
128165 } = useForm < FormValues > ( {
129166 resolver : zodResolver ( FormSchema ) ,
@@ -133,11 +170,15 @@ export const CreateBaseModal = memo(function CreateBaseModal({
133170 minChunkSize : 100 ,
134171 maxChunkSize : 1024 ,
135172 overlapSize : 200 ,
173+ strategy : 'auto' ,
174+ regexPattern : '' ,
175+ customSeparators : '' ,
136176 } ,
137177 mode : 'onSubmit' ,
138178 } )
139179
140180 const nameValue = watch ( 'name' )
181+ const strategyValue = watch ( 'strategy' )
141182
142183 useEffect ( ( ) => {
143184 if ( open ) {
@@ -153,6 +194,9 @@ export const CreateBaseModal = memo(function CreateBaseModal({
153194 minChunkSize : 100 ,
154195 maxChunkSize : 1024 ,
155196 overlapSize : 200 ,
197+ strategy : 'auto' ,
198+ regexPattern : '' ,
199+ customSeparators : '' ,
156200 } )
157201 }
158202 } , [ open , reset ] )
@@ -255,6 +299,17 @@ export const CreateBaseModal = memo(function CreateBaseModal({
255299 setSubmitStatus ( null )
256300
257301 try {
// Build the strategy-specific options submitted with the chunking config:
//   - 'regex'     -> { pattern } taken from the regex input
//   - 'recursive' -> { separators } parsed from the comma-separated input
//   - otherwise   -> undefined, so no `strategyOptions` key is sent
// Users type "\n" / "\t" as two literal characters; they are converted to
// real newline / tab characters here. Entries that are empty after trimming
// (a trailing comma, a whitespace-only item) are dropped so an empty-string
// separator is never submitted.
const parsedSeparators =
  data.strategy === 'recursive'
    ? (data.customSeparators ?? '')
        .split(',')
        .map((s) => s.trim().replace(/\\n/g, '\n').replace(/\\t/g, '\t'))
        .filter((s) => s.length > 0)
    : []

const strategyOptions: StrategyOptions | undefined =
  data.strategy === 'regex' && data.regexPattern
    ? { pattern: data.regexPattern }
    : parsedSeparators.length > 0
      ? { separators: parsedSeparators }
      : undefined
312+
258313 const newKnowledgeBase = await createKnowledgeBaseMutation . mutateAsync ( {
259314 name : data . name ,
260315 description : data . description || undefined ,
@@ -263,6 +318,8 @@ export const CreateBaseModal = memo(function CreateBaseModal({
263318 maxSize : data . maxChunkSize ,
264319 minSize : data . minChunkSize ,
265320 overlap : data . overlapSize ,
321+ ...( data . strategy !== 'auto' && { strategy : data . strategy } ) ,
322+ ...( strategyOptions && { strategyOptions } ) ,
266323 } ,
267324 } )
268325
@@ -403,6 +460,69 @@ export const CreateBaseModal = memo(function CreateBaseModal({
403460 </ p >
404461 </ div >
405462
463+ < div className = 'flex flex-col gap-2' >
464+ < Label htmlFor = 'strategy' > Chunking Strategy</ Label >
465+ < Select
466+ value = { strategyValue }
467+ onValueChange = { ( value ) =>
468+ setValue ( 'strategy' , value as FormValues [ 'strategy' ] )
469+ }
470+ >
471+ < SelectTrigger id = 'strategy' >
472+ < SelectValue placeholder = 'Auto (detect from content)' />
473+ </ SelectTrigger >
474+ < SelectContent >
475+ { STRATEGY_OPTIONS . map ( ( option ) => (
476+ < SelectItem key = { option . value } value = { option . value } >
477+ { option . label }
478+ </ SelectItem >
479+ ) ) }
480+ </ SelectContent >
481+ </ Select >
482+ < p className = 'text-[var(--text-muted)] text-xs' >
483+ Auto detects the best strategy based on file content type.
484+ </ p >
485+ </ div >
486+
{/*
  Regex pattern input, rendered only while the 'regex' strategy is selected.
  The placeholder uses single backslashes: JSX string attributes do not
  process escape sequences, so the text is shown to the user exactly as written.
*/}
{strategyValue === 'regex' && (
  <div className='flex flex-col gap-2'>
    <Label htmlFor='regexPattern'>Regex Pattern</Label>
    <Input
      id='regexPattern'
      placeholder='e.g. \n\n or (?<=\})\s*(?=\{)'
      {...register('regexPattern')}
      className={cn(errors.regexPattern && 'border-[var(--text-error)]')}
      autoComplete='off'
      data-form-type='other'
    />
    {errors.regexPattern && (
      <p className='text-[var(--text-error)] text-xs'>
        {errors.regexPattern.message}
      </p>
    )}
    <p className='text-[var(--text-muted)] text-xs'>
      Text will be split at each match of this regex pattern.
    </p>
  </div>
)}
508+
509+ { strategyValue === 'recursive' && (
510+ < div className = 'flex flex-col gap-2' >
511+ < Label htmlFor = 'customSeparators' > Custom Separators (optional)</ Label >
512+ < Input
513+ id = 'customSeparators'
514+ placeholder = 'e.g. \n\n, \n, . , '
515+ { ...register ( 'customSeparators' ) }
516+ autoComplete = 'off'
517+ data-form-type = 'other'
518+ />
519+ < p className = 'text-[var(--text-muted)] text-xs' >
520+ Comma-separated list of delimiters in priority order. Leave empty for default
521+ separators.
522+ </ p >
523+ </ div >
524+ ) }
525+
406526 < div className = 'flex flex-col gap-2' >
407527 < Label > Upload Documents</ Label >
408528 < Button
0 commit comments