Skip to content

Commit 5e8b051

Browse files
committed
fix(chunkers): restore structured data token ratio and overlap joiner
- Restore /3 token estimation for StructuredDataChunker (structured data is denser than prose, ~3 chars/token vs ~4)
- Change addOverlap joiner from `\n` to a space to match original TextChunker behavior
1 parent 3a26dad commit 5e8b051

File tree

3 files changed

+15
-13
lines changed

3 files changed

+15
-13
lines changed

apps/sim/lib/chunkers/structured-data-chunker.ts

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
import { createLogger } from '@sim/logger'
22
import type { Chunk, StructuredDataOptions } from '@/lib/chunkers/types'
3-
import { estimateTokens } from '@/lib/chunkers/utils'
3+
/** Structured data is denser in tokens (~3 chars/token vs ~4 for prose) */
4+
function estimateStructuredTokens(text: string): number {
5+
if (!text?.trim()) return 0
6+
return Math.ceil(text.length / 3)
7+
}
48

59
const logger = createLogger('StructuredDataChunker')
610

@@ -28,7 +32,7 @@ export class StructuredDataChunker {
2832
const headerLine = options.headers?.join('\t') || lines[0]
2933
const dataStartIndex = options.headers ? 0 : 1
3034

31-
const estimatedTokensPerRow = StructuredDataChunker.estimateTokensPerRow(
35+
const estimatedTokensPerRow = StructuredDataChunker.estimateStructuredTokensPerRow(
3236
lines.slice(dataStartIndex, Math.min(10, lines.length))
3337
)
3438
const optimalRowsPerChunk = StructuredDataChunker.calculateOptimalRowsPerChunk(
@@ -42,12 +46,12 @@ export class StructuredDataChunker {
4246

4347
let currentChunkRows: string[] = []
4448
let currentTokenEstimate = 0
45-
const headerTokens = estimateTokens(headerLine)
49+
const headerTokens = estimateStructuredTokens(headerLine)
4650
let chunkStartRow = dataStartIndex
4751

4852
for (let i = dataStartIndex; i < lines.length; i++) {
4953
const row = lines[i]
50-
const rowTokens = estimateTokens(row)
54+
const rowTokens = estimateStructuredTokens(row)
5155

5256
const projectedTokens =
5357
currentTokenEstimate +
@@ -111,18 +115,18 @@ export class StructuredDataChunker {
111115
private static createChunk(content: string, startRow: number, endRow: number): Chunk {
112116
return {
113117
text: content,
114-
tokenCount: estimateTokens(content),
118+
tokenCount: estimateStructuredTokens(content),
115119
metadata: {
116120
startIndex: startRow,
117121
endIndex: endRow,
118122
},
119123
}
120124
}
121125

122-
private static estimateTokensPerRow(sampleRows: string[]): number {
126+
private static estimateStructuredTokensPerRow(sampleRows: string[]): number {
123127
if (sampleRows.length === 0) return 50
124128

125-
const totalTokens = sampleRows.reduce((sum, row) => sum + estimateTokens(row), 0)
129+
const totalTokens = sampleRows.reduce((sum, row) => sum + estimateStructuredTokens(row), 0)
126130
return Math.ceil(totalTokens / sampleRows.length)
127131
}
128132

apps/sim/lib/chunkers/utils.test.ts

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -94,18 +94,16 @@ describe('addOverlap', () => {
9494
expect(result[1].length).toBeGreaterThan('second chunk here'.length)
9595
})
9696

97-
it('joins overlap text with \\n', () => {
97+
it('joins overlap text with space', () => {
9898
const chunks = ['first chunk here', 'second chunk here']
9999
const result = addOverlap(chunks, 10)
100-
expect(result[1]).toContain('\n')
100+
expect(result[1]).toContain('here second')
101101
})
102102

103103
it('snaps overlap to word boundary', () => {
104104
const chunks = ['hello beautiful world', 'next chunk']
105105
const result = addOverlap(chunks, 15)
106-
const overlapPart = result[1].split('\n')[0]
107-
expect(overlapPart).toBe('beautiful world')
108-
expect(result[1]).toBe('beautiful world\nnext chunk')
106+
expect(result[1]).toBe('beautiful world next chunk')
109107
})
110108
})
111109

apps/sim/lib/chunkers/utils.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ export function addOverlap(chunks: string[], overlapChars: number): string[] {
4141
: overlapText
4242

4343
if (cleanOverlap.trim()) {
44-
chunk = `${cleanOverlap.trim()}\n${chunk}`
44+
chunk = `${cleanOverlap.trim()} ${chunk}`
4545
}
4646
}
4747

0 commit comments

Comments
 (0)