11import { createLogger } from '@sim/logger'
22import type { Chunk , StructuredDataOptions } from '@/lib/chunkers/types'
3- import { estimateTokens } from '@/lib/chunkers/utils'
/**
 * Characters assumed per token for structured (tabular/delimited) text.
 * Structured data is denser in tokens (~3 chars/token vs ~4 for prose).
 */
const STRUCTURED_CHARS_PER_TOKEN = 3

/**
 * Estimates the token count of a piece of structured text from its length.
 *
 * @param text - Raw text of a header, a row, or a whole chunk.
 * @returns 0 when `text` is empty, whitespace-only, or nullish at runtime;
 *   otherwise the full string length (surrounding whitespace included)
 *   divided by the chars-per-token ratio, rounded up.
 */
function estimateStructuredTokens(text: string): number {
  // The nullish guard protects against untyped callers even though the
  // declared parameter type is `string`.
  if (!text?.trim()) return 0
  return Math.ceil(text.length / STRUCTURED_CHARS_PER_TOKEN)
}
48
// Module-scoped logger, created with the chunker's name as its label.
const logger = createLogger('StructuredDataChunker')
610
@@ -28,7 +32,7 @@ export class StructuredDataChunker {
2832 const headerLine = options . headers ?. join ( '\t' ) || lines [ 0 ]
2933 const dataStartIndex = options . headers ? 0 : 1
3034
31- const estimatedTokensPerRow = StructuredDataChunker . estimateTokensPerRow (
35+ const estimatedTokensPerRow = StructuredDataChunker . estimateStructuredTokensPerRow (
3236 lines . slice ( dataStartIndex , Math . min ( 10 , lines . length ) )
3337 )
3438 const optimalRowsPerChunk = StructuredDataChunker . calculateOptimalRowsPerChunk (
@@ -42,12 +46,12 @@ export class StructuredDataChunker {
4246
4347 let currentChunkRows : string [ ] = [ ]
4448 let currentTokenEstimate = 0
45- const headerTokens = estimateTokens ( headerLine )
49+ const headerTokens = estimateStructuredTokens ( headerLine )
4650 let chunkStartRow = dataStartIndex
4751
4852 for ( let i = dataStartIndex ; i < lines . length ; i ++ ) {
4953 const row = lines [ i ]
50- const rowTokens = estimateTokens ( row )
54+ const rowTokens = estimateStructuredTokens ( row )
5155
5256 const projectedTokens =
5357 currentTokenEstimate +
@@ -111,18 +115,18 @@ export class StructuredDataChunker {
/**
 * Wraps chunk text in a `Chunk` record.
 *
 * @param content - Text of the chunk.
 * @param startRow - Stored as `metadata.startIndex`.
 * @param endRow - Stored as `metadata.endIndex`.
 * @returns A chunk whose `tokenCount` is the structured-data token estimate
 *   of `content`.
 */
private static createChunk(content: string, startRow: number, endRow: number): Chunk {
  return {
    text: content,
    tokenCount: estimateStructuredTokens(content),
    metadata: {
      startIndex: startRow,
      endIndex: endRow,
    },
  }
}
121125
/**
 * Estimates the average token count per row from a sample of rows.
 *
 * @param sampleRows - Rows to average over.
 * @returns 50 when the sample is empty; otherwise the mean of the per-row
 *   structured-data token estimates, rounded up.
 */
private static estimateStructuredTokensPerRow(sampleRows: string[]): number {
  // Fallback for an empty sample; this also avoids dividing by zero below.
  if (sampleRows.length === 0) return 50

  const totalTokens = sampleRows.reduce((sum, row) => sum + estimateStructuredTokens(row), 0)
  return Math.ceil(totalTokens / sampleRows.length)
}
128132
0 commit comments